In [6]:
# import necessary libraries
import pandas as pd
import numpy as np

# Read the startup records from the CSV file into a DataFrame
print("Arican startups Dataset...")
df = pd.read_csv('African_startups.csv')

# Count the null entries in every column to see what needs cleaning
print("\n Missing values in each columns:")
missing_per_column = df.isna().sum()
print(missing_per_column)





Arican startups Dataset...

 Missing values in each columns:
Company Name              0
Industry                  0
Location                  0
Funding (USD)             0
Employees                 0
Founding Year             0
Growth Rate (%)           0
Competitors               0
Revenue Estimate (USD)    0
Stage                     0
dtype: int64


In [15]:
# Step 3: Clean column names (remove spaces, standardize)
# Each header is trimmed, spaces become underscores, and the unit suffixes
# '_(USD)' and '_(%)' are removed, e.g.:
#   'Funding (USD)'          -> 'Funding'
#   'Revenue Estimate (USD)' -> 'Revenue_Estimate'
#   'Growth Rate (%)'        -> 'Growth_Rate'
# regex=False is passed on every call so '(' , ')' and '%' are always matched
# literally, independent of the pandas version's default for `regex`.
df.columns = (
    df.columns
    .str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('_(USD)', '', regex=False)
    .str.replace('_(%)', '', regex=False)
)



In [16]:
# Step 4: Handle missing or invalid data
# Rows lacking a value in any of the critical columns are removed from df in place.
critical_columns = ['Funding', 'Industry']
df.dropna(axis=0, how='any', subset=critical_columns, inplace=True)

In [17]:
# Convert numeric columns to correct data types.
# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising.
for numeric_col in ['Funding', 'Employees', 'Growth_Rate']:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Coercion can introduce NaN into Funding after the notebook's missing-data
# drop has already run, so rows whose Funding could not be parsed are removed
# here to keep the "no missing Funding" guarantee for the metrics that follow.
df.dropna(subset=['Funding'], inplace=True)

In [18]:
# Step 5: Create new strategic metrics
print("\n Creating new business metrics...")

# Funding per Employee (efficiency metric).
# The ratio uses the actual headcount. A headcount of 0 is mapped to NaN first,
# so such rows report a missing value instead of inf, and rows with staff are
# no longer understated by an artificial +1 in the denominator.
headcount = df['Employees'].replace(0, np.nan)
df['Funding_per_Employee'] = df['Funding'] / headcount



 Creating new business metrics...


In [19]:
# High-Growth Flag: True where Growth_Rate is strictly greater than 20
df['High_Growth'] = df['Growth_Rate'].gt(20)

In [20]:
# Funding Stage Grouping (for analysis)
# Bins are closed on the right: (500K, 1M], (1M, 2M], (2M, inf).
# include_lowest=True closes the first bin on the left as well, i.e. [0, 500K],
# so a startup with Funding == 0 is labelled as Seed instead of getting NaN.
# NOTE(review): a Funding of exactly 500,000 lands in 'Seed (<500K)' because the
# upper edge is inclusive — confirm this matches the intended stage definition.
df['Funding_Stage'] = pd.cut(
    df['Funding'],
    bins=[0, 500000, 1000000, 2000000, float('inf')],
    labels=['Seed (<500K)', 'Series A (500K–1M)', 'Series B (1M–2M)', 'Series C+ (>2M)'],
    include_lowest=True,
)

In [21]:
# Step 6: Clean text fields (strip extra spaces)
# Leading and trailing whitespace is removed from each free-text column.
for text_col in ('Company_Name', 'Location', 'Industry'):
    df[text_col] = df[text_col].str.strip()

In [22]:
# Step 7: Preview cleaned data
# Only the columns most relevant to the cleaning steps are shown.
preview_columns = [
    'Company_Name',
    'Industry',
    'Location',
    'Funding',
    'Employees',
    'Growth_Rate',
    'High_Growth',
]
print("\n First 5 rows of cleaned data:")
print(df[preview_columns].head())


 First 5 rows of cleaned data:
    Company_Name    Industry Location  Funding  Employees  Growth_Rate  \
0  EdTech Africa      EdTech    Lagos  1200000         45         28.5   
1       PayNaija     Fintech    Lagos  2500000         60         18.2   
2  HealthPlus NG  HealthTech    Abuja   800000         30         32.1   
3   LearnFast NG      EdTech    Lagos   600000         25         35.7   
4    QuickPay GH     Fintech    Accra  1800000         50         21.4   

   High_Growth  
0         True  
1        False  
2         True  
3         True  
4         True  


In [23]:
# Step 8: Save cleaned data for Power BI and SQL
# The frame is written without its index column.
output_file = 'cleaned_african_startups.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(
    f"\n Cleaned data saved as '{output_file}'",
    " Ready for Power BI dashboarding and SQL analysis!",
    sep="\n",
)


 Cleaned data saved as 'cleaned_african_startups.csv'
 Ready for Power BI dashboarding and SQL analysis!


In [24]:
# Display summary statistics
# Each figure is computed first, then reported.
total_startups = df.shape[0]
average_funding = df['Funding'].mean()
top_industry = df['Industry'].mode()[0]  # first of the most frequent values
high_growth_count = df['High_Growth'].sum()  # True counts as 1

print("\n Summary: Key Stats")
print(f"Total Startups: {total_startups}")
print(f"Average Funding: ${average_funding:,.0f}")
print(f"Top Industry: {top_industry}")
print(f"High-Growth Startups (>20% growth): {high_growth_count}")



 Summary: Key Stats
Total Startups: 47
Average Funding: $942,979
Top Industry: EdTech
High-Growth Startups (>20% growth): 40


In [25]:
# Show top 5 high-growth startups
# Keep only rows flagged High_Growth, order them by Growth_Rate from highest
# to lowest, and take the first five.
print("\n Top 5 High-Growth Startups:")
high_growth_rows = df.loc[df['High_Growth']]
top_growth = high_growth_rows.sort_values(by='Growth_Rate', ascending=False).iloc[:5]
report_columns = ['Company_Name', 'Industry', 'Location', 'Growth_Rate', 'Funding']
print(top_growth[report_columns])


 Top 5 High-Growth Startups:
    Company_Name    Industry Location  Growth_Rate  Funding
17      ShopZuri  E-commerce    Lagos         45.0   350000
38          Zuri  E-commerce  Nairobi         43.8   370000
26    BrightPath      EdTech   Kigali         42.3   320000
8   Uzuri Beauty  E-commerce  Nairobi         41.2   400000
24       CodeHub    Software    Lagos         39.5   400000
