In [140]:
import pandas as pd

# Load Dataset

This code loads the dataset from `NewDatasetCollection.csv` and displays its shape before any cleaning, along with the first few rows.


In [141]:
df = pd.read_csv("NewDatasetCollection.csv")

In [142]:
print("Shape before cleaning:", df.shape)
df.head()

Shape before cleaning: (466, 17)


Unnamed: 0,Name,Website,Featured,Stage,Industry,Tags,Region,Startup HQ,NumOfTags,Funding (USD),CompanySize,Funding_min,rate_percent,Allign with 2030,Funding_Avg,Opportunity_Level,Year of establishment
0,Lean,https://www.leantech.me,,Series B,Financial Services,Fintech Infrastructure,Riyadh,Saudi Arabia,1,15M – 30M,Startup / Small,15000000.0,75.6,Yes,22.50M,High,2019.0
1,Foodics,https://www.foodics.com/,,Series C,"Technology ,Food & Beverages","SaaS,Financial Solution",Riyadh,Saudi Arabia,2,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2014.0
2,Salla,https://salla.com/,True,Private Equity,Technology,SaaS,Makkah,Saudi Arabia,1,50M – 500M+,Large,50000000.0,13.68,Yes,275.00M,High,2016.0
3,Nana,https://nana.co/en,True,Series C,Food & Beverages,"Last Mile Delivery,Marketplace,E-Commerce",Riyadh,Saudi Arabia,3,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2016.0
4,Mozn,https://www.mozn.sa,True,Pre-Series B,Technology,AI & Machine Learning,Riyadh,Saudi Arabia,1,10M – 18M,Mid-level,10000000.0,75.6,Yes,14.00M,Medium,2017.0


# Data Cleaning
#### Our dataset contained some missing values, and outliers that could affect the accuracy of our analysis. Therefore, we performed several cleaning steps to ensure data quality:
#### 1. Handled Missing Values: Filled columns with the correct text or number .
#### 2. Treated Outliers: Used the Interquartile Range (IQR) method to detect and remove extreme values.
#### 3. Standardized Data Types: Converted columns to appropriate formats (e.g., dates, categories).
#### 4. Normalized Text Data: Removed extra spaces and converted all text to lowercase for consistency.
#### 5. Saved Cleaned Data: Exported the cleaned dataset into a processed file for further analysis.

### Dataset Information

This command displays a concise summary of the dataset, including the number of non-null entries, column names, and data types.


In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   466 non-null    object 
 1   Website                465 non-null    object 
 2   Featured               56 non-null     object 
 3   Stage                  466 non-null    object 
 4   Industry               466 non-null    object 
 5   Tags                   457 non-null    object 
 6   Region                 466 non-null    object 
 7   Startup HQ             466 non-null    object 
 8   NumOfTags              466 non-null    int64  
 9   Funding (USD)          466 non-null    object 
 10  CompanySize            466 non-null    object 
 11  Funding_min            466 non-null    float64
 12  rate_percent           466 non-null    float64
 13  Allign with 2030       466 non-null    object 
 14  Funding_Avg            466 non-null    object 
 15  Opport

In [144]:
"""
We identified three columns with missing values: Featured, Tags, and Website.

"""
missing_value = df.isnull().sum()
missing_value

Name                       0
Website                    1
Featured                 410
Stage                      0
Industry                   0
Tags                       9
Region                     0
Startup HQ                 0
NumOfTags                  0
Funding (USD)              0
CompanySize                0
Funding_min                0
rate_percent               0
Allign with 2030           0
Funding_Avg                0
Opportunity_Level          0
Year of establishment    187
dtype: int64

In [145]:
df["Year of establishment"] = df["Year of establishment"].fillna(0).astype(int)

In [146]:
df[df['Website'].isnull()]

Unnamed: 0,Name,Website,Featured,Stage,Industry,Tags,Region,Startup HQ,NumOfTags,Funding (USD),CompanySize,Funding_min,rate_percent,Allign with 2030,Funding_Avg,Opportunity_Level,Year of establishment
343,AI Atlas Inc.,,,Pre-Seed,Technology,,Riyadh,Saudi Arabia,0,500K – 2M,Startup / Small,500000000.0,75.6,No,1.25M,Low,0


In [147]:
"""
For the Website column, only one value was missing. 
We addressed this by retrieving the company’s website URL and assigning it to the corresponding row.

"""
web = "https://www.aiatlasinc.ca"
df['Website'] = df['Website'].fillna(web)

In [148]:
"""
For the Tags column, we handled missing values by first checking each company’s website and LinkedIn profile to identify a tag related to its industry, 
which we then used to fill the missing entries. 
In cases where companies did not have sufficient data, 
we referred to similar companies with the same activity and assigned their tags accordingly. 
To maintain consistency, we kept the tags general rather than highly specific.

"""
df["Tags"].isna().sum()

np.int64(9)

In [149]:
filling_nan_tags = {
    "Financial Services" : "Fintech Infrastructure" , 
    "Travel & Tourism"   : "Marketplace" , 
    "Technology" : "SaaS" , 
    "Art & Design,Construction": "Design Services" ,
    "Technology ,Real Estate,Financial Services": "SaaS,RegTech,Financial Solution" , 
    "Technology ,Financial Services" : "Financial Solution" ,  
    "Financial Services,Retail" :  "Financial Solution" ,    
}

In [150]:
df["Industry"] = df["Industry"].str.strip()
df.loc[df["Tags"].isna(), "Tags"] = df.loc[df["Tags"].isna(), "Industry"].map(filling_nan_tags)

In [151]:
"""
The Featured column was not relevant to our analysis and contained around 90% missing values; therefore, we decided to drop it.

"""

df.drop(columns=['Featured'], inplace=True)
df.head(10)

Unnamed: 0,Name,Website,Stage,Industry,Tags,Region,Startup HQ,NumOfTags,Funding (USD),CompanySize,Funding_min,rate_percent,Allign with 2030,Funding_Avg,Opportunity_Level,Year of establishment
0,Lean,https://www.leantech.me,Series B,Financial Services,Fintech Infrastructure,Riyadh,Saudi Arabia,1,15M – 30M,Startup / Small,15000000.0,75.6,Yes,22.50M,High,2019
1,Foodics,https://www.foodics.com/,Series C,"Technology ,Food & Beverages","SaaS,Financial Solution",Riyadh,Saudi Arabia,2,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2014
2,Salla,https://salla.com/,Private Equity,Technology,SaaS,Makkah,Saudi Arabia,1,50M – 500M+,Large,50000000.0,13.68,Yes,275.00M,High,2016
3,Nana,https://nana.co/en,Series C,Food & Beverages,"Last Mile Delivery,Marketplace,E-Commerce",Riyadh,Saudi Arabia,3,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2016
4,Mozn,https://www.mozn.sa,Pre-Series B,Technology,AI & Machine Learning,Riyadh,Saudi Arabia,1,10M – 18M,Mid-level,10000000.0,75.6,Yes,14.00M,Medium,2017
5,Rewaa,https://rewaatech.com,Series A,Technology,"SaaS,ERP",Riyadh,Saudi Arabia,2,6M – 15M,Startup / Small,6000000.0,75.6,Yes,10.50M,Medium,2018
6,Zid,https://zid.sa/,Series B,"Technology ,Retail","SaaS,E-Commerce",Riyadh,Saudi Arabia,2,15M – 30M,Startup / Small,15000000.0,75.6,Yes,22.50M,High,2017
7,Classera,https://classera.com,Series A,Education,SaaS,Makkah,Saudi Arabia,1,6M – 15M,Startup / Small,6000000.0,13.68,Yes,10.50M,Medium,2012
8,Gathern,https://gathern.co,Pre-Series A,"Travel & Tourism,Real Estate","Marketplace,Hospitality",Riyadh,Saudi Arabia,2,2M – 6M,Startup / Small,2000000.0,75.6,Yes,4.00M,Medium,2016
9,Sary,https://www.sary.com/en,Series C,"Food & Beverages ,Retail","B2B Financing,Marketplace",Riyadh,Saudi Arabia,2,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2018


In [152]:
"""
Status of missing values after applying the handling process.
"""
missing_value = df.isnull().sum()
missing_value

Name                     0
Website                  0
Stage                    0
Industry                 0
Tags                     0
Region                   0
Startup HQ               0
NumOfTags                0
Funding (USD)            0
CompanySize              0
Funding_min              0
rate_percent             0
Allign with 2030         0
Funding_Avg              0
Opportunity_Level        0
Year of establishment    0
dtype: int64

In [153]:
"""
No duplicate values were found in the dataset.
"""
df.duplicated().sum()

np.int64(0)

### Removing Duplicate Companies

This code identifies specific companies listed in `companies_to_dedup` that have duplicate entries in the dataset.  
For each company with multiple rows, it removes the first occurrence and resets the DataFrame index.  
Finally, it displays the remaining rows for these companies to verify de-duplication.


In [154]:
companies_to_dedup = ["Truc-King","Resal", "Rintel", "Maghsala", "COGNNA", "Dunes Aero"]

for company in companies_to_dedup:
    company_rows = df[df["Name"] == company]
    
    if len(company_rows) > 1:
        index_to_drop = company_rows.index[0]
        df = df.drop(index_to_drop)
df = df.reset_index(drop=True)

df[df["Name"].isin(companies_to_dedup)]


Unnamed: 0,Name,Website,Stage,Industry,Tags,Region,Startup HQ,NumOfTags,Funding (USD),CompanySize,Funding_min,rate_percent,Allign with 2030,Funding_Avg,Opportunity_Level,Year of establishment
222,Resal,Www.resal.me,Pre-Series B,"Technology ,Financial Services",Financial Solution,Makkah,Saudi Arabia,0,10M – 18M,Mid-level,10000000.0,13.68,No,14.00M,Medium,2018
308,Truc-King,www.truc-king.com,Seed,"Transportation & Automotive,Technology ,Logistics","Aggregator,AI & Machine Learning,Autonomous Dr...",Riyadh,Saudi Arabia,10,1M – 3M,Startup / Small,1000000.0,75.6,Yes,2.00M,Medium,2024
336,Dunes Aero,Dunesaero.com,Seed,Technology,"Hardware & loT & Drones,AgriTech,AI & Machine ...",Makkah,Saudi Arabia,7,1M – 3M,Startup / Small,1000000.0,13.68,Yes,2.00M,Medium,2022
352,COGNNA,https://www.cognna.com/,Seed,"Technology ,Defence",Cyber Security,Riyadh,Saudi Arabia,1,1M – 3M,Startup / Small,1000000.0,75.6,Yes,2.00M,Medium,2022
380,Maghsala,https://www.maghsala.co/,Pre-Seed,Consumer Service,Marketplace,Riyadh,Saudi Arabia,1,500K – 2M,Startup / Small,500000000.0,75.6,Yes,1.25M,Low,2024
405,Rintel,https://www.rintel.co/,Pre-Seed,Food & Beverages,Consumer Services,Riyadh,Saudi Arabia,1,500K – 2M,Startup / Small,500000000.0,75.6,No,1.25M,Low,0


In [155]:
duplicates = df["Name"].value_counts()
print(duplicates[duplicates > 1])

Name
Barq      2
RevYou    2
Marn      2
Name: count, dtype: int64


### Unique Values per Column

This code iterates over all columns in the dataset and prints the number of unique values for each column, helping to understand data variety and identify potential categorical features.


In [156]:
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

Name: 457 unique values
Website: 453 unique values
Stage: 9 unique values
Industry: 146 unique values
Tags: 218 unique values
Region: 7 unique values
Startup HQ: 2 unique values
NumOfTags: 10 unique values
Funding (USD): 9 unique values
CompanySize: 4 unique values
Funding_min: 8 unique values
rate_percent: 7 unique values
Allign with 2030: 2 unique values
Funding_Avg: 9 unique values
Opportunity_Level: 3 unique values
Year of establishment: 22 unique values


### Detecting Outliers in Numeric Columns

This code examines the numeric columns `NumOfTags`, `Funding_min`, and `rate_percent` for outliers using the IQR method.  
For each column, it calculates the lower and upper bounds, identifies outlier rows, and prints their count along with the corresponding company names.


In [157]:
numeric_cols = ['NumOfTags', 'Funding_min', 'rate_percent']

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    
    print(f"Column: {col}")
    print(f"Lower bound: {lower_bound}, Upper bound: {upper_bound}")
    print(f"Number of outliers: {outliers.shape[0]}")
    print(outliers[[col, 'Name']]) 
    print("---------------------------------------------------\n")

Column: NumOfTags
Lower bound: -0.5, Upper bound: 3.5
Number of outliers: 46
     NumOfTags                            Name
190          7                     Aman Vision
193          4                          Declic
196          5                          Levers
205          4                      Amwal Tech
209         10                       Yourparts
228          7               Hysabat Solutions
231          7  Pnode Auditing |  بنود للتدقيق
232          4                         Nokhbah
238          7               FlyNow Arabia Ltd
240          4                          Maqsam
243          4                           Awfar
244          4                            Sahl
251          4                           Bynow
253          4                      Zerofunnel
262          5                  Swarm robotics
264          4                        Go Steer
266          4            Slangit Technologies
268          4              AQUIVIO Arabia LLC
272          5                

### Dataset Shape After Cleaning

This code displays the shape of the DataFrame after the cleaning steps and shows the first few rows to verify the changes.


In [158]:
print("Shape after cleaning:", df.shape)
df.head()

Shape after cleaning: (460, 16)


Unnamed: 0,Name,Website,Stage,Industry,Tags,Region,Startup HQ,NumOfTags,Funding (USD),CompanySize,Funding_min,rate_percent,Allign with 2030,Funding_Avg,Opportunity_Level,Year of establishment
0,Lean,https://www.leantech.me,Series B,Financial Services,Fintech Infrastructure,Riyadh,Saudi Arabia,1,15M – 30M,Startup / Small,15000000.0,75.6,Yes,22.50M,High,2019
1,Foodics,https://www.foodics.com/,Series C,"Technology ,Food & Beverages","SaaS,Financial Solution",Riyadh,Saudi Arabia,2,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2014
2,Salla,https://salla.com/,Private Equity,Technology,SaaS,Makkah,Saudi Arabia,1,50M – 500M+,Large,50000000.0,13.68,Yes,275.00M,High,2016
3,Nana,https://nana.co/en,Series C,Food & Beverages,"Last Mile Delivery,Marketplace,E-Commerce",Riyadh,Saudi Arabia,3,25M – 50M,Startup / Small,25000000.0,75.6,Yes,37.50M,High,2016
4,Mozn,https://www.mozn.sa,Pre-Series B,Technology,AI & Machine Learning,Riyadh,Saudi Arabia,1,10M – 18M,Mid-level,10000000.0,75.6,Yes,14.00M,Medium,2017


# Save new dataset after collection 

In [None]:
#df.to_csv("cleaned_dataset.csv", index=False)
#print("\nData cleaning completed. Cleaned data saved.") 