# **Indian Start-up Funding Analysis (2018 - 2021)**

## Importing all necessary modules

In [107]:
import pyodbc
from dotenv import dotenv_values
import pandas as pd
import numpy as np
import re
import warnings

warnings. filterwarnings('ignore')

## Data Loading 

### loading from csv files

In [108]:
# Loading 2018 funds data
data_2018 = pd.read_csv('data\startup_funding2018.csv')

# Data preview
data_2018.head()

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


In [109]:
# Loading 2019 funds data
data_2019 = pd.read_csv('data\startup_funding2019.csv')

# Data preview
data_2019.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


### Loading from the database

In [110]:
# Loading environment variables from .env file
environment_variables = dotenv_values('.env')

# Getting the values for the credentials set in the .env' file
server = environment_variables.get("SERVER")
database = environment_variables.get("DATABASE")
username = environment_variables.get("USERNAME")
password = environment_variables.get("PASSWORD")

# Creating a connection string
connection_string = f"DRIVER={{SQL Server}}; \
                    SERVER={server}; \
                    DATABASE={database}; \
                    UID={username}; \
                    PWD={password};"

# Connecting to the server
connection = pyodbc.connect(connection_string)

In [111]:
# Loading 2020 funds data
data_2020 = pd.read_sql_query(
    "SELECT * FROM LP1_startup_funding2020", connection)

# Saving the DataFrame to a CSV file
data_2020.to_csv('data/startup_funding2020.csv', index=False)

data_2020.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,
2,PadCare Labs,2018.0,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,
3,NCOME,2020.0,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,
4,Gramophone,2016.0,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge",340000.0,,


In [112]:
# Loading 2021 funds data
data_2021 = pd.read_sql_query(
    "SELECT * FROM LP1_startup_funding2021", connection)

# Saving the DataFrame to a CSV file
data_2021.to_csv('data/startup_funding2021.csv', index=False)

# Data preview
data_2021.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed


## Data information

In [113]:
data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   526 non-null    object
 1   Industry       526 non-null    object
 2   Round/Series   526 non-null    object
 3   Amount         526 non-null    object
 4   Location       526 non-null    object
 5   About Company  526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [114]:
data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  89 non-null     object 
 1   Founded        60 non-null     float64
 2   HeadQuarter    70 non-null     object 
 3   Sector         84 non-null     object 
 4   What it does   89 non-null     object 
 5   Founders       86 non-null     object 
 6   Investor       89 non-null     object 
 7   Amount($)      89 non-null     object 
 8   Stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB


In [115]:
data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1055 non-null   object 
 1   Founded        842 non-null    float64
 2   HeadQuarter    961 non-null    object 
 3   Sector         1042 non-null   object 
 4   What_it_does   1055 non-null   object 
 5   Founders       1043 non-null   object 
 6   Investor       1017 non-null   object 
 7   Amount         801 non-null    float64
 8   Stage          591 non-null    object 
 9   column10       2 non-null      object 
dtypes: float64(2), object(8)
memory usage: 82.6+ KB


In [116]:
data_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1209 non-null   object 
 1   Founded        1208 non-null   float64
 2   HeadQuarter    1208 non-null   object 
 3   Sector         1209 non-null   object 
 4   What_it_does   1209 non-null   object 
 5   Founders       1205 non-null   object 
 6   Investor       1147 non-null   object 
 7   Amount         1206 non-null   object 
 8   Stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB


In [117]:
data_2018.shape, data_2019.shape, data_2020.shape, data_2021.shape

((526, 6), (89, 9), (1055, 10), (1209, 9))

## Data Cleaning

### Columns

In [118]:
def standardize_column_names(df):
    # Creating a mapping based on common patterns found in the column names
    df.columns = [re.sub(r'(?i)^Company.*Name$', 'Company_Brand', col)
                  for col in df.columns]
    df.columns = [re.sub(r'(?i)^Amount.*$', 'Amount', col)
                  for col in df.columns]
    df.columns = [re.sub(r'(?i)^What.*does$', 'What_it_does', col)
                  for col in df.columns]
    df.columns = [re.sub(r'(?i)^Industry$', 'Sector', col)
                  for col in df.columns]
    df.columns = [re.sub(r'(?i)^Round.*Series$', 'Stage', col)
                  for col in df.columns]
    df.columns = [re.sub(r'(?i)^Location$', 'HeadQuarter', col)
                  for col in df.columns]
    df.columns = [re.sub(r'(?i)^About.*Company$', 'What_it_does', col)
                  for col in df.columns]
    return df


# Applying the function to standardize names
data_2018 = standardize_column_names(data_2018)
data_2019 = standardize_column_names(data_2019)
data_2020 = standardize_column_names(data_2020)
data_2021 = standardize_column_names(data_2021)

# Ensuring all DataFrames have the same set of columns
columns = ['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
           'Founders', 'Investor', 'Amount', 'Stage', 'Fund_Year',]

# Add a new column 'Fund_Year' to each DataFrame
data_2018['Fund_Year'] = 2018
data_2019['Fund_Year'] = 2019
data_2020['Fund_Year'] = 2020
data_2021['Fund_Year'] = 2021

data_2018 = data_2018.reindex(columns=columns, fill_value=None)
data_2019 = data_2019.reindex(columns=columns)
data_2020 = data_2020.reindex(columns=columns)
data_2021 = data_2021.reindex(columns=columns)

# Merging all DataFrames
data = pd.concat([data_2018, data_2019, data_2020,
                 data_2021], ignore_index=True)

# Saving the DataFrame to a CSV file
data.to_csv('data/startup_funding_merged.csv', index=False)

# info of the combined DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  2790 non-null   object 
 1   Founded        2110 non-null   float64
 2   HeadQuarter    2765 non-null   object 
 3   Sector         2861 non-null   object 
 4   What_it_does   2879 non-null   object 
 5   Founders       2334 non-null   object 
 6   Investor       2253 non-null   object 
 7   Amount         2622 non-null   object 
 8   Stage          1941 non-null   object 
 9   Fund_Year      2879 non-null   int64  
dtypes: float64(1), int64(1), object(8)
memory usage: 225.1+ KB


In [119]:
data = pd.read_csv('data\startup_funding_merged.csv')
data.shape

(2879, 10)

### Duplicates

In [120]:
data.duplicated().sum()


23

In [121]:
data = data.drop_duplicates()

In [122]:
data

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,Fund_Year
0,TheCollegeFever,,"Bangalore, Karnataka, India","Brand Marketing, Event Promotion, Marketing, S...","TheCollegeFever is a hub for fun, fiesta and f...",,,250000,Seed,2018
1,Happy Cow Dairy,,"Mumbai, Maharashtra, India","Agriculture, Farming",A startup which aggregates milk from dairy far...,,,"₹40,000,000",Seed,2018
2,MyLoanCare,,"Gurgaon, Haryana, India","Credit, Financial Services, Lending, Marketplace",Leading Online Loans Marketplace in India,,,"₹65,000,000",Series A,2018
3,PayMe India,,"Noida, Uttar Pradesh, India","Financial Services, FinTech",PayMe India is an innovative FinTech organizat...,,,2000000,Angel,2018
4,Eunimart,,"Hyderabad, Andhra Pradesh, India","E-Commerce Platforms, Retail, SaaS",Eunimart is a one stop solution for merchants ...,,,—,Seed,2018
...,...,...,...,...,...,...,...,...,...,...
2874,Gigforce,2019.0,Gurugram,Staffing & Recruiting,A gig/on-demand staffing company.,"Chirag Mittal, Anirudh Syal",Endiya Partners,$3000000,Pre-series A,2021
2875,Vahdam,2015.0,New Delhi,Food & Beverages,VAHDAM is among the world’s first vertically i...,Bala Sarda,IIFL AMC,$20000000,Series D,2021
2876,Leap Finance,2019.0,Bangalore,Financial Services,International education loans for high potenti...,"Arnav Kumar, Vaibhav Singh",Owl Ventures,$55000000,Series C,2021
2877,CollegeDekho,2015.0,Gurugram,EdTech,"Collegedekho.com is Student’s Partner, Friend ...",Ruchir Arora,"Winter Capital, ETS, Man Capital",$26000000,Series B,2021


### Company_Brand

In [123]:
# Trim white spaces and standardize text format to title case
data['Company_Brand'] = data['Company_Brand'].str.strip().str.title()

# checking for null values
data['Company_Brand'].isnull().sum()

89

In [124]:
# Replacing null values with the placeholder "Unknown"
data['Company_Brand'].fillna('Unknown', inplace=True)

### Founded

In [125]:
# Replacing null values with the median year
median_year = data['Founded'].median()
data['Founded'].fillna(median_year, inplace=True)

# Ensuring all entries are integers
data['Founded'] = data['Founded'].astype(int)

### Headquarter

In [126]:
# Function to simplify and standardize headquarters
def standardize_headquarters(hq):
    if pd.isna(hq):
        return "Unknown"              # Filling null values with a placeholder
    city = hq.split(',')[0].strip()   # Simplifying the entry to city name only
    return city.title()               # Converting to title case

data['HeadQuarter'] = data['HeadQuarter'].apply(standardize_headquarters)

# Check the unique values after cleaning
data['HeadQuarter'].unique()

array(['Bangalore', 'Mumbai', 'Gurgaon', 'Noida', 'Hyderabad',
       'Bengaluru', 'Kalkaji', 'Delhi', 'India', 'Hubli', 'New Delhi',
       'Chennai', 'Mohali', 'Kolkata', 'Pune', 'Jodhpur', 'Kanpur',
       'Ahmedabad', 'Azadpur', 'Haryana', 'Cochin', 'Faridabad', 'Jaipur',
       'Kota', 'Anand', 'Bangalore City', 'Belgaum', 'Thane', 'Margão',
       'Indore', 'Alwar', 'Kannur', 'Trivandrum', 'Ernakulam',
       'Kormangala', 'Uttar Pradesh', 'Andheri', 'Mylapore', 'Ghaziabad',
       'Kochi', 'Powai', 'Guntur', 'Kalpakkam', 'Bhopal', 'Coimbatore',
       'Worli', 'Alleppey', 'Chandigarh', 'Guindy', 'Lucknow', 'Unknown',
       'Telangana', 'Gurugram', 'Surat', 'Rajasthan', 'Tirunelveli',
       'Singapore', 'Gujarat', 'Kerala', 'Frisco', 'California',
       'Dhingsara', 'New York', 'Patna', 'San Francisco', 'San Ramon',
       'Paris', 'Plano', 'Sydney', 'San Francisco Bay Area', 'Bangaldesh',
       'London', 'Milano', 'Palmwoods', 'France', 'Samastipur', 'Irvine',
       'Tumkur

In [127]:
# Listing irrelevant entries 
irrelevant_entries = ['Computer Games', 'Food & Beverages',
                      'Online Media', 'Information Technology & Services']

# Mapping of common misspellings or variations to standardized city names
city_corrections = {
    'Bangalore City': 'Bangalore',
    'Bengaluru': 'Bangalore',
    'Gurugram': 'Gurgaon',
    'Noida': 'Noida',
    'Hyderebad': 'Hyderabad',
    'Banglore': 'Bangalore',
    'Ahmadabad': 'Ahmedabad',
    'Rajastan': 'Rajasthan',
    'San Franciscao': 'San Francisco',
    'San Francisco Bay Area': 'San Francisco',
    'Telugana': 'Telangana'
}

# Removing "\t#Ref!" from any entries
data['HeadQuarter'] = data['HeadQuarter'].replace(
    to_replace=r'\t#Ref!', value='', regex=True)


def correct_city_names(city):
    if city in irrelevant_entries:
        return "Unknown"  # Changing to "Unknown" for irrelevant categories
    return city_corrections.get(city, city)

data['HeadQuarter'] = data['HeadQuarter'].apply(correct_city_names)

# Verify the corrections 
data['HeadQuarter'].unique()

array(['Bangalore', 'Mumbai', 'Gurgaon', 'Noida', 'Hyderabad', 'Kalkaji',
       'Delhi', 'India', 'Hubli', 'New Delhi', 'Chennai', 'Mohali',
       'Kolkata', 'Pune', 'Jodhpur', 'Kanpur', 'Ahmedabad', 'Azadpur',
       'Haryana', 'Cochin', 'Faridabad', 'Jaipur', 'Kota', 'Anand',
       'Belgaum', 'Thane', 'Margão', 'Indore', 'Alwar', 'Kannur',
       'Trivandrum', 'Ernakulam', 'Kormangala', 'Uttar Pradesh',
       'Andheri', 'Mylapore', 'Ghaziabad', 'Kochi', 'Powai', 'Guntur',
       'Kalpakkam', 'Bhopal', 'Coimbatore', 'Worli', 'Alleppey',
       'Chandigarh', 'Guindy', 'Lucknow', 'Unknown', 'Telangana', 'Surat',
       'Rajasthan', 'Tirunelveli', 'Singapore', 'Gujarat', 'Kerala',
       'Frisco', 'California', 'Dhingsara', 'New York', 'Patna',
       'San Francisco', 'San Ramon', 'Paris', 'Plano', 'Sydney',
       'Bangaldesh', 'London', 'Milano', 'Palmwoods', 'France',
       'Samastipur', 'Irvine', 'Tumkur', 'Newcastle Upon Tyne',
       'Shanghai', 'Jiaxing', 'Ludhiana', 'Dehradu

### Sectors

In [128]:
# Converting all entries to lower case
data['Sector'] = data['Sector'].str.lower()

category_dict = {
    'Finance': set(['Credit Cards', 'Banking','Insuretech','Infratech','Saas\xa0\xa0Startup', 'Equity Management','Wealth Management','Saas  Startup','Insurtech','Crowdsourcing','Cryptocurrency','Online Financial Service', 'Neo-Banking', 'Capital Markets', 'Mutual Funds','Bank', 'Finance', 'Crypto', 'Account', 'Credit', 'Venture', 'Crowd', 'Blockchain', 'Fund', 'Lending', 'Trading', 'Wealth', 'Insurance', 'Remittance', 'Money', 'Equity', 'Investment', 'Mortgage', 'Financial Services', 'Nft', 'Payments']),
    'Agriculture': set(['Agritech','Agriculture', 'Soil-Tech','Fishery','Agri', 'Biotechnology', 'Industrial', 'Farming', 'Fish', 'Milk', 'Diary', 'Dairy', 'Dairy Startup']),
    'Technology': set(['Machine Learning','Hrtech','Ar/Vr','Technology','Ai', 'E-Connect','E-Market', 'Traveltech','Biotech','Medtech','Ad-Tech','Healthtech', 'Games', 'Computer & Network Security', 'Saas Startup', 'Scanning App', 'Cloud Company', 'Cybersecurity', 'Aero Company', 'Cloud Computing', 'Techonology', 'E-Learning', 'Content Management', 'Recruitment', 'Consultancy', 'Ecommerce', 'Ev', 'Designing', 'Networking', 'Product Studio', 'Ecommerce', 'Proptech', 'Techonology', 'Milk Startup', 'Craft Beer', 'Craft Beer', 'Online Credit Management Startup', 'Foodtech', 'Spacetech', 'Deisgning', 'Clothing', 'Logitech', 'Femtech', 'D2C', 'Skill Development', 'Martech', 'Luxury Car Startup', 'Emobility', 'It', 'Healthcare', 'Qsr Startup', 'Sportstech', 'E-Marketplace', 'Cleantech', 'Heathtech', 'Digital Mortgage', 'Innovation Management', 'Photonics Startup', 'Life Sciences', 'Cloud Kitchen', 'Content Marktplace', 'Vehicle Repair Startup', 'Photonics Startup', 'Nano Distribution Network','Artificial Intelligence', 'Fintech', 'Tech', 'Cloud', 'Artificial', 'Data', 'Internet', 'Things', 'Apps', 'Android', 'Software', 'Computer', 'Mobile', '3d Printing', 'Funding Platform', 'Applications', 'File', 'Embedded Systems', 'Portals', 'Fraud Detection', 'Search Engine', 'Nanotechnology', 'Security', 'Saas', 'Bit Company', 'Augmented Reality', 'Drone', 'Ar Startup', ']baas', 'App', '/', 'Virtual', 'It Startup', 'Photonics', 'E Tailor', 'Bai', 'Ai & Debt', 'Ai Company', 'Ai Chatbot', 'Iot Startup', 'Ai Startup', 'Iot', 'Social Platform', 'Ar Platform', 'Api Platform', 'Mlops Platform', 'Online Storytelling', 'Digital Platform', 'Paas Startup', 'Taas Startup', 'Digital Assistant']),
    'Food & Beverage': set(['Food & Beverage', 'Beverages', 'Foodtech', 'Craft Beer', 'Milk Startup','Beverage', 'Catering', 'Cook', 'Food', 'Restaurants']),
    'Transport': set(['Auto-Tech', 'Tyre Management', 'Automobiles','Automobile', 'E-Mobility', 'Autonomous Vehicles', 'Vehicle Repair Startup','Automotive', 'Air Transport', 'Transport', 'Logistics', 'Vehicle', 'Transportation', 'Aviation', 'Vehicles', 'Tyre', 'Fleet', 'Wheels', 'Aero', 'Mobility', 'Aeorspace', 'Wl & Rac Protection', 'Micro-Mobiity', 'Delivery Service']),
    'Business Intelligence': set(['Business Intelligence','Data Science','Analytics', 'Consulting', 'Human', 'Career', 'Erp', 'Advertising', 'Advertisement', 'Market Research', 'Entrepre', 'Recruit', 'Hr', 'Working', 'Sultancy', 'Advisory', 'Work', 'Job', 'Management', 'Skill', 'Legal', 'Crm', 'Specific Domain To Individuals', 'Information Services']),
    'Energy': set(['Renewables & Environment', 'Renewable Player','Electric Vehicle', 'Pollution Control Equiptment', 'Cleantech','Clean Energy', 'Energy', 'Boil &', 'Boil', 'Solar', 'Electricity', 'Environment']),
    'Hospitality': set(['Hospitality','Customer Service','Home Services', 'E Store','Customer Service Company', 'Co-Working', 'Accomodation', 'Cloud Kitchen','Customer', 'Hospital', 'Tourism', 'Events', 'Wedding', 'Travel', 'Hosts', 'Booking', 'Wedding', 'Qsr']),
    'Commerce': set(['Trading Platform','Consumer','Supply Chain Platform','B2B','Business Supplies & Equipment','Fmcg','E-Tail','Entreprenurship','Car Trade', 'Reatil Startup','E-Mobility','Estore', 'Capital Markets','E-Commerce', 'Sales & Services','Sales And Distribution', 'Estore', 'Retail Startup', 'Packaging Services', 'E-Marketplace','2', 'Trade', 'Enterprise', 'Commerce', 'Business', 'Commercial', 'Consumer Goods', 'Marketplace', 'Business Consumer', 'Marketing', 'Retail', 'Market', 'Store', 'Furniture', 'Wholesale', 'Wine & Spirits', 'Multinational', 'E-', 'Packaging', 'Sales', 'Tplace', 'Warehouse', 'Fm', 'Product', 'Merchandise', 'Reatil', 'Conglomerates', 'Invoice Discounting', 'Supply Chain', 'Car Service', 'Service Industry', 'Company-As-A-Service', 'Consumer Service', 'Facilities Support Services', 'Facilities Services']),
    'Manufacturing': set(['Mechanical & Industrial Engineering', 'Packaging Solution Startup', 'Manufacturing', 'Home Interior Services', 'Craft Beer', 'Product Studio', 'Luxury Car Startup', 'Mechanical Or Industrial Engineering','Battery', 'Manufacturing', 'Electronics', 'Industrial Automation', 'Aerospace', 'Conductor', 'Gaming', 'Robotics', 'Engineering', 'Mechanical', 'Appliance', 'Automation', 'Ev Startup', 'Startup Laboratory', 'E-Vehicle', 'Luxury Car']),
    'Media and Entertainment': set(['Media and Entertainment', 'Games', 'E-Sports', 'Celebrity Engagement', 'Content Creation', 'Virtual Auditing Startup', 'Content Marktplace','Media', 'Dating', 'Music', 'Audio', 'Gaming', 'Creative', 'Entertainment', 'Broadcasting', 'Video', 'Blogging', 'Content', 'Celebrity', 'Ott']),
    'Real Estate': set(['Commercial Real Estate',  'Interior & Decor', 'Co-Living','Apartment', 'Real Estate', 'Home', 'Interior', 'Construction', 'Rental', 'Housing', 'Accommodation', 'Hauz']),
    'Telecommunications': set(['Telecommunications','Telecommuncation','Telecommunication','Telecom', 'News', 'Escrow', 'Publication']),
    'Health': set(['Healthtech', 'Healthcare','Pharmaceuticals','Pharmaceuticals', 'Healtcare','Pharmaceutical','Pharmacy', 'Helathcare', 'Medical', 'Healthtech','Dental', 'Health', 'Health Insurance', 'Medic', 'Supplement', 'Biopharma', 'Veterinary', 'Pharma', 'Heathcare', 'Nutrition', 'Hygiene', 'Care', 'Sanitation', 'Bio', 'Cannabis', 'Tobacco', 'Sciences']),
    'Sports & Fitness': set(['Sports & Fitness', 'Sportstech','Sports', 'Esports', 'Game', 'Ball', 'Player', 'Manchester']),
    'Beauty and Fashion': set(['Skincare Startup', 'Foootwear','Eye Wear', 'Personal Care Startup', 'Beauty and Fashion', 'Clothing','Beauty', 'Cosmetic', 'Skincare', 'Fashion', 'Wear', 'Cosmetics', 'Textiles', 'Eyewear', 'Jewellery', 'Cloth', 'Eyeglasses']),
    'Government': set(['Defense & Space', 'Government', 'Advisory Firm','Communities', 'Smart Cities', 'Government', 'Classifieds', 'Community', 'Water', 'Defense', 'Pollution', 'Translation & Localization', 'Taxation', 'Maritime']),
    'Education': set(['E-Learning', 'EduTech','Edttech', 'E-Learning', 'Skill Development', 'E-Learning', 'Job Discovery Platform', 'E-Learning', 'Preschool Daycare', 'E-Learning', 'E-Learning','Edutech', 'Education', 'Learn', 'Edtech']),
    'NaN': set(['Nan','-', 'nan','NaN','—', None]),
    'LifeStyle': set(['LifeStyle', 'Lifestyle','Decor', 'Fitness','Home Decor','Arts & Crafts', 'Training', 'Wellness', 'Personal Care', 'Deisgn', 'Craft', 'Design', 'Podcast', 'Lifestyle', 'Spiritual', 'Matrimony', 'Living', 'Cultural', 'Home']),
    'Others': set([ 'Water Purification','Job Portal','Social Audio','Others','Cannabis Startup','Staffing & Recruiting','Human Resources','Venture Capital','Multinational Conglomerate Company','Venture Capitalist','Hauz Khas','Social Network','Coworking', 'Biomaterial Startup',  'Environmental Service',  'Content Publishing', 'Legaltech', 'Environmental Services', 'Data Intelligence', 'Work Fulfillment', 'Pet Care',  'Deeptech',  'Martech',  'Photonics Startup', 'Sanitation Solutions',   'Mutual Funds'])
}



### What_it_does

In [129]:
# Trim white spaces and standardize text format to title case
data['What_it_does'] = data['What_it_does'].str.strip().str.lower()

### Founders

In [130]:
# Trim white spaces and standardize text format to title case
data['Founders'] = data['Founders'].str.strip().str.title()

# Filling null values  with 'Unknown'
data['Founders'].fillna('Unknown', inplace=True)

### Investors

In [131]:
# Trim white spaces and standardize text format to title case
data['Investor'] = data['Investor'].str.strip().str.title()

# Filling null values  with 'Undisclosed' 
data['Investor'].fillna('Undisclosed', inplace=True)

### Amount

In [139]:
data

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,Fund_Year
0,Thecollegefever,2017,Bangalore,"brand marketing, event promotion, marketing, s...","thecollegefever is a hub for fun, fiesta and f...",Unknown,Undisclosed,250000,Seed,2018
1,Happy Cow Dairy,2017,Mumbai,"agriculture, farming",a startup which aggregates milk from dairy far...,Unknown,Undisclosed,"₹40,000,000",Seed,2018
2,Myloancare,2017,Gurgaon,"credit, financial services, lending, marketplace",leading online loans marketplace in india,Unknown,Undisclosed,"₹65,000,000",Series A,2018
3,Payme India,2017,Noida,"financial services, fintech",payme india is an innovative fintech organizat...,Unknown,Undisclosed,2000000,Pre-Seed,2018
4,Eunimart,2017,Hyderabad,"e-commerce platforms, retail, saas",eunimart is a one stop solution for merchants ...,Unknown,Undisclosed,—,Seed,2018
...,...,...,...,...,...,...,...,...,...,...
2874,Gigforce,2019,Gurgaon,staffing & recruiting,a gig/on-demand staffing company.,"Chirag Mittal, Anirudh Syal",Endiya Partners,$3000000,Pre-Series A,2021
2875,Vahdam,2015,New Delhi,food & beverages,vahdam is among the world’s first vertically i...,Bala Sarda,Iifl Amc,$20000000,Series D,2021
2876,Leap Finance,2019,Bangalore,financial services,international education loans for high potenti...,"Arnav Kumar, Vaibhav Singh",Owl Ventures,$55000000,Series C,2021
2877,Collegedekho,2015,Gurgaon,edtech,"collegedekho.com is student’s partner, friend ...",Ruchir Arora,"Winter Capital, Ets, Man Capital",$26000000,Series B,2021


In [133]:
data['Amount'].unique()

array(['250000', '₹40,000,000', '₹65,000,000', '2000000', '—', '1600000',
       '₹16,000,000', '₹50,000,000', '₹100,000,000', '150000', '1100000',
       '₹500,000', '6000000', '650000', '₹35,000,000', '₹64,000,000',
       '₹20,000,000', '1000000', '5000000', '4000000', '₹30,000,000',
       '2800000', '1700000', '1300000', '₹5,000,000', '₹12,500,000',
       '₹15,000,000', '500000', '₹104,000,000', '₹45,000,000', '13400000',
       '₹25,000,000', '₹26,400,000', '₹8,000,000', '₹60,000', '9000000',
       '100000', '20000', '120000', '₹34,000,000', '₹342,000,000',
       '$143,145', '₹600,000,000', '$742,000,000', '₹1,000,000,000',
       '₹2,000,000,000', '$3,980,000', '$10,000', '₹100,000',
       '₹250,000,000', '$1,000,000,000', '$7,000,000', '$35,000,000',
       '₹550,000,000', '$28,500,000', '$2,000,000', '₹240,000,000',
       '₹120,000,000', '$2,400,000', '$30,000,000', '₹2,500,000,000',
       '$23,000,000', '$150,000', '$11,000,000', '₹44,000,000',
       '$3,240,000', '₹60

### Stage

In [134]:
data['Stage'].unique()

array(['Seed', 'Series A', 'Angel', 'Series B', 'Pre-Seed',
       'Private Equity', 'Venture - Series Unknown', 'Grant',
       'Debt Financing', 'Post-IPO Debt', 'Series H', 'Series C',
       'Series E', 'Corporate Round', 'Undisclosed',
       'https://docs.google.com/spreadsheets/d/1x9ziNeaz6auNChIHnMI8U6kS7knTr3byy_YBGfQaoUA/edit#gid=1861303593',
       'Series D', 'Secondary Market', 'Post-IPO Equity',
       'Non-equity Assistance', 'Funding Round', nan, 'Fresh funding',
       'Pre series A', 'Series G', 'Post series A', 'Seed funding',
       'Seed fund', 'Series F', 'Series B+', 'Seed round', 'Pre-series A',
       'Pre-seed', 'Pre-series', 'Debt', 'Pre-series C', 'Pre-series B',
       'Bridge', 'Series B2', 'Pre- series A', 'Edge', 'Pre-Series B',
       'Seed A', 'Series A-1', 'Seed Funding', 'Pre-seed Round',
       'Seed Round & Series A', 'Pre Series A', 'Pre seed Round',
       'Angel Round', 'Pre series A1', 'Series E2', 'Seed Round',
       'Bridge Round', 'Pre seed

In [135]:
data['Stage'].isnull().sum()

929

In [136]:
# Dictionary for mapping stages to standardized terms
stage_mapping = {
    r'(?i)^angel$': 'Pre-Seed',
    r'(?i)^seed (funding|fund|round|investment|a|\+)?$': 'Seed',
    r'(?i)^pre[-\s]?seed( round)?$': 'Pre-Seed',
    r'(?i)^pre[-\s]?series[-\s]?a1?$': 'Pre-Series A',
    r'(?i)^pre[-\s]?series[-\s]?a$': 'Pre-Series A',
    r'(?i)^pre- series a$': 'Pre-Series A',
    r'(?i)^pre[-\s]?series[-\s]?b$': 'Pre-Series B',
    r'(?i)^pre[-\s]?series[-\s]?c$': 'Pre-Series C',
    r'(?i)^early seed$': 'Pre-Seed',
    r'(?i)^series a[-\s]?[1+2]?$': 'Series A',
    r'(?i)^series b[-\s]?[+2-3]?$': 'Series B',
    r'(?i)^series c, d$': 'Series C',
    r'(?i)^series d1$': 'Series D',
    r'(?i)^series e2$': 'Series E',
    r'(?i)^series f[1-2]?$': 'Series F',
    r'(?i)^venture - series unknown$': 'Venture',
    r'(?i)^post series a$': 'Post-Series A',
    r'(?i)^non-equity assistance$': 'Other',
    r'(?i)^corporate round$': 'Other',
    r'(?i)^bridge( round)?$': 'Other',
    r'(?i)^private equity$': 'PE',
    r'(?i)^secondary market$': 'Other',
    r'(?i)^debt financing$': 'Debt',
    r'(?i)^post-ipo (debt|equity)$': 'Post-IPO',
    r'(?i)^undisclosed$': 'Other',
    r'(?i)^funding round$': 'Other',
    r'(?i)^fresh funding$': 'Other',
    r'(?i)^mid series$': 'Other',
    r'(?i)^edge$': 'Other',
    r'(?i)^grant$': 'Grant',
    r'(?i)^seies a$': 'Series A',
    r'(?i)^pre[-\s]?series$': 'Pre-Seed',
    r'(?i)^angel round$': 'Pre-Seed',
    r'(?i)^seed round & series a$': 'Seed',
    r'(?i)^series i$': 'Series I'
}

# Applying the mappings using regular expressions
for pattern, replacement in stage_mapping.items():
    data['Stage'] = data['Stage'].str.replace(pattern, replacement, regex=True)

# Replacing numeric and erroneous entries with 'Unknown'
data['Stage'] = data['Stage'].replace(
    r'\$\d+', 'Unknown', regex=True)  # Catch dollar amounts
data['Stage'] = data['Stage'].replace(
    r'https?://\S+', 'Unknown', regex=True)  # Catch URLs

# Handle NaN values
data['Stage'].fillna('Unknown', inplace=True)


data['Stage'].unique()

array(['Seed', 'Series A', 'Pre-Seed', 'Series B', 'PE', 'Venture',
       'Grant', 'Debt', 'Post-IPO', 'Series H', 'Series C', 'Series E',
       'Other', 'Unknown', 'Series D', 'Pre-Series A', 'Series G',
       'Post-Series A', 'Series F', 'Pre-Series C', 'Pre-Series B',
       'Seed+', 'Series I'], dtype=object)

In [137]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2856 entries, 0 to 2878
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company_Brand  2856 non-null   object
 1   Founded        2856 non-null   int32 
 2   HeadQuarter    2856 non-null   object
 3   Sector         2838 non-null   object
 4   What_it_does   2856 non-null   object
 5   Founders       2856 non-null   object
 6   Investor       2856 non-null   object
 7   Amount         2600 non-null   object
 8   Stage          2856 non-null   object
 9   Fund_Year      2856 non-null   int64 
dtypes: int32(1), int64(1), object(8)
memory usage: 234.3+ KB


In [138]:
# # Exchange rate for INR to USD in 2018
# INR_to_USD_rate = 68

# # Function to clean and convert amounts
# def convert_amount_2018(amount):
#     amount_str = str(amount)
#     if '₹' in amount_str:
#         amount_value = float(amount_str.replace(
#             '₹', '').replace(',', '').strip())
#         return round(amount_value / INR_to_USD_rate, 2)
#     elif amount_str == '—':
#         return '0'
#     elif '$' in amount_str:
#         return float(amount_str.replace('$', '').replace(',', '').strip())
#     else:
#         return float(amount)

# data_2018['Amount'] = data_2018['Amount'].apply(convert_amount_2018)