# **Indian Start-up Funding Analysis (2018 - 2021)**

## Importing all necessary modules

In [94]:
import pyodbc
from dotenv import dotenv_values
import pandas as pd
import numpy as np
import re
import warnings

warnings. filterwarnings('ignore')

## Data Loading 

### loading from csv files

In [35]:
# Loading 2018 funds data
data_2018 = pd.read_csv('data\startup_funding2018.csv')

# Data preview
data_2018.head()

Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


In [36]:
# Loading 2019 funds data
data_2019 = pd.read_csv('data\startup_funding2019.csv')

# Data preview
data_2019.head()

Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C
2,Eduisfun,,Mumbai,Edtech,It aims to make learning fun via games.,Jatin Solanki,"Deepak Parekh, Amitabh Bachchan, Piyush Pandey","$28,000,000",Fresh funding
3,HomeLane,2014.0,Chennai,Interior design,Provides interior designing solutions,"Srikanth Iyer, Rama Harinath","Evolvence India Fund (EIF), Pidilite Group, FJ...","$30,000,000",Series D
4,Nu Genes,2004.0,Telangana,AgriTech,"It is a seed company engaged in production, pr...",Narayana Reddy Punyala,Innovation in Food and Agriculture (IFA),"$6,000,000",


### Loading from the database

In [4]:
# Loading environment variables from .env file
environment_variables = dotenv_values('.env')

# Getting the values for the credentials set in the .env' file
server = environment_variables.get("SERVER")
database = environment_variables.get("DATABASE")
username = environment_variables.get("USERNAME")
password = environment_variables.get("PASSWORD")

# Creating a connection string
connection_string = f"DRIVER={{SQL Server}}; \
                    SERVER={server}; \
                    DATABASE={database}; \
                    UID={username}; \
                    PWD={password};"

# Connecting to the server
connection = pyodbc.connect(connection_string)

In [37]:
# Loading 2020 funds data
data_2020 = pd.read_sql_query(
    "SELECT * FROM LP1_startup_funding2020", connection)

# Saving the DataFrame to a CSV file
data_2020.to_csv('data/startup_funding2020.csv', index=False)

data_2020.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,
2,PadCare Labs,2018.0,Pune,Hygiene management,Converting bio-hazardous waste to harmless waste,Ajinkya Dhariya,Venture Center,,Pre-seed,
3,NCOME,2020.0,New Delhi,Escrow,Escrow-as-a-service platform,Ritesh Tiwari,"Venture Catalysts, PointOne Capital",400000.0,,
4,Gramophone,2016.0,Indore,AgriTech,Gramophone is an AgTech platform enabling acce...,"Ashish Rajan Singh, Harshit Gupta, Nishant Mah...","Siana Capital Management, Info Edge",340000.0,,


In [72]:
# Loading 2021 funds data
data_2021 = pd.read_sql_query(
    "SELECT * FROM LP1_startup_funding2021", connection)

# Saving the DataFrame to a CSV file
data_2021.to_csv('data/startup_funding2021.csv', index=False)

# Data preview
data_2021.head()

Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",
2,Lead School,2012.0,Mumbai,EdTech,LEAD School offers technology based school tra...,"Smita Deorah, Sumeet Mehta","GSV Ventures, Westbridge Capital","$30,000,000",Series D
3,Bizongo,2015.0,Mumbai,B2B E-commerce,Bizongo is a business-to-business online marke...,"Aniket Deb, Ankit Tomar, Sachin Agrawal","CDC Group, IDG Capital","$51,000,000",Series C
4,FypMoney,2021.0,Gurugram,FinTech,"FypMoney is Digital NEO Bank for Teenagers, em...",Kapil Banwari,"Liberatha Kallat, Mukesh Yadav, Dinesh Nagpal","$2,000,000",Seed


## Data information

In [60]:
data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   526 non-null    object
 1   Industry       526 non-null    object
 2   Round/Series   526 non-null    object
 3   Amount         526 non-null    object
 4   Location       526 non-null    object
 5   About Company  526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [61]:
data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  89 non-null     object 
 1   Founded        60 non-null     float64
 2   HeadQuarter    70 non-null     object 
 3   Sector         84 non-null     object 
 4   What it does   89 non-null     object 
 5   Founders       86 non-null     object 
 6   Investor       89 non-null     object 
 7   Amount($)      89 non-null     object 
 8   Stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB


In [62]:
data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1055 non-null   object 
 1   Founded        842 non-null    float64
 2   HeadQuarter    961 non-null    object 
 3   Sector         1042 non-null   object 
 4   What_it_does   1055 non-null   object 
 5   Founders       1043 non-null   object 
 6   Investor       1017 non-null   object 
 7   Amount         801 non-null    float64
 8   Stage          591 non-null    object 
 9   column10       2 non-null      object 
dtypes: float64(2), object(8)
memory usage: 82.6+ KB


In [63]:
data_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1209 non-null   object 
 1   Founded        1208 non-null   float64
 2   HeadQuarter    1208 non-null   object 
 3   Sector         1209 non-null   object 
 4   What_it_does   1209 non-null   object 
 5   Founders       1205 non-null   object 
 6   Investor       1147 non-null   object 
 7   Amount         1206 non-null   object 
 8   Stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB


In [52]:
data_2018.shape, data_2019.shape, data_2020.shape, data_2021.shape

((526, 6), (89, 9), (1055, 10), (1209, 9))

## Data Cleaning

### Columns

In [95]:
def standardize_column_names(df):
    # Creating a mapping based on common patterns found in the column names
    df.columns = [re.sub(r'(?i)^Company.*Name$', 'Company_Brand', col) for col in df.columns]
    df.columns = [re.sub(r'(?i)^Amount.*$', 'Amount', col) for col in df.columns]
    df.columns = [re.sub(r'(?i)^What.*does$', 'What_it_does', col) for col in df.columns]
    df.columns = [re.sub(r'(?i)^Industry$', 'Sector', col) for col in df.columns]
    df.columns = [re.sub(r'(?i)^Round.*Series$', 'Stage', col) for col in df.columns]
    df.columns = [re.sub(r'(?i)^Location$', 'HeadQuarter', col) for col in df.columns]
    df.columns = [re.sub(r'(?i)^About.*Company$', 'What_it_does', col) for col in df.columns]
    return df

# Applying the function to standardize names
data_2018 = standardize_column_names(data_2018)
data_2019 = standardize_column_names(data_2019)
data_2020 = standardize_column_names(data_2020)
data_2021 = standardize_column_names(data_2021)

# Ensuring all DataFrames have the same set of columns
columns = ['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
           'Founders', 'Investor', 'Amount', 'Stage']
data_2018 = data_2018.reindex(columns=columns, fill_value=None)
data_2019 = data_2019.reindex(columns=columns)
data_2020 = data_2020.reindex(columns=columns)
data_2021 = data_2021.reindex(columns=columns)

# Merging all DataFrames
data = pd.concat(
    [data_2018, data_2019, data_2020, data_2021], ignore_index=True)

# Saving the DataFrame to a CSV file
data.to_csv('data/startup_funding_merged.csv', index=False)

#info of the combined DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  2790 non-null   object 
 1   Founded        2110 non-null   float64
 2   HeadQuarter    2765 non-null   object 
 3   Sector         2861 non-null   object 
 4   What_it_does   2879 non-null   object 
 5   Founders       2334 non-null   object 
 6   Investor       2253 non-null   object 
 7   Amount         2622 non-null   object 
 8   Stage          1941 non-null   object 
dtypes: float64(1), object(8)
memory usage: 202.6+ KB


In [160]:
data = pd.read_csv('data\startup_funding_merged.csv')

### Duplicates

In [161]:
data.duplicated().sum()


23

In [162]:
data = data.drop_duplicates()

### Company_Brand

In [163]:
# Trim white spaces and standardize text format to title case
data['Company_Brand'] = data['Company_Brand'].str.strip().str.title()

# checking for null values
data['Company_Brand'].isnull().sum()

89

In [164]:
# Replacing null values with the placeholder "Unknown"
data['Company_Brand'].fillna('Unknown', inplace=True)

### Founded

In [165]:
# Replacing null values with the median year
median_year = data['Founded'].median()
data['Founded'].fillna(median_year, inplace=True)

# Ensuring all entries are integers
data['Founded'] = data['Founded'].astype(int)

### Headquarter

In [166]:
# Function to simplify and standardize headquarters
def standardize_headquarters(hq):
    if pd.isna(hq):
        return "Unknown"              # Filling null values with a placeholder
    city = hq.split(',')[0].strip()   # Simplifying the entry to city name only
    return city.title()               # Converting to title case

data['HeadQuarter'] = data['HeadQuarter'].apply(standardize_headquarters)

# Check the unique values after cleaning
data['HeadQuarter'].unique()

array(['Bangalore', 'Mumbai', 'Gurgaon', 'Noida', 'Hyderabad',
       'Bengaluru', 'Kalkaji', 'Delhi', 'India', 'Hubli', 'New Delhi',
       'Chennai', 'Mohali', 'Kolkata', 'Pune', 'Jodhpur', 'Kanpur',
       'Ahmedabad', 'Azadpur', 'Haryana', 'Cochin', 'Faridabad', 'Jaipur',
       'Kota', 'Anand', 'Bangalore City', 'Belgaum', 'Thane', 'Margão',
       'Indore', 'Alwar', 'Kannur', 'Trivandrum', 'Ernakulam',
       'Kormangala', 'Uttar Pradesh', 'Andheri', 'Mylapore', 'Ghaziabad',
       'Kochi', 'Powai', 'Guntur', 'Kalpakkam', 'Bhopal', 'Coimbatore',
       'Worli', 'Alleppey', 'Chandigarh', 'Guindy', 'Lucknow', 'Unknown',
       'Telangana', 'Gurugram', 'Surat', 'Rajasthan', 'Tirunelveli',
       'Singapore', 'Gujarat', 'Kerala', 'Frisco', 'California',
       'Dhingsara', 'New York', 'Patna', 'San Francisco', 'San Ramon',
       'Paris', 'Plano', 'Sydney', 'San Francisco Bay Area', 'Bangaldesh',
       'London', 'Milano', 'Palmwoods', 'France', 'Samastipur', 'Irvine',
       'Tumkur

In [167]:
# Listing irrelevant entries 
irrelevant_entries = ['Computer Games', 'Food & Beverages',
                      'Online Media', 'Information Technology & Services']

# Mapping of common misspellings or variations to standardized city names
city_corrections = {
    'Bangalore City': 'Bangalore',
    'Bengaluru': 'Bangalore',
    'Gurugram': 'Gurgaon',
    'Noida': 'Noida',
    'Hyderebad': 'Hyderabad',
    'Banglore': 'Bangalore',
    'Ahmadabad': 'Ahmedabad',
    'Rajastan': 'Rajasthan',
    'San Franciscao': 'San Francisco',
    'San Francisco Bay Area': 'San Francisco',
    'Telugana': 'Telangana'
}

# Removing "\t#Ref!" from any entries
data['HeadQuarter'] = data['HeadQuarter'].replace(
    to_replace=r'\t#Ref!', value='', regex=True)


def correct_city_names(city):
    if city in irrelevant_entries:
        return "Unknown"  # Changing to "Unknown" for irrelevant categories
    return city_corrections.get(city, city)

data['HeadQuarter'] = data['HeadQuarter'].apply(correct_city_names)

# Verify the corrections 
data['HeadQuarter'].unique()

array(['Bangalore', 'Mumbai', 'Gurgaon', 'Noida', 'Hyderabad', 'Kalkaji',
       'Delhi', 'India', 'Hubli', 'New Delhi', 'Chennai', 'Mohali',
       'Kolkata', 'Pune', 'Jodhpur', 'Kanpur', 'Ahmedabad', 'Azadpur',
       'Haryana', 'Cochin', 'Faridabad', 'Jaipur', 'Kota', 'Anand',
       'Belgaum', 'Thane', 'Margão', 'Indore', 'Alwar', 'Kannur',
       'Trivandrum', 'Ernakulam', 'Kormangala', 'Uttar Pradesh',
       'Andheri', 'Mylapore', 'Ghaziabad', 'Kochi', 'Powai', 'Guntur',
       'Kalpakkam', 'Bhopal', 'Coimbatore', 'Worli', 'Alleppey',
       'Chandigarh', 'Guindy', 'Lucknow', 'Unknown', 'Telangana', 'Surat',
       'Rajasthan', 'Tirunelveli', 'Singapore', 'Gujarat', 'Kerala',
       'Frisco', 'California', 'Dhingsara', 'New York', 'Patna',
       'San Francisco', 'San Ramon', 'Paris', 'Plano', 'Sydney',
       'Bangaldesh', 'London', 'Milano', 'Palmwoods', 'France',
       'Samastipur', 'Irvine', 'Tumkur', 'Newcastle Upon Tyne',
       'Shanghai', 'Jiaxing', 'Ludhiana', 'Dehradu

### Sectors

In [152]:
# Converting all entries to lower case
data['Sector'] = data['Sector'].str.lower()



### What_it_does

In [175]:
# Trim white spaces and standardize text format to title case
data['What_it_does'] = data['What_it_does'].str.strip().str.lower()

### Founders

In [179]:
# Trim white spaces and standardize text format to title case
data['Founders'] = data['Founders'].str.strip().str.title()

# Filling null values  with 'Unknown'
data['Founders'].fillna('Unknown', inplace=True)

In [180]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2856 entries, 0 to 2878
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company_Brand  2856 non-null   object
 1   Founded        2856 non-null   int32 
 2   HeadQuarter    2856 non-null   object
 3   Sector         2838 non-null   object
 4   What_it_does   2856 non-null   object
 5   Founders       2856 non-null   object
 6   Investor       2232 non-null   object
 7   Amount         2600 non-null   object
 8   Stage          1927 non-null   object
dtypes: int32(1), object(8)
memory usage: 276.5+ KB


In [33]:
# Exchange rate for INR to USD in 2018
INR_to_USD_rate = 68

# Function to clean and convert amounts
def convert_amount_2018(amount):
    amount_str = str(amount)
    if '₹' in amount_str:
        amount_value = float(amount_str.replace(
            '₹', '').replace(',', '').strip())
        return round(amount_value / INR_to_USD_rate, 2)
    elif amount_str == '—':
        return '0'
    elif '$' in amount_str:
        return float(amount_str.replace('$', '').replace(',', '').strip())
    else:
        return float(amount)

data_2018['Amount'] = data_2018['Amount'].apply(convert_amount_2018)