# The Indian Start-up Ecosystem

### Installations

In [778]:
# %pip install seaborn
# %pip install plotly --upgrade

### Importation

In [779]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import re

### Load Data

In [780]:
# Read the four yearly funding files, one DataFrame per year (2018 -> 2021).
data_2018, data_2019, data_2020, data_2021 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/startup_funding2018.csv',
        'data/startup_funding2019.csv',
        'data/startup_funding2020.csv',
        'data/startup_funding2021.csv',
    )
)

### Exploratory Data Analysis

In [781]:
data_2018.columns

Index(['Company Name', 'Industry', 'Round/Series', 'Amount', 'Location',
       'About Company'],
      dtype='object')

In [782]:
# Map the 2018 column names onto the names used by the 2019-2021 files.
column_aliases_2018 = {
    'Company Name': 'Company/Brand',
    'Industry': 'Sector',
    'Round/Series': 'Stage',
    'Amount': 'Amount($)',
    'Location': 'HeadQuarter',
    'About Company': 'What it does',
}
data_2018.rename(columns=column_aliases_2018, inplace=True)

In [783]:
data_2018.columns

Index(['Company/Brand', 'Sector', 'Stage', 'Amount($)', 'HeadQuarter',
       'What it does'],
      dtype='object')

In [784]:
data_2018.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...


In [785]:
# The 2018 frame has no founder/investor columns; add them filled with the
# placeholder text 'Unknown'.
for placeholder_column in ('Founders', 'Investor'):
    data_2018[placeholder_column] = 'Unknown'
data_2018.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,Unknown,Unknown
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,Unknown,Unknown
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,Unknown,Unknown
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,Unknown,Unknown


In [786]:
data_2018.columns

Index(['Company/Brand', 'Sector', 'Stage', 'Amount($)', 'HeadQuarter',
       'What it does', 'Founders', 'Investor'],
      dtype='object')

#### Dropping the `Founded` column from every dataset since it is not required for the project

In [787]:
# 'Founded' is removed from each of the 2019-2021 frames.
for yearly_frame in (data_2019, data_2020, data_2021):
    yearly_frame.drop(columns='Founded', inplace=True)

# The 2020 frame additionally has an 'Unnamed: 9' column, which is removed too.
data_2020.drop(columns='Unnamed: 9', inplace=True)


## Data Cleaning

In [788]:
data_2019.columns

Index(['Company/Brand', 'HeadQuarter', 'Sector', 'What it does', 'Founders',
       'Investor', 'Amount($)', 'Stage'],
      dtype='object')

In [789]:
data_2020.columns

Index(['Company/Brand', 'HeadQuarter', 'Sector', 'What it does', 'Founders',
       'Investor', 'Amount($)', 'Stage'],
      dtype='object')

In [790]:
data_2021.columns

Index(['Company/Brand', 'HeadQuarter', 'Sector', 'What it does', 'Founders',
       'Investor', 'Amount($)', 'Stage'],
      dtype='object')

In [791]:
# Stack the four yearly frames (2018 first) into one combined frame.
dataframes = [data_2018, data_2019, data_2020, data_2021]

# ignore_index=True discards each frame's own row labels and renumbers the
# combined rows from 0.
data = pd.concat(dataframes, ignore_index=True)
data

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,Unknown,Unknown
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,"₹65,000,000","Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,Unknown,Unknown
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,Unknown,Unknown
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,Unknown,Unknown
...,...,...,...,...,...,...,...,...
2874,Gigforce,Staffing & Recruiting,Pre-series A,$3000000,Gurugram,A gig/on-demand staffing company.,"Chirag Mittal, Anirudh Syal",Endiya Partners
2875,Vahdam,Food & Beverages,Series D,$20000000,New Delhi,VAHDAM is among the world’s first vertically i...,Bala Sarda,IIFL AMC
2876,Leap Finance,Financial Services,Series C,$55000000,Bangalore,International education loans for high potenti...,"Arnav Kumar, Vaibhav Singh",Owl Ventures
2877,CollegeDekho,EdTech,Series B,$26000000,Gurugram,"Collegedekho.com is Student’s Partner, Friend ...",Ruchir Arora,"Winter Capital, ETS, Man Capital"


#### Engineering a feature 'Original Currency' to track which rows need their currency converted

In [792]:
# Trim surrounding whitespace, then drop thousands separators. str(raw) also
# turns any non-text entry into text (a missing value becomes 'nan').
data['Amount($)'] = data['Amount($)'].str.strip().apply(lambda raw: str(raw).replace(',', ''))

# A value counts as Dollars when it has no rupee sign and either carries a '$'
# or is at most 8 characters long; everything else is tagged Rupees.
data['Original Currency'] = data['Amount($)'].apply(
    lambda x: 'Dollars' if '₹' not in x and ('$' in x or len(x) <= 8) else 'Rupees'
)
data.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor,Original Currency
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown,Dollars
1,Happy Cow Dairy,"Agriculture, Farming",Seed,₹40000000,"Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,Unknown,Unknown,Rupees
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,₹65000000,"Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,Unknown,Unknown,Rupees
3,PayMe India,"Financial Services, FinTech",Angel,2000000,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,Unknown,Unknown,Dollars
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,—,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,Unknown,Unknown,Dollars


In [793]:
def clean_amount():
    """Convert the global ``data['Amount($)']`` column from text to float.

    Each entry is processed as text in this order: currency symbols ('$', '₹')
    are removed; any entry that then contains 'osed' or a lowercase 'n' is
    replaced by '—'; every '—' becomes '0'; commas and spaces are removed.
    The result is parsed with ``pd.to_numeric(errors='coerce')``, so text that
    still is not a number ends up as NaN. The column is overwritten in place
    and nothing is returned.
    """
    def _normalise(raw):
        text = str(raw).replace('$', "").replace('₹', '')
        if re.search('osed|n', text):
            text = '—'
        # order matters: the dash placeholder must be turned into '0' first
        for unwanted, replacement in (('—', '0'), (',', ""), (' ', "")):
            text = text.replace(unwanted, replacement)
        return text

    amount_text = data['Amount($)'].astype(str).apply(_normalise)
    data['Amount($)'] = pd.to_numeric(amount_text, errors='coerce').astype(float)
    

clean_amount()
data.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor,Original Currency
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000.0,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown,Dollars
1,Happy Cow Dairy,"Agriculture, Farming",Seed,40000000.0,"Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,Unknown,Unknown,Rupees
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,65000000.0,"Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,Unknown,Unknown,Rupees
3,PayMe India,"Financial Services, FinTech",Angel,2000000.0,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,Unknown,Unknown,Dollars
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,0.0,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,Unknown,Unknown,Dollars


In [794]:
null_values = data['Amount($)'].isna().sum()
percentage = (null_values/data.shape[0]) * 100

print(f'This is {percentage:.2f}% of our data')

#Not bad

This is 0.35% of our data


#### Now let's convert the values in the amount column that were tracked as rupees to dollars, and then delete the 'Original Currency' column. It has served its purpose

#### Assumptions

1. If a value in the amount column has a currency symbol, it belongs to that currency
2. If a value has no symbol and is longer than 8 characters, then that value is in Rupees
3. The conversion rate used is the mid-year rate of 30th June 2018

In [795]:
# Multiplier applied to every amount that was tagged as Rupees.
exchange_rate = 0.0146
my_condition = data['Original Currency'].eq('Rupees')

# Scale only the rupee rows; rows tagged Dollars are left untouched.
data.loc[my_condition, 'Amount($)'] *= exchange_rate

data.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor,Original Currency
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000.0,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown,Dollars
1,Happy Cow Dairy,"Agriculture, Farming",Seed,584000.0,"Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...,Unknown,Unknown,Rupees
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,949000.0,"Gurgaon, Haryana, India",Leading Online Loans Marketplace in India,Unknown,Unknown,Rupees
3,PayMe India,"Financial Services, FinTech",Angel,2000000.0,"Noida, Uttar Pradesh, India",PayMe India is an innovative FinTech organizat...,Unknown,Unknown,Dollars
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,0.0,"Hyderabad, Andhra Pradesh, India",Eunimart is a one stop solution for merchants ...,Unknown,Unknown,Dollars


In [796]:
# The 'Original Currency' helper column is no longer needed; remove it in place.

del data['Original Currency']

In [797]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  2879 non-null   object 
 1   Sector         2861 non-null   object 
 2   Stage          1941 non-null   object 
 3   Amount($)      2869 non-null   float64
 4   HeadQuarter    2765 non-null   object 
 5   What it does   2879 non-null   object 
 6   Founders       2860 non-null   object 
 7   Investor       2779 non-null   object 
dtypes: float64(1), object(7)
memory usage: 180.1+ KB


#### Now to deal with the headquarters that list multiple locations

Assumptions

1. The first place listed is the principal headquarters
2. Since the entire dataset is about India, India can be dropped as well


Helper function to clean up the headquarters column

In [798]:
data['HeadQuarter'] = data['HeadQuarter'].apply(str)

def remove_excess_headquaters(string):
    """Return the principal headquarters: the text before the first comma.

    Parameters
    ----------
    string : str
        Raw headquarters text, e.g. 'Bangalore, Karnataka, India'.

    Returns
    -------
    str or None
        The part of ``string`` before its first comma, or ``string`` unchanged
        when it contains no comma (e.g. 'Gurugram'). ``None`` is returned for
        non-string input and for the literal text 'nan'.
    """
    # 'nan' is what str() produces for a missing value; keep such entries
    # missing instead of treating them as a place name.
    if not isinstance(string, str) or string == 'nan':
        return None
    # Previously a value without a comma fell through and returned None,
    # which erased every single-location headquarters.
    return string.split(',', 1)[0]
    
data['HeadQuarter'] = data['HeadQuarter'].apply(remove_excess_headquaters)
data.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000.0,Bangalore,"TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown
1,Happy Cow Dairy,"Agriculture, Farming",Seed,584000.0,Mumbai,A startup which aggregates milk from dairy far...,Unknown,Unknown
2,MyLoanCare,"Credit, Financial Services, Lending, Marketplace",Series A,949000.0,Gurgaon,Leading Online Loans Marketplace in India,Unknown,Unknown
3,PayMe India,"Financial Services, FinTech",Angel,2000000.0,Noida,PayMe India is an innovative FinTech organizat...,Unknown,Unknown
4,Eunimart,"E-Commerce Platforms, Retail, SaaS",Seed,0.0,Hyderabad,Eunimart is a one stop solution for merchants ...,Unknown,Unknown


In [799]:
data['Sector'].unique()

array(['Brand Marketing, Event Promotion, Marketing, Sponsorship, Ticketing',
       'Agriculture, Farming',
       'Credit, Financial Services, Lending, Marketplace',
       'Financial Services, FinTech',
       'E-Commerce Platforms, Retail, SaaS',
       'Cloud Infrastructure, PaaS, SaaS',
       'Internet, Leisure, Marketplace', 'Market Research',
       'Information Services, Information Technology', 'Mobile Payments',
       'B2B, Shoes', 'Internet',
       'Apps, Collaboration, Developer Platform, Enterprise Software, Messaging, Productivity Tools, Video Chat',
       'Food Delivery', 'Industrial Automation',
       'Automotive, Search Engine, Service Industry',
       'Finance, Internet, Travel',
       'Accounting, Business Information Systems, Business Travel, Finance, SaaS',
       'Artificial Intelligence, Product Search, SaaS, Service Industry, Software',
       'Internet of Things, Waste Management',
       'Air Transportation, Freight Service, Logistics, Marine Transport

### Create a function that pushes specific keywords or expressions into a particular list. This will be used to recategorize the sectors

Goals
1. There are nan and '—' values that should be handled first.
2. The goal is to replace those values with what the company does; this way we have some keywords to use to gauge what sector the company is in
3. Change sectors that are just 'consumer' to 'business consumer' upon further investigation (for regex purposes)

In [800]:
# Work on lower-cased text; str() turns a missing sector into the text 'nan'.
data['Sector'] = data['Sector'].apply(str).str.lower()

# Rows whose sector is missing ('nan') or a dash placeholder borrow the
# company description instead, so there is text to classify later.
condition_1 = data['Sector'].isin(['nan', '—'])
data.loc[condition_1, 'Sector'] = data.loc[condition_1, 'What it does']
data.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor
0,TheCollegeFever,"brand marketing, event promotion, marketing, s...",Seed,250000.0,Bangalore,"TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown
1,Happy Cow Dairy,"agriculture, farming",Seed,584000.0,Mumbai,A startup which aggregates milk from dairy far...,Unknown,Unknown
2,MyLoanCare,"credit, financial services, lending, marketplace",Series A,949000.0,Gurgaon,Leading Online Loans Marketplace in India,Unknown,Unknown
3,PayMe India,"financial services, fintech",Angel,2000000.0,Noida,PayMe India is an innovative FinTech organizat...,Unknown,Unknown
4,Eunimart,"e-commerce platforms, retail, saas",Seed,0.0,Hyderabad,Eunimart is a one stop solution for merchants ...,Unknown,Unknown


In [801]:
data['Sector'].unique()

array(['brand marketing, event promotion, marketing, sponsorship, ticketing',
       'agriculture, farming',
       'credit, financial services, lending, marketplace',
       'financial services, fintech',
       'e-commerce platforms, retail, saas',
       'cloud infrastructure, paas, saas',
       'internet, leisure, marketplace', 'market research',
       'information services, information technology', 'mobile payments',
       'b2b, shoes', 'internet',
       'apps, collaboration, developer platform, enterprise software, messaging, productivity tools, video chat',
       'food delivery', 'industrial automation',
       'automotive, search engine, service industry',
       'finance, internet, travel',
       'accounting, business information systems, business travel, finance, saas',
       'artificial intelligence, product search, saas, service industry, software',
       'internet of things, waste management',
       'air transportation, freight service, logistics, marine transport

#### Create a function that pushes specific keywords or expressions into a particular list. This will be used to recategorize the sectors

To make the multi-valued sector column more concise, I will create a function
and use the re module to search for specific keywords for a particular industry/sector


In [802]:
# Some sector names are too terse for keyword matching; expand them first.

sector_aliases = {'consumer': 'business consumer', 'ev': 'e-vehicle', 'it': 'tech'}
data['Sector'] = data['Sector'].str.lower().apply(lambda name: sector_aliases.get(name, name))


# Every distinct sector text currently present in the data.
known_sectors = data['Sector'].unique()

# One (initially empty) bucket per target category; the classifier below
# appends raw sector texts to these lists.
finance, tech, beauty, education, food = [], [], [], [], []
sports, hospitality, commerce, manufacturing, agriculture = [], [], [], [], []
health, government, entertainment, business_intelligence, transport = [], [], [], [], []
lifestyle, energy, real_estate, tele_comm, fashion = [], [], [], [], []


def sector_redistribution(known_sectors):
    """Classify raw sector texts into broad categories and relabel ``data['Sector']``.

    Each text in ``known_sectors`` is tested against the keyword regexes below,
    top to bottom; the first regex that matches decides the category and the
    text is appended to that category's module-level bucket list (``finance``,
    ``transport``, ...). Texts matching no regex are left in no bucket.

    Once every text is classified, ``data['Sector']`` is rewritten in a single
    pass: a value found in a bucket is replaced by that bucket's display label,
    any other value is kept as is.

    Parameters
    ----------
    known_sectors : iterable of str
        Sector texts to classify (the notebook passes ``data['Sector'].unique()``).

    Notes
    -----
    The bucket lists are appended to, never cleared, so they should be empty
    before the call. The relabelling used to be repeated inside the loop
    (20 full-column passes per sector text); it now runs once after the loop.
    The regexes are case-sensitive, so the capitalised alternatives
    ('Marketplace', 'Industrial') can only match text that was not lower-cased.
    """
    # (keyword regex, bucket list, display label) -- order is the match priority.
    rules = (
        ('bank|fintech|financ|crypto|account|credit|venture|crowd|blockchain|fund|lending|trading|wealth|insurance|insurance|remittance|money|wealth|equity|investment|mortgage|nft|payments',
         finance, 'Finance'),
        ('automotive|air transport|transport|logistics|vehicle|transportation|delivery|aviation|vehicles|tyre|fleet|wheels|aero|luxury car|mobility|aeorspace|wl & rac protection|micro-mobiity',
         transport, 'Transport'),
        ('clean energy|energy|oil &|oil|solar|electricity|environment',
         energy, 'Energy'),
        ('analytics|consulting|human|career|erp|advertising|advertisement|market research|business|entrepre|recruit|hr|working|sultancy|advisory|work|job|management|skill|legal|crm',
         business_intelligence, 'Business Intelligence'),
        ('intelligence|tech|iot|crypto|cloud|artificial|data|net|things|apps|droid|software|computer|mobile|3d printing|funding platform|applications|file|embedded systems|online portals|fraud detection|search engine|nanotechnology|security|ai|saas|it company|augmented reality|platform|drone|ar startup|aas|app|online|/|virtual|it startup|photonics',
         tech, 'Technology'),
        ('beauty|cosmetic|skincare',
         beauty, 'Beauty'),
        ('fashion|wear|cosmetics|textiles|eye|jewellery|cloth',
         fashion, 'Fashion'),
        ('decor|fitness|training|wellness|personal care|deisgn|craft|design|podcast|lifestyle|spiritual|matrimony|living|cultural',
         lifestyle, 'Lifestyle'),
        ('edutech|education|learn|children|child|collaboration|edtech',
         education, 'Education'),
        ('beverage|catering|cook|food',
         food, 'Food & Beverages'),
        ('sports|esports|game|ball|player|manchester',
         sports, 'Sports'),
        ('customer|service|hospital|tourism|events|weedding|travel|hosts|booking|wedding|qsr',
         hospitality, 'Hospitality'),
        ('2|trade|enterpise|commerce|business|commercial|consumer goods|Marketplace|business consumer|marketing|retail|market|store|furniture|wholesale|wine & spirits|multinational|e-|packaging|sales|tplace|warehouse|fm|product|merchandise|reatil',
         commerce, 'Commerce'),
        ('dental|health|health insurance|medic|supplement|biopharma|veterinary|pharma|heathcare|nutrition|hygiene|care|sanitation|bio|cannabis|tobacco|sciences',
         health, 'Health'),
        ('agri|biotechnology|Industrial|farming|fish|milk',
         agriculture, 'Agriculture'),
        ('battery|manufacturing|electronics|industrial automation|aerospace|conductor|gaming|robotics|engineering|mechanical|appliance|automation|ev startup|startup laboratory|e-vehicle',
         manufacturing, 'Manufacturing'),
        ('communities|smart cities|government|classifieds|community|water|defense|pollution|translation & localization|taxation|maritime',
         government, 'Government'),
        ('media|dating|music|audio|gaming|creative|entertainment|broadcasting|video|blogging|content|celebrity|ott',
         entertainment, 'Media & Entertainment'),
        ('apartment|real estate|home|interior|construction|rental|housing|accomodation|hauz',
         real_estate, 'Real Estate'),
        ('telecom|news|escrow|publication',
         tele_comm, 'Telecommunications'),
    )

    # Step 1: put each sector text into the bucket of the first matching rule.
    for sector in known_sectors:
        for pattern, bucket, _label in rules:
            if re.search(pattern, sector):
                bucket.append(sector)
                break

    # Step 2: build one text -> label lookup from the buckets.
    label_for = {}
    for _pattern, bucket, label in rules:
        for bucketed_sector in bucket:
            label_for.setdefault(bucketed_sector, label)

    # Step 3: relabel the column once; unclassified values pass through unchanged.
    data['Sector'] = data['Sector'].apply(lambda x: label_for.get(x, x))
       

sector_redistribution(known_sectors)


In [803]:
data['Sector'].unique()

array(['Commerce', 'Agriculture', 'Finance', 'Technology',
       'Business Intelligence', 'Transport', 'Manufacturing',
       'Food & Beverages', 'Education', 'Energy', 'Lifestyle',
       'Hospitality', 'Health', 'Sports', 'Fashion', 'Beauty',
       'Real Estate', 'Media & Entertainment', 'Government',
       'Telecommunications'], dtype=object)

In [804]:
data['Sector'].unique().shape

(20,)

In [805]:
data[(data['Sector'] == 'nan') | (data['Sector'] == '—')] # all set

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor


### Creating a function to remove all the excess locations from the HeadQuarter column except the first words before a comma

### Clean up stage column

In [806]:
data['Stage'].unique()

array(['Seed', 'Series A', 'Angel', 'Series B', 'Pre-Seed',
       'Private Equity', 'Venture - Series Unknown', 'Grant',
       'Debt Financing', 'Post-IPO Debt', 'Series H', 'Series C',
       'Series E', 'Corporate Round', 'Undisclosed',
       'https://docs.google.com/spreadsheets/d/1x9ziNeaz6auNChIHnMI8U6kS7knTr3byy_YBGfQaoUA/edit#gid=1861303593',
       'Series D', 'Secondary Market', 'Post-IPO Equity',
       'Non-equity Assistance', 'Funding Round', nan, 'Fresh funding',
       'Pre series A', 'Series G', 'Post series A', 'Seed funding',
       'Seed fund', 'Series F', 'Series B+', 'Seed round', 'Pre-series A',
       'Pre-seed', 'Pre-series', 'Debt', 'Pre-series C', 'Pre-series B',
       'Bridge', 'Series B2', 'Pre- series A', 'Edge', 'Pre-Series B',
       'Seed A', 'Series A-1', 'Seed Funding', 'Pre-seed Round',
       'Seed Round & Series A', 'Pre Series A', 'Pre seed Round',
       'Angel Round', 'Pre series A1', 'Series E2', 'Seed Round',
       'Bridge Round', 'Pre seed

In [807]:
data.head()

Unnamed: 0,Company/Brand,Sector,Stage,Amount($),HeadQuarter,What it does,Founders,Investor
0,TheCollegeFever,Commerce,Seed,250000.0,Bangalore,"TheCollegeFever is a hub for fun, fiesta and f...",Unknown,Unknown
1,Happy Cow Dairy,Agriculture,Seed,584000.0,Mumbai,A startup which aggregates milk from dairy far...,Unknown,Unknown
2,MyLoanCare,Finance,Series A,949000.0,Gurgaon,Leading Online Loans Marketplace in India,Unknown,Unknown
3,PayMe India,Finance,Angel,2000000.0,Noida,PayMe India is an innovative FinTech organizat...,Unknown,Unknown
4,Eunimart,Technology,Seed,0.0,Hyderabad,Eunimart is a one stop solution for merchants ...,Unknown,Unknown


In [817]:
# a = data['Investor'].isna().sum()
# pd.Series(a)
# a

data['Investor'].unique().shape

(1779,)

In [820]:
data['Investor'].unique()

array(['Unknown', 'Sixth Sense Ventures', 'General Atlantic', ...,
       'Owl Ventures', 'Winter Capital, ETS, Man Capital',
       '3one4 Capital, Kalaari Capital'], dtype=object)

In [814]:
from IPython.display import display

display(data['Investor'])

0                                Unknown
1                                Unknown
2                                Unknown
3                                Unknown
4                                Unknown
                      ...               
2874                     Endiya Partners
2875                            IIFL AMC
2876                        Owl Ventures
2877    Winter Capital, ETS, Man Capital
2878      3one4 Capital, Kalaari Capital
Name: Investor, Length: 2879, dtype: object