In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly
import plotly.express as px
from matplotlib.pyplot import figure
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go

In [3]:
# Load the unicorn-companies dataset from the CSV located next to the notebook.
df = pd.read_csv(filepath_or_buffer='./Unicorn_Companies.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


In [5]:
# Show the column labels exactly as they were read from the CSV.
df.columns

Index(['Company', 'Valuation', 'Date Joined', 'Industry', 'City', 'Country',
       'Continent', 'Year Founded', 'Funding', 'Select Investors'],
      dtype='object')

In [6]:
# Shorten three column labels in a single in-place rename.
df.rename(
    columns={
        'Date Joined': 'UniDate',
        'Year Founded': 'FoundedOn',
        'Select Investors': 'Investors',
    },
    inplace=True,
)

In [7]:
# Confirm that the renamed labels (UniDate, FoundedOn, Investors) are in place.
df.columns

Index(['Company', 'Valuation', 'UniDate', 'Industry', 'City', 'Country',
       'Continent', 'FoundedOn', 'Funding', 'Investors'],
      dtype='object')

In [8]:
# Inspect the dtype of every column before any type conversion.
df.dtypes

Company      object
Valuation    object
UniDate      object
Industry     object
City         object
Country      object
Continent    object
FoundedOn     int64
Funding      object
Investors    object
dtype: object

Before changing the data types, we first need to clean the column values: remove the `$` sign and the `B`/`M` suffix from 'Valuation' and 'Funding'.

In [9]:
# Draw one random row to see what the raw money strings look like.
df.sample(n=1)

Unnamed: 0,Company,Valuation,UniDate,Industry,City,Country,Continent,FoundedOn,Funding,Investors
466,WEMAKEPRICE,$2B,2015-09-09,E-commerce & direct-to-consumer,Seoul,South Korea,Asia,2009,$397M,"IMM Investment, NXC"


In [10]:
# Re-wrap df in a DataFrame.
# NOTE(review): df already comes from pd.read_csv, so this looks redundant — confirm before removing.
df = pd.DataFrame(df)

def convert(value):
    """Convert a money label such as '$180B' or '$397M' to a float in billions.

    Parameters
    ----------
    value : str, number or None
        A dollar string ending in 'B' (billions) or 'M' (millions), an
        already-numeric value, or a missing value.

    Returns
    -------
    float
        The amount in billions of dollars. ``np.nan`` is returned for None,
        for strings without a recognised suffix (e.g. 'Unknown') and for
        strings whose numeric part cannot be parsed.

    Notes
    -----
    Surrounding whitespace, thousands separators and lower-case suffixes are
    tolerated, so ' $1,200m ' is read as 1.2.
    """
    # A missing value must not crash the whole column conversion.
    if value is None:
        return np.nan

    # Already numeric (including NaN): just make sure it is a float.
    if not isinstance(value, str):
        return float(value)

    # Value by which the numeric part is divided to express it in billions.
    divisors = {'B': 1, 'M': 1000}

    cleaned = value.strip().replace('$', '').replace(',', '').upper()
    suffix = cleaned[-1:]  # slicing keeps this safe for an empty string
    if suffix not in divisors:
        return np.nan  # e.g. 'Unknown' or an empty cell

    try:
        return float(cleaned[:-1]) / divisors[suffix]
    except ValueError:
        return np.nan  # suffix present but the rest is not a number

# Turn both money columns into floats using convert().
for money_column in ('Valuation', 'Funding'):
    df[money_column] = df[money_column].apply(convert)

In [11]:
# Check the first five rows after the money columns were converted.
df.head(n=5)

Unnamed: 0,Company,Valuation,UniDate,Industry,City,Country,Continent,FoundedOn,Funding,Investors
0,Bytedance,180.0,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,8.0,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100.0,2012-12-01,Other,Hawthorne,United States,North America,2002,7.0,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100.0,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,2.0,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95.0,2014-01-23,Fintech,San Francisco,United States,North America,2010,2.0,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,4.0,"Institutional Venture Partners, Sequoia Capita..."


In [12]:
# Put the unit into the labels of the two converted money columns.
df.rename(
    columns={
        'Valuation': 'Valuation(in $ Billions)',
        'Funding': 'Funding(in $ Billions)',
    },
    inplace=True,
)

In [13]:
# Verify that the two money columns are now float64.
df.dtypes

Company                      object
Valuation(in $ Billions)    float64
UniDate                      object
Industry                     object
City                         object
Country                      object
Continent                    object
FoundedOn                     int64
Funding(in $ Billions)      float64
Investors                    object
dtype: object

Change the data type of UniDate and FoundedOn; for FoundedOn, keep only the year by using the `.dt.year` accessor.

In [14]:
# Parse the unicorn dates; values that cannot be parsed become NaT.
parsed_unicorn_dates = pd.to_datetime(df['UniDate'], errors='coerce')
df['UniDate'] = parsed_unicorn_dates

In [15]:
# Read the founding year as a '%Y' date, then keep only its year component.
founded_as_datetime = pd.to_datetime(df['FoundedOn'], format='%Y', errors='coerce')
df['FoundedOn'] = founded_as_datetime.dt.year

In [16]:
# Confirm the dtypes after the date conversions.
df.dtypes

Company                             object
Valuation(in $ Billions)           float64
UniDate                     datetime64[ns]
Industry                            object
City                                object
Country                             object
Continent                           object
FoundedOn                            int64
Funding(in $ Billions)             float64
Investors                           object
dtype: object

Extract the year in which each company became a unicorn from UniDate.

In [17]:
# Store the calendar year of UniDate in a new UniYear column.
df['UniYear'] = df['UniDate'].dt.year

### Data Analysis ###

In [18]:
# Preview the prepared frame, now including the UniYear column.
df.head(n=5)

Unnamed: 0,Company,Valuation(in $ Billions),UniDate,Industry,City,Country,Continent,FoundedOn,Funding(in $ Billions),Investors,UniYear
0,Bytedance,180.0,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,8.0,"Sequoia Capital China, SIG Asia Investments, S...",2017
1,SpaceX,100.0,2012-12-01,Other,Hawthorne,United States,North America,2002,7.0,"Founders Fund, Draper Fisher Jurvetson, Rothen...",2012
2,SHEIN,100.0,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,2.0,"Tiger Global Management, Sequoia Capital China...",2018
3,Stripe,95.0,2014-01-23,Fintech,San Francisco,United States,North America,2010,2.0,"Khosla Ventures, LowercaseCapital, capitalG",2014
4,Klarna,46.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,4.0,"Institutional Venture Partners, Sequoia Capita...",2011


In [19]:
# List the distinct industry labels to spot inconsistent spellings.
df['Industry'].unique()

array(['Artificial intelligence', 'Other',
       'E-commerce & direct-to-consumer', 'Fintech',
       'Internet software & services',
       'Supply chain, logistics, & delivery', 'Consumer & retail',
       'Data management & analytics', 'Edtech', 'Health', 'Hardware',
       'Auto & transportation', 'Travel', 'Cybersecurity',
       'Mobile & telecommunications', 'Artificial Intelligence'],
      dtype=object)

In [20]:
# Merge the differently capitalised AI label into the lower-case spelling.
df['Industry'] = df['Industry'].replace({'Artificial Intelligence': 'Artificial intelligence'})

In [21]:
# Collect the distinct values of four columns, then report how many there are.
no_of_companies, no_of_ind, no_of_country, no_of_city = (
    df[column_name].unique()
    for column_name in ('Company', 'Industry', 'Country', 'City')
)

for label, distinct_values in (
    ('No of listed companies', no_of_companies),
    ('No of Industries', no_of_ind),
    ('No of country', no_of_country),
    ('No of cities', no_of_city),
):
    print(f'{label}: {len(distinct_values)}')

No of listed companies: 1073
No of Industries: 15
No of country: 46
No of cities: 257


In [23]:
# Donut chart: share of companies in each industry.
industry_counts = df['Industry'].value_counts()

donut_trace = go.Pie(
    labels=industry_counts.index,
    values=industry_counts.values,
    textinfo='percent',
    hole=0.5,
)
fig = go.Figure(data=[donut_trace])

fig.update_layout(title='Distribution of Companies per Industry', template='plotly_white', showlegend=True)

fig.show()

In [25]:
# Bar chart: number of companies in each industry, labelled with the count.
industry_counts = df['Industry'].value_counts()

industry_bars = go.Bar(
    x=industry_counts.index,
    y=industry_counts.values,
    text=industry_counts.values,
    marker_color='green',
)
fig = go.Figure(data=[industry_bars])

layout_options = {
    'title': 'Number of Companies per Industry',
    'xaxis_tickangle': -45,
    'xaxis_title': 'Industry',
    'yaxis_title': 'Count',
    'template': 'plotly_white',
    'showlegend': False,
}
fig.update_layout(**layout_options)

fig.show()

In [27]:
# Bar chart: how many companies became unicorns in each year.
unicorn_year_counts = df['UniYear'].value_counts().sort_index()
unicorn_years = unicorn_year_counts.index

unicorn_year_bars = go.Bar(
    x=unicorn_years,
    y=unicorn_year_counts.values,
    text=unicorn_year_counts.values,
    textposition='auto',
    marker_color='skyblue',
)
fig = go.Figure(data=[unicorn_year_bars])

# One tick per year present in the data.
fig.update_layout(
    title='Count of Companies per Unicorn Label Year',
    xaxis={
        'title': 'Unicorn Label Year',
        'tickmode': 'array',
        'tickvals': unicorn_years,
        'ticktext': list(map(str, unicorn_years)),
    },
    yaxis_title='Count of Companies',
    template='plotly_white',
    showlegend=False,
)

fig.show()

In [29]:
# Bar chart: how many companies were founded in each year.
foundation_year_counts = df['FoundedOn'].value_counts().sort_index()
foundation_years = foundation_year_counts.index

foundation_year_bars = go.Bar(
    x=foundation_years,
    y=foundation_year_counts.values,
    text=foundation_year_counts.values,
    textposition='outside',
    marker_color='skyblue',
)
fig = go.Figure(data=[foundation_year_bars])

# One vertical tick label per founding year present in the data.
fig.update_layout(
    title='Count of Companies per Foundation Year',
    xaxis={
        'title': 'Foundation Year',
        'tickmode': 'array',
        'tickvals': foundation_years,
        'ticktext': list(map(str, foundation_years)),
        'tickangle': 90,
    },
    yaxis_title='Count of Companies',
    template='plotly_white',
    showlegend=False,
)

fig.show()

In [30]:
# Look up the rows whose founding year is 1919.
founded_in_1919 = df['FoundedOn'] == 1919
result = df.loc[founded_in_1919]
print(result)

                  Company  Valuation(in $ Billions)    UniDate Industry  \
189  Otto Bock HealthCare                       4.0 2017-06-24   Health   

           City  Country Continent  FoundedOn  Funding(in $ Billions)  \
189  Duderstadt  Germany    Europe       1919                     0.0   

        Investors  UniYear  
189  EQT Partners     2017  


### Otto Bock HealthCare (founded in 1919) is the oldest company in the list

In [32]:
# Line-and-marker chart: number of companies per country, countries in sorted order.
country_counts = df['Country'].value_counts().sort_index()

country_trace = go.Scatter(
    x=country_counts.index,
    y=country_counts.values,
    mode='lines+markers',
    line={'color': 'red', 'width': 2},
    marker={'color': 'black', 'size': 10},
)
fig = go.Figure(data=country_trace)

fig.update_layout(
    title='Number of Companies per Country',
    xaxis={'title': 'Country', 'showgrid': False},
    yaxis={'title': 'Number of Companies', 'showgrid': True, 'gridwidth': 1, 'gridcolor': 'lightgray'},
    template='plotly_white',
    showlegend=False,
)

fig.show()

In [34]:
# Line-and-marker chart: number of companies per continent, continents in sorted order.
continent_counts = df['Continent'].value_counts().sort_index()

continent_trace = go.Scatter(
    x=continent_counts.index,
    y=continent_counts.values,
    mode='lines+markers',
    line={'color': 'red', 'width': 2},
    marker={'color': 'black', 'size': 10},
)
fig = go.Figure(data=continent_trace)

fig.update_layout(
    title='Number of Companies per Continent',
    xaxis={'title': 'Continent', 'showgrid': False},
    yaxis={'title': 'Number of Companies', 'showgrid': True, 'gridwidth': 1, 'gridcolor': 'lightgray'},
    template='plotly_white',
    showlegend=False,
)

fig.show()

In [35]:
# Sum the funding of every company, rounded to two decimals.
funding_in_billions = df['Funding(in $ Billions)']
total_funding = funding_in_billions.sum().round(2)
print(f'Total Funded Amount = ${total_funding} Billion')

Total Funded Amount = $591.82 Billion


In [37]:
# Total funding per industry, drawn as a line with markers.
industry_funding = (
    df.groupby('Industry')['Funding(in $ Billions)']
    .sum()
    .reset_index()
)

# NOTE(review): text/textposition are passed but mode has no 'text' part,
# so the labels are presumably not drawn on the chart — confirm the intent.
funding_trace = go.Scatter(
    x=industry_funding['Industry'],
    y=industry_funding['Funding(in $ Billions)'],
    mode='lines+markers',
    line={'color': 'blue', 'width': 2},
    marker={'color': 'red', 'size': 10},
    text=industry_funding['Funding(in $ Billions)'],
    textposition='top center',
    hovertemplate='<b>Industry</b>: %{x}<br><b>Funding</b>: $%{y:.2f}B<br>',
)

fig = go.Figure()
fig.add_trace(funding_trace)

fig.update_layout(
    title='Funding per Industry',
    xaxis={'title': 'Industry', 'showgrid': False, 'tickangle': 330},
    yaxis={'title': 'Funded Amount (Billion $)', 'showgrid': True, 'gridwidth': 1, 'gridcolor': 'lightgray'},
    template='plotly_white',
    showlegend=False,
)

fig.show()

In [38]:
# Sum the valuation of every company, rounded to two decimals.
valuation_in_billions = df['Valuation(in $ Billions)']
total_valuation = valuation_in_billions.sum().round(2)
print(f'Total Valuation Amount = ${total_valuation} Billion')

Total Valuation Amount = $3711.0 Billion


In [40]:
# Total valuation per industry, drawn as a line with markers.
industry_valuation = (
    df.groupby('Industry')['Valuation(in $ Billions)']
    .sum()
    .reset_index()
)

# NOTE(review): text/textposition are passed but mode has no 'text' part,
# so the labels are presumably not drawn on the chart — confirm the intent.
valuation_trace = go.Scatter(
    x=industry_valuation['Industry'],
    y=industry_valuation['Valuation(in $ Billions)'],
    mode='lines+markers',
    line={'color': 'blue', 'width': 2},
    marker={'color': 'red', 'size': 10},
    text=industry_valuation['Valuation(in $ Billions)'],
    textposition='top center',
    hovertemplate='<b>Industry</b>: %{x}<br><b>valuation</b>: $%{y:.2f}B<br>',
)

fig = go.Figure()
fig.add_trace(valuation_trace)

fig.update_layout(
    title='Valuation per Industry',
    xaxis={'title': 'Industry', 'showgrid': False, 'tickangle': 330},
    yaxis={'title': 'Valuation Amount (Billion $)', 'showgrid': True, 'gridwidth': 1, 'gridcolor': 'lightgray'},
    template='plotly_white',
    showlegend=False,
)

fig.show()

In [41]:
# Build a long-format frame with one investor name per row: every other
# column becomes an index level, the 'Investors' string is split on ', '
# into separate columns, and stack() turns those columns back into rows.
df_investors = df.set_index(['Company', 'Valuation(in $ Billions)', 'UniDate', 'Industry', 'City', 'Country', 'Continent', 'FoundedOn', 'Funding(in $ Billions)', 'UniYear'])['Investors'].str.split(', ', expand=True).stack().reset_index()

# 'level_10' is the column reset_index() creates for the unnamed stacked level
# (it follows the ten named index levels); it is not needed afterwards.
# NOTE(review): this name depends on the number of columns passed to set_index above.
df_investors = df_investors.drop(columns=['level_10'])

# Restore readable labels; the last column holds the individual investor name.
df_investors.columns = ['Company', 'Valuation(in $ Billions)', 'UniDate', 'Industry', 'City', 'Country', 'Continent', 'FoundedOn', 'Funding(in $ Billions)', 'UniYear', 'Investors']

print(df_investors)

        Company  Valuation(in $ Billions)    UniDate  \
0     Bytedance                     180.0 2017-04-07   
1     Bytedance                     180.0 2017-04-07   
2     Bytedance                     180.0 2017-04-07   
3     Bytedance                     180.0 2017-04-07   
4        SpaceX                     100.0 2012-12-01   
...         ...                       ...        ...   
3041       Zopa                       1.0 2021-10-19   
3042       Zopa                       1.0 2021-10-19   
3043      Zwift                       1.0 2020-09-16   
3044      Zwift                       1.0 2020-09-16   
3045      Zwift                       1.0 2020-09-16   

                             Industry        City         Country  \
0             Artificial intelligence     Beijing           China   
1             Artificial intelligence     Beijing           China   
2             Artificial intelligence     Beijing           China   
3             Artificial intelligence     Beijing  

In [42]:
# Canonical investor name -> misspelled or differently formatted variants
# found in the data.
canonical_to_variants = {
    'Sequoia Capital China': ['and Sequoia Capital China'],
    'Anthemis': ['Anthermis'],
    'Benchmark': ['Benhcmark'],
    'Blackrock': ['BlackRock'],
    'Coatue Management': ['Coatue Managemeny'],
    'D1 Capital Partners': ['D1 Capita Partners'],
    'Index Ventures': ['index Ventures', 'IndexVentures'],
    'Tiger Global Management': ['Tiger Global management'],
    'Softbank Group': ['SoftBank Group', 'SoftBankGroup'],
}

# Invert the grouping into the variant -> canonical lookup used for replacement.
replacements = {
    variant: canonical
    for canonical, variants in canonical_to_variants.items()
    for variant in variants
}

# Replace each investor name that exactly matches a key of `replacements`.
df_investors['Investors'] = df_investors['Investors'].replace(to_replace=replacements)

In [56]:
# Count how often each investor appears and keep the ten most frequent.
investors_count = df_investors['Investors'].value_counts()

top_10_investors = (
    investors_count.head(10)
    .reset_index()
    .set_axis(['Investors', 'Frequency'], axis=1)
)

bar_options = {
    'x': 'Investors',
    'y': 'Frequency',
    'text': 'Frequency',
    'title': 'Top-10 Investors by No. of Companies Invested in',
    'labels': {'Investors': 'Investors', 'Frequency': 'Frequency'},
}
fig = px.bar(top_10_investors, **bar_options)

# Tilt the investor names and draw the counts inside the bars.
fig.update_layout(xaxis={'tickangle': 335}, template='plotly_white')
fig.update_traces(texttemplate='%{text}', textposition='inside')

# Render the chart.
fig.show()

In [46]:
# Sum the funding column per investor and keep the ten largest totals.
# NOTE(review): each row carries its company's whole funding figure, so a
# company's funding is presumably counted once for every one of its
# investors — confirm this is the intended meaning of "Total Funding".
funding_by_investor = df_investors.groupby('Investors')['Funding(in $ Billions)']
investor_funding = funding_by_investor.sum()

top_10_investors_funding = investor_funding.nlargest(10)

fig = px.line(
    top_10_investors_funding,
    x=top_10_investors_funding.index,
    y=top_10_investors_funding.values,
    labels={'x': 'Investors', 'y': 'Total Funding (in $ Billions)'},
    title='Top-10 Investors by Total Funding',
)

fig.update_traces(line={'color': 'green'})

layout_options = {
    'xaxis_tickangle': -45,
    'xaxis_title': 'Investors',
    'yaxis_title': 'Total Funding (in $ Billions)',
    'title_x': 0.5,
    'template': 'plotly_white',
}
fig.update_layout(**layout_options)

fig.show()

In [48]:
# Compute the ten highest valuations here so the cell runs on a fresh kernel;
# no earlier cell in the notebook defines `top_10_compny_by_valuation`.
# The valuation is summed per company name and the ten largest totals are
# kept, mirroring how the top-10 investors by funding are derived.
# NOTE(review): rows sharing a company name are added together — confirm
# that is acceptable for this dataset.
company_valuation = df.groupby('Company')['Valuation(in $ Billions)'].sum()
top_10_compny_by_valuation = company_valuation.nlargest(10)

# Bar chart of the ten companies, each bar labelled with its valuation.
fig = px.bar(
    top_10_compny_by_valuation,
    x=top_10_compny_by_valuation.index,
    y=top_10_compny_by_valuation.values,
    labels={'x': 'Company', 'y': 'Total Valuation (in $ Billions)'},
    title='Top-10 Company in terms of Valuation',
    text=top_10_compny_by_valuation.values,
)

fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title='Company',
    yaxis_title='Total Valuation (in $ Billions)',
    title_x=0.5,
    template='plotly_white'
)

fig.show()

In [49]:
# ROI = (valuation - funding) / funding.
# A company with 0 funding ends up with an infinite ROI.
funding = df['Funding(in $ Billions)']
valuation = df['Valuation(in $ Billions)']
df['ROI'] = (valuation - funding) / funding

In [50]:
# Ten companies with the highest ROI, best first.
roi_companies = (
    df.loc[:, ['Company', 'ROI']]
    .sort_values(by='ROI', ascending=False)
    .head(10)
)
print(f'Top-10 Companies in terms of ROI \n{roi_companies}')

Top-10 Companies in terms of ROI 
                  Company          ROI
189  Otto Bock HealthCare          inf
239                Zapier  3999.000000
61                 Dunamu   125.760563
792             Workhuman   110.111111
361                  CFGI   104.263158
743                Manner    99.000000
72        DJI Innovations    75.190476
760           GalaxySpace    70.428571
5                   Canva    68.930070
368            Il Makiage    67.965517


In [51]:
# Show the funding recorded for Otto Bock HealthCare (the infinite-ROI row).
df.loc[df['Company'] == 'Otto Bock HealthCare', 'Funding(in $ Billions)']

189    0.0
Name: Funding(in $ Billions), dtype: float64

## Otto Bock HealthCare has no recorded funding ($0), which is why its ROI is infinite