In [105]:
#import packages
import pandas as pd
import numpy as np
import datetime 
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px



In [106]:
#this cell ensures pandas displays all the columns
pd.set_option('display.max_columns', None)

In [107]:
companies = pd.read_csv('Modified_Unicorn_Companies.csv')

In [108]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95,2014-01-23,FinTech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


In [109]:
#display the datatypes
companies.dtypes

Company             object
Valuation            int64
Date Joined         object
Industry            object
City                object
Country/Region      object
Continent           object
Year Founded         int64
Funding             object
Select Investors    object
dtype: object

In [110]:
#convert `date` to `datetime` data type
companies['Date Joined'] = pd.to_datetime(companies['Date Joined'])

In [111]:
print(companies.dtypes)

Company                     object
Valuation                    int64
Date Joined         datetime64[ns]
Industry                    object
City                        object
Country/Region              object
Continent                   object
Year Founded                 int64
Funding                     object
Select Investors            object
dtype: object


In [112]:
#create new column `Year to Unicorn`
companies['Years To Unicorn'] = companies['Date Joined'].dt.year - companies['Year Founded'] 

In [113]:
#confirm the new column `Years To Unicorn` has been created and properly calculated
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10
3,Stripe,95,2014-01-23,FinTech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6


In [114]:
#summary statistics
companies.describe()


Unnamed: 0,Valuation,Date Joined,Year Founded,Years To Unicorn
count,1074.0,1074,1074.0,1074.0
mean,3.445996,2020-05-14 20:22:47.597765376,2012.870577,7.013035
min,1.0,2007-07-02 00:00:00,1919.0,-3.0
25%,1.0,2019-05-15 06:00:00,2011.0,4.0
50%,2.0,2021-03-26 00:00:00,2014.0,6.0
75%,3.0,2021-09-16 00:00:00,2016.0,9.0
max,180.0,2022-04-05 00:00:00,2021.0,98.0
std,8.544242,,5.705494,5.331842


In [None]:
#The `Years To Join` has a negative 3(-3) as the minimum number of years
#A company cannot become unicorn befor it was founded
#isolate the rows with negative `Year To Join` values

In [115]:
companies[companies['Years To Unicorn'] < 0]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
527,InVision,2,2017-11-01,Internet software & services,New York,United States,North America,2020,$349M,"FirstMark Capital, Tiger Global Management, IC...",-3


In [116]:
#replace the `Year Founded` with 2011 - confirmed from further research
companies.loc[companies['Company'] == 'InVision', 'Year Founded'] = 2011

In [117]:
#confirm the year has been replaced
companies[companies['Company'] == 'InVision']

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
527,InVision,2,2017-11-01,Internet software & services,New York,United States,North America,2011,$349M,"FirstMark Capital, Tiger Global Management, IC...",-3


In [78]:
#recalculate the `Years To Unicorn` to depct the changes(corrections)
companies['Years To Unicorn'] = companies['Date Joined'].dt.year - companies['Year Founded']

In [118]:
#descriptive statistics to confirm the change has been effected
companies['Years To Unicorn'].describe()

count    1074.000000
mean        7.013035
std         5.331842
min        -3.000000
25%         4.000000
50%         6.000000
75%         9.000000
max        98.000000
Name: Years To Unicorn, dtype: float64

In [119]:
#list of `industry` labels - categories
#list provided by the company of expected industry labels in the data
industry_list = ['Artificial intelligence', 'Other', 'E-commerce & direct-to-consumer' , 'Fintech', \
                 'Internet software & services', 'Supply chain, logistics, & delivery', \
                    'Consumer & retail', 'Data management & analytics', 'Edtech', 'Health', 'Hardware', \
                        'Auto & transportation', 'Travel', 'Cybersecurity', 'Mobile & telecommunications']



In [120]:
#check if there any values in the `Industry` column that are not in the provided list
set(companies['Industry']) - set(industry_list)


{'Artificial Intelligence', 'Data management and analytics', 'FinTech'}

In [None]:
#1. The entries are present with different spellings and formating

In [130]:
#correct the bad entries
#create a replacement dictionary
replacement_dict = {'Artificial Inteligence' : 'Artificial intelligence', \
                    'Artificial Intelligence' : 'Artificial intelligence', \
                    'Data management and analytics' : 'Data management & analytics', \
                        'FinTech' : 'Fintech'}

In [131]:
#replace the bad entries in the `Industry` column
companies['Industry'] = companies['Industry'].replace(replacement_dict)

In [132]:
#verify there are no more mismatching `Industry` entries
set(companies['Industry']) - set(industry_list)

set()

In [133]:
#check for duplicates specifically in the `Company` column and use `subset` parameter to indicate duplicates
#set `keep=False` to return all occurences
#companies['Company'].duplicated(keep=False)
companies[companies.duplicated(subset=['Company'], keep=False)]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
385,BrewDog,2,2017-04-10,Consumer & retail,Aberdeen,United Kingdom,Europe,2007,$233M,"TSG Consumer Partners, Crowdcube",10
386,BrewDog,2,2017-04-10,Consumer & retail,Aberdeen,UnitedKingdom,Europe,2007,$233M,TSG Consumer Partners,10
510,ZocDoc,2,2015-08-20,Health,New York,United States,North America,2007,$374M,"Founders Fund, Khosla Ventures, Goldman Sachs",8
511,ZocDoc,2,2015-08-20,Health,,United States,North America,2007,$374M,Founders Fund,8
1031,SoundHound,1,2018-05-03,Artificial intelligence,Santa Clara,United States,North America,2005,$215M,"Tencent Holdings, Walden Venture Capital, Glob...",13
1032,SoundHound,1,2018-05-03,Other,Santa Clara,United States,North America,2005,$215M,Tencent Holdings,13


In [134]:
#drop rows of the duplicates and keep the first row
companies = companies.drop_duplicates(subset=['Company'], keep='first')

In [135]:
#convert numerical data to categorical data
#create `High Valuation` column and divive the `Valuation` to `high` and `low` categories
#low valuation < 50, high valuation > 50
companies['High Valuation'] = pd.qcut(companies['Valuation'], 2, labels=['low', 'high'])

In [89]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5,high
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10,high
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10,high
3,Stripe,95,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4,high
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6,high


In [136]:
#Convert `Continent` to numeric
#rank the continents in descending order - hierchical importance
companies['Continent'].value_counts()

Continent
North America    586
Asia             310
Europe           143
South America     21
Oceania            8
Africa             3
Name: count, dtype: int64

In [None]:
#fewer unicorns might signify high untapped potential in those continents


In [137]:
#create a numeric `Continent Number` column - ordinal variables
continent_dict = {'North America': 1,
                  'Asia': 2,
                  'Europe': 3,
                  'South America': 4,
                  'Oceania' : 5,
                  'Africa' :6}
companies['Continent Number'] = companies['Continent'].replace(continent_dict)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [None]:
companies.head(50)

In [None]:
companies.tail(50)

In [141]:
#convert `Country/Region` to numeric - nominal variables
#create `Country/Region Numeric` column
companies['Country/Region Number'] = companies['Country/Region'].astype('category').cat.codes

In [139]:
#convert `Industry` to numeric - dummy variables
#create dummy variables with industry values
industry_encoded = pd.get_dummies(companies['Industry'], dtype=int)

#combine `companies` data frame  with the new dummy `Industry` columns
companies = pd.concat([companies, industry_encoded], axis=1)

In [140]:
#display the data frame
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation,Continent Number,Country/Region Number,Artificial intelligence,Auto & transportation,Consumer & retail,Cybersecurity,Data management & analytics,E-commerce & direct-to-consumer,Edtech,Fintech,Hardware,Health,Internet software & services,Mobile & telecommunications,Other,"Supply chain, logistics, & delivery",Travel
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5,high,2,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10,high,1,44,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10,high,2,9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Stripe,95,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4,high,1,44,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6,high,3,38,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [None]:
#conclusion
#1. Label encoding simplifies analysis and creates data useful for machine learning
#2. Label encoding may introduce unintended relationships between categorical data in a dataset
#3. Input validation is essintial for producing error free high quality dadaset for analysis
#4. There's no one-fit-for-all in input validation - each dataset requires unique methods for validation
#5.Label encoding is a case-by-case practise on each dataset