# Preprocessing

## Imports

In [20]:
import numpy as np
import pandas as pd
import posixpath
from info_vis import preprocessing, PROJECT_PATH, DATA_PATH, PREPROCESSED_DATA_PATH

## Load and Preprocess Data

### Unicorn Startups

In [2]:
filename = "World_Wide_Unicorn_Startups.csv"
unicorn_startups_df = pd.read_csv(px.join(DATA_PATH, filename))
unicorn_startups_df

Unnamed: 0,Company,Valuation,Date,Country,City,Industry,Investors,year,month,day
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence,"0 Sequoia Capital China, SIG Asia Investm...",2017,7,4
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other,"0 Sequoia Capital China, SIG Asia Investm...",2012,1,12
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech,"0 Sequoia Capital China, SIG Asia Investm...",2014,23,1
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech,"0 Sequoia Capital China, SIG Asia Investm...",2011,12,12
4,Canva,40.0,1/8/2018,Australia,Surry Hills,Internet software & services,"0 Sequoia Capital China, SIG Asia Investm...",2018,8,1
...,...,...,...,...,...,...,...,...,...,...
931,YipitData,1.0,12/6/2021,United States,New York,Internet software & services,"0 Sequoia Capital China, SIG Asia Investm...",2021,6,12
932,Anyscale,1.0,12/7/2021,United States,Berkeley,Artificial Intelligence,"0 Sequoia Capital China, SIG Asia Investm...",2021,7,12
933,Iodine Software,1.0,12/1/2021,United States,Austin,Data management & analytics,"0 Sequoia Capital China, SIG Asia Investm...",2021,1,12
934,ReliaQuest,1.0,12/1/2021,United States,Tampa,Cybersecurity,"0 Sequoia Capital China, SIG Asia Investm...",2021,1,12


#### Drop wanted columns

In [3]:
unicorn_startups_df = unicorn_startups_df.drop(columns=["Investors", "month", "day", "year"])
unicorn_startups_df

Unnamed: 0,Company,Valuation,Date,Country,City,Industry
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech
4,Canva,40.0,1/8/2018,Australia,Surry Hills,Internet software & services
...,...,...,...,...,...,...
931,YipitData,1.0,12/6/2021,United States,New York,Internet software & services
932,Anyscale,1.0,12/7/2021,United States,Berkeley,Artificial Intelligence
933,Iodine Software,1.0,12/1/2021,United States,Austin,Data management & analytics
934,ReliaQuest,1.0,12/1/2021,United States,Tampa,Cybersecurity


#### 1. Change date format into yyyy
#### 2. Change Date column name into Year
#### 3. Sort by (year, valuation), in ascending and descending orders respectivly

In [4]:
unicorn_startups_df["Date"] = unicorn_startups_df["Date"].apply(lambda x: int(x[-4:]))
unicorn_startups_df = unicorn_startups_df.rename(columns={'Date': 'Year'})
unicorn_startups_df = unicorn_startups_df.sort_values(by=["Year", "Valuation"], ascending=[True, False]).reset_index(drop=True)
unicorn_startups_df

Unnamed: 0,Company,Valuation,Year,Country,City,Industry
0,Veepee,1.38,2007,France,La Plaine Saint-Denis,E-commerce & direct-to-consumer
1,VANCL,3.00,2010,China,Beijing,E-commerce & direct-to-consumer
2,Klarna,45.60,2011,Sweden,Stockholm,Fintech
3,Vice Media,5.70,2011,United States,Brooklyn,Internet software & services
4,SpaceX,100.30,2012,United States,Hawthorne,Other
...,...,...,...,...,...,...
931,YipitData,1.00,2021,United States,New York,Internet software & services
932,Anyscale,1.00,2021,United States,Berkeley,Artificial Intelligence
933,Iodine Software,1.00,2021,United States,Austin,Data management & analytics
934,ReliaQuest,1.00,2021,United States,Tampa,Cybersecurity


#### Clean data
1. Fill missing data (singapore and Hong Kong's city column is missing, take the capital by default)

In [5]:
mask = unicorn_startups_df['City'].isna()
unicorn_startups_df.loc[mask, 'City'] = unicorn_startups_df.loc[mask, 'Country']
unicorn_startups_df.isna().sum()

Company      0
Valuation    0
Year         0
Country      0
City         0
Industry     0
dtype: int64

#### Extract year range and existing countries

In [6]:
unicorn_startups_years = set(unicorn_startups_df["Year"])
unicorn_startups_countries = set(unicorn_startups_df["Country"])
print(unicorn_startups_years)
print(unicorn_startups_countries)

{2016, 2017, 2018, 2019, 2020, 2021, 2007, 2010, 2011, 2012, 2013, 2014, 2015}
{'Mexico', 'United States', 'United Arab Emirates', 'Croatia', 'Bermuda', 'Philippines', 'Singapore', 'Ireland', 'Luxembourg', 'Switzerland', 'China', 'South Africa', 'Germany', 'Denmark', 'Vietnam', 'Spain', 'India', 'Australia', 'France', 'Argentina', 'Nigeria', 'Sweden', 'Israel', 'Norway', 'Indonesia', 'South Korea', 'Santa Clara', 'Colombia', 'Finland', 'Chile', 'Czech Republic', 'Netherlands', 'Thailand', 'Turkey', 'Indonesia,', 'United States,', 'Malaysia', 'Japan', 'Canada', 'Austria', 'Brazil', 'United Kingdom', 'Lithuania', 'Belgium', 'Senegal', 'Hong Kong', 'Estonia'}


### Quality Of Life

In [7]:
filename = "QOL.csv"
qol_df = pd.read_csv(px.join(DATA_PATH, filename))
qol_df

Unnamed: 0,Rank,Country,Quality of Life Index,Purchasing Power Index,Safety Index,Health Care Index,Cost of Living Index,Property Price to Income Ratio,Traffic Commute Time Index,Pollution Index,Climate Index,Year
0,1,Switzerland,190.8,111.0,78.7,74.5,131.7,8.4,28.7,20.1,80.0,2021
1,2,Denmark,190.0,94.7,73.3,80.0,91.7,6.7,28.7,20.4,81.8,2021
2,3,Netherlands,183.3,83.9,72.8,75.8,78.6,7.4,27.8,25.3,87.1,2021
3,4,Finland,182.8,89.1,73.0,76.4,77.5,8.6,29.0,11.9,56.6,2021
4,5,Austria,182.4,78.2,74.8,78.4,75.5,10.4,25.7,19.2,77.8,2021
...,...,...,...,...,...,...,...,...,...,...,...,...
498,82,Bangladesh,-5.4,33.4,35.8,43.3,39.2,10.9,58.0,93.6,-,2015
499,83,Egypt,-7.1,25.8,39.9,54.7,37.2,11.3,56.9,96.6,-,2015
500,84,Vietnam,-19.5,21.3,47.7,34.3,41.9,23.9,36.4,84.3,-,2015
501,85,Mongolia,-35.7,23.5,29.7,28.3,52.3,18.6,36.9,94.6,-,2015


#### Drop wanted columns

In [8]:
qol_df = qol_df.drop(columns=["Rank", "Safety Index", "Health Care Index", "Traffic Commute Time Index", "Pollution Index", "Climate Index"])
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year
0,Switzerland,190.8,111.0,131.7,8.4,2021
1,Denmark,190.0,94.7,91.7,6.7,2021
2,Netherlands,183.3,83.9,78.6,7.4,2021
3,Finland,182.8,89.1,77.5,8.6,2021
4,Austria,182.4,78.2,75.5,10.4,2021
...,...,...,...,...,...,...
498,Bangladesh,-5.4,33.4,39.2,10.9,2015
499,Egypt,-7.1,25.8,37.2,11.3,2015
500,Vietnam,-19.5,21.3,41.9,23.9,2015
501,Mongolia,-35.7,23.5,52.3,18.6,2015


#### Sort by (year, country) in ascending order

In [9]:
qol_df = qol_df.sort_values(by=["Year", "Country"]).reset_index(drop=True)
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year
0,Argentina,77.0,59.4,67.1,11.6,2015
1,Armenia,49.1,27.7,40.7,13.4,2015
2,Australia,180.8,110.4,99.3,7.1,2015
3,Austria,182.6,104.6,76.9,9.6,2015
4,Bahrain,84.1,59.7,56.2,7.9,2015
...,...,...,...,...,...,...
498,United Arab Emirates,156.0,85.7,61.7,4.9,2021
499,United Kingdom,159.0,82.6,71.0,9.6,2021
500,United States,167.0,102.6,71.9,4.0,2021
501,Uruguay,124.6,30.6,51.1,17.3,2021


#### Extract year range and existing countries

In [10]:
qol_years = set(qol_df["Year"])
qol_countries = set(qol_df["Country"])
print(qol_years)
print(qol_countries)

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Jordan', 'United Arab Emirates', 'Bolivia', 'Singapore', 'Serbia', 'Kazakhstan', 'Russia', 'South Africa', 'Egypt', 'France', 'Israel', 'Finland', 'Morocco', 'Kenya', 'Uruguay', 'Malaysia', 'Canada', 'Japan', 'Mongolia', 'Turkmenistan', 'Romania', 'Bahrain', 'Cyprus', 'Saudi Arabia', 'Latvia', 'Ireland', 'Denmark', 'Vietnam', 'Indonesia', 'Bangladesh', 'Poland', 'Chile', 'Thailand', 'Armenia', 'Brazil', 'Italy', 'Lebanon', 'Azerbaijan', 'Oman', 'Bulgaria', 'Portugal', 'Switzerland', 'Iran', 'Germany', 'Pakistan', 'Ecuador', 'Cambodia', 'Sweden', 'Puerto Rico', 'Czech Republic', 'South Korea', 'Peru', 'New Zealand', 'Netherlands', 'Turkey', 'Moldova', 'Panama', 'Greece', 'Ukraine', 'Costa Rica', 'Austria', 'Sri Lanka', 'United Kingdom', 'Hungary', 'Taiwan', 'Belgium', 'Iceland', 'Estonia', 'Slovenia', 'Bosnia And Herzegovina', 'Mexico', 'United States', 'Croatia', 'Georgia', 'Philippines', 'Venezuela', 'Belarus', 'Kuwait', 'China', 'Spain', '

### GDP

In [11]:
filename = "GDP.csv"
gdp_df = pd.read_csv(px.join(DATA_PATH, filename))
gdp_df

Unnamed: 0,Country/Area,Year,Unit,"GDP, Per Capita GDP - US Dollars"
0,Afghanistan,2015,US$,552.722397
1,Afghanistan,2016,US$,525.188137
2,Afghanistan,2017,US$,533.339054
3,Afghanistan,2018,US$,513.194331
4,Afghanistan,2019,US$,511.736489
...,...,...,...,...
1689,Zimbabwe,2017,US$,1487.978497
1690,Zimbabwe,2018,US$,1572.738770
1691,Zimbabwe,2019,US$,1479.534798
1692,Zimbabwe,2020,US$,1395.305090


#### Drop unwanted columns

In [12]:
gdp_df = gdp_df.drop(columns=["Unit"])
gdp_df

Unnamed: 0,Country/Area,Year,"GDP, Per Capita GDP - US Dollars"
0,Afghanistan,2015,552.722397
1,Afghanistan,2016,525.188137
2,Afghanistan,2017,533.339054
3,Afghanistan,2018,513.194331
4,Afghanistan,2019,511.736489
...,...,...,...
1689,Zimbabwe,2017,1487.978497
1690,Zimbabwe,2018,1572.738770
1691,Zimbabwe,2019,1479.534798
1692,Zimbabwe,2020,1395.305090


In [13]:
gdp_df = gdp_df.rename(columns={'GDP, Per Capita GDP - US Dollars': 'GDP Per Capita', "Country/Area": "Country"})
gdp_df

Unnamed: 0,Country,Year,GDP Per Capita
0,Afghanistan,2015,552.722397
1,Afghanistan,2016,525.188137
2,Afghanistan,2017,533.339054
3,Afghanistan,2018,513.194331
4,Afghanistan,2019,511.736489
...,...,...,...
1689,Zimbabwe,2017,1487.978497
1690,Zimbabwe,2018,1572.738770
1691,Zimbabwe,2019,1479.534798
1692,Zimbabwe,2020,1395.305090


#### Sort by (year, Country) in ascending order

In [14]:
gdp_df = gdp_df.sort_values(by=["Year", "Country"]).reset_index(drop=True)
gdp_df

Unnamed: 0,Country,Year,GDP Per Capita
0,Afghanistan,2015,552.722397
1,Africa,2015,1983.750763
2,Albania,2015,3928.362400
3,Algeria,2015,4147.453064
4,Americas,2015,25903.772991
...,...,...,...
1689,Western Europe,2021,51910.529680
1690,World,2021,12313.588733
1691,Yemen,2021,243.188488
1692,Zambia,2021,1127.523151


#### Extract year range and existing countries

In [15]:
gdp_years = set(gdp_df["Year"])
gdp_countries = set(gdp_df["Country"])
print(gdp_years)
print(gdp_countries)

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Antigua and Barbuda', 'Australia and New Zealand', 'French Polynesia', 'Jordan', 'World', 'United Arab Emirates', 'Uzbekistan', 'Czechia', 'Republic of North Macedonia', 'Anguilla', 'Bolivia (Plurinational State of)', 'Gambia', 'United Kingdom of Great Britain and Northern Ireland', 'Viet Nam', 'Libya', 'San Marino', 'Albania', 'Singapore', 'Mauritius', 'Western Asia', 'Guinea-Bissau', 'Serbia', 'Kazakhstan', 'South Africa', 'Western Europe', 'Egypt', 'France', 'Niger', 'Northern Africa', 'Israel', 'Algeria', 'Uganda', 'Finland', 'Cuba', 'Ghana', 'Kyrgyzstan', 'Morocco', 'Chad', 'Kenya', 'Europe', 'Uruguay', 'Saint Kitts and Nevis', 'United Republic of Tanzania: Zanzibar', 'El Salvador', 'Malaysia', 'Canada', 'Japan', 'Liechtenstein', 'Madagascar', 'Samoa', 'Democratic Republic of the Congo', 'Mongolia', 'Southern Europe', 'Honduras', 'Lesotho', 'Turkmenistan', 'Romania', 'Senegal', 'Belize', 'Malawi', 'Sub-Saharan Africa', 'Bahrain', 'Liber

## Find commun years and countries among the 3 datasets and reduce them accordingly

In [16]:
common_years = gdp_years.intersection(qol_years.intersection(unicorn_startups_years))
common_countries = gdp_countries.intersection(qol_countries.intersection(unicorn_startups_countries))

print(common_years)
print(common_countries)
print(len(common_years))
print(len(common_countries))

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Mexico', 'United Arab Emirates', 'United States', 'Croatia', 'Philippines', 'Singapore', 'Ireland', 'Switzerland', 'South Africa', 'Germany', 'Denmark', 'Spain', 'India', 'Australia', 'France', 'Argentina', 'Nigeria', 'Sweden', 'Israel', 'Norway', 'Indonesia', 'Chile', 'Colombia', 'Finland', 'Netherlands', 'Thailand', 'Malaysia', 'Japan', 'Canada', 'Austria', 'Brazil', 'Lithuania', 'Belgium', 'Estonia'}
7
34


In [17]:
unicorn_startups_df = unicorn_startups_df[unicorn_startups_df["Year"].isin(common_years) & unicorn_startups_df["Country"].isin(common_countries)].reset_index(drop=True)
qol_df = qol_df[qol_df["Year"].isin(common_years) & qol_df["Country"].isin(common_countries)].reset_index(drop=True)
gdp_df = gdp_df[gdp_df["Year"].isin(common_years) & gdp_df["Country"].isin(common_countries)].reset_index(drop=True)
print(set(unicorn_startups_df["Year"]))
print(set(qol_df["Year"]))
print(set(gdp_df["Year"]))
print(set(unicorn_startups_df["Country"]))
print(set(qol_df["Country"]))
print(set(gdp_df["Country"]))
print(len(set(unicorn_startups_df["Country"])))
print(len(set(qol_df["Country"])))
print(len(set(gdp_df["Country"])))

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Mexico', 'United States', 'United Arab Emirates', 'Croatia', 'Philippines', 'Singapore', 'Ireland', 'Switzerland', 'South Africa', 'Germany', 'Denmark', 'Spain', 'India', 'Australia', 'France', 'Argentina', 'Nigeria', 'Sweden', 'Israel', 'Norway', 'Indonesia', 'Chile', 'Colombia', 'Finland', 'Netherlands', 'Thailand', 'Malaysia', 'Japan', 'Canada', 'Austria', 'Brazil', 'Lithuania', 'Belgium', 'Estonia'}
{'Mexico', 'United Arab Emirates', 'Croatia', 'United States', 'Philippines', 'Ireland', 'Singapore', 'Switzerland', 'South Africa', 'Germany', 'Denmark', 'Spain', 'Australia', 'India', 'France', 'Argentina', 'Nigeria', 'Sweden', 'Israel', 'Norway', 'Indonesia', 'Chile', 'Colombia', 'Finland', 'Netherlands', 'Thailand', 'Malaysia', 'Canada', 'Japan', 'Austria', 'Brazil', 'Lithuania', 'Belgium', 'Estonia'}
{'Mexico', 'United Arab Emirates', 'Croatia', 'United

## Merge GDP and Quality of Life data into one dataset
#### The Quality of Life dataset now contains the GDP per Capita data

In [18]:
qol_df["GDP Per Capita"] = gdp_df["GDP Per Capita"]
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year,GDP Per Capita
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.199680
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.673570
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589
...,...,...,...,...,...,...,...
228,Sweden,171.4,90.6,79.2,8.6,2021,93076.833587
229,Switzerland,190.8,111.0,131.7,8.4,2021,3484.385958
230,Thailand,100.3,31.4,49.3,22.2,2021,78270.600992
231,United Arab Emirates,156.0,85.7,61.7,4.9,2021,6830.894725


### Drop countries that are not present in all years

In [19]:
year_counts = qol_df.groupby("Country")["Year"].nunique()
countries = year_counts[year_counts == len(set(common_years))].index
qol_df = qol_df[qol_df["Country"].isin(countries)].reset_index(drop=True)
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year,GDP Per Capita
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.199680
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.673570
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589
...,...,...,...,...,...,...,...
219,Sweden,171.4,90.6,79.2,8.6,2021,93076.833587
220,Switzerland,190.8,111.0,131.7,8.4,2021,3484.385958
221,Thailand,100.3,31.4,49.3,22.2,2021,78270.600992
222,United Arab Emirates,156.0,85.7,61.7,4.9,2021,6830.894725


## Save Data in CSV format

In [21]:
filename = "World_Wide_Unicorn_Startups.csv"
unicorn_startups_df.to_csv(posixpath.join(PREPROCESSED_DATA_PATH, filename), index=False)
unicorn_startups_df = pd.read_csv(px.join(PREPROCESSED_DATA_PATH, filename))
unicorn_startups_df

Unnamed: 0,Company,Valuation,Year,Country,City,Industry
0,Gusto,10.00,2015,United States,San Francisco,Fintech
1,Tanium,9.00,2015,United States,Kirkland,Cybersecurity
2,Zenefits,4.50,2015,United States,San Francisco,Fintech
3,Thumbtack,3.20,2015,United States,San Francisco,E-commerce & direct-to-consumer
4,Illumio,2.75,2015,United States,Sunnyvale,Cybersecurity
...,...,...,...,...,...,...
678,YipitData,1.00,2021,United States,New York,Internet software & services
679,Anyscale,1.00,2021,United States,Berkeley,Artificial Intelligence
680,Iodine Software,1.00,2021,United States,Austin,Data management & analytics
681,ReliaQuest,1.00,2021,United States,Tampa,Cybersecurity


In [22]:
filename = "QOL.csv"
qol_df.to_csv(posixpath.join(PREPROCESSED_DATA_PATH, filename), index=False)
qol_df = pd.read_csv(px.join(PREPROCESSED_DATA_PATH, filename))
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year,GDP Per Capita
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.199680
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.673570
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589
...,...,...,...,...,...,...,...
219,Sweden,171.4,90.6,79.2,8.6,2021,93076.833587
220,Switzerland,190.8,111.0,131.7,8.4,2021,3484.385958
221,Thailand,100.3,31.4,49.3,22.2,2021,78270.600992
222,United Arab Emirates,156.0,85.7,61.7,4.9,2021,6830.894725
