# Preprocessing

## Imports

In [1]:
import numpy as np
import pandas as pd
import posixpath
import pycountry
import geopandas as gpd
import json
import pyproj

from info_vis import preprocessing, PROJECT_PATH, DATA_PATH, PREPROCESSED_DATA_PATH

## Load Data

### Unicorn Startups

In [2]:
filename = "World_Wide_Unicorn_Startups.csv"
unicorn_startups_df = pd.read_csv(posixpath.join(DATA_PATH, filename))
unicorn_startups_df

Unnamed: 0,Company,Valuation,Date,Country,City,Industry,Investors,year,month,day
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence,"0 Sequoia Capital China, SIG Asia Investm...",2017,7,4
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other,"0 Sequoia Capital China, SIG Asia Investm...",2012,1,12
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech,"0 Sequoia Capital China, SIG Asia Investm...",2014,23,1
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech,"0 Sequoia Capital China, SIG Asia Investm...",2011,12,12
4,Canva,40.0,1/8/2018,Australia,Surry Hills,Internet software & services,"0 Sequoia Capital China, SIG Asia Investm...",2018,8,1
...,...,...,...,...,...,...,...,...,...,...
931,YipitData,1.0,12/6/2021,United States,New York,Internet software & services,"0 Sequoia Capital China, SIG Asia Investm...",2021,6,12
932,Anyscale,1.0,12/7/2021,United States,Berkeley,Artificial Intelligence,"0 Sequoia Capital China, SIG Asia Investm...",2021,7,12
933,Iodine Software,1.0,12/1/2021,United States,Austin,Data management & analytics,"0 Sequoia Capital China, SIG Asia Investm...",2021,1,12
934,ReliaQuest,1.0,12/1/2021,United States,Tampa,Cybersecurity,"0 Sequoia Capital China, SIG Asia Investm...",2021,1,12


### Quality Of Life

In [3]:
filename = "QOL.csv"
qol_df = pd.read_csv(posixpath.join(DATA_PATH, filename))
qol_df

Unnamed: 0,Rank,Country,Quality of Life Index,Purchasing Power Index,Safety Index,Health Care Index,Cost of Living Index,Property Price to Income Ratio,Traffic Commute Time Index,Pollution Index,Climate Index,Year
0,1,Switzerland,190.8,111.0,78.7,74.5,131.7,8.4,28.7,20.1,80.0,2021
1,2,Denmark,190.0,94.7,73.3,80.0,91.7,6.7,28.7,20.4,81.8,2021
2,3,Netherlands,183.3,83.9,72.8,75.8,78.6,7.4,27.8,25.3,87.1,2021
3,4,Finland,182.8,89.1,73.0,76.4,77.5,8.6,29.0,11.9,56.6,2021
4,5,Austria,182.4,78.2,74.8,78.4,75.5,10.4,25.7,19.2,77.8,2021
...,...,...,...,...,...,...,...,...,...,...,...,...
498,82,Bangladesh,-5.4,33.4,35.8,43.3,39.2,10.9,58.0,93.6,-,2015
499,83,Egypt,-7.1,25.8,39.9,54.7,37.2,11.3,56.9,96.6,-,2015
500,84,Vietnam,-19.5,21.3,47.7,34.3,41.9,23.9,36.4,84.3,-,2015
501,85,Mongolia,-35.7,23.5,29.7,28.3,52.3,18.6,36.9,94.6,-,2015


### GDP

In [4]:
filename = "GDP.csv"
gdp_df = pd.read_csv(posixpath.join(DATA_PATH, filename))
gdp_df

Unnamed: 0,Country/Area,Year,Unit,"GDP, Per Capita GDP - US Dollars"
0,Afghanistan,2015,US$,552.722397
1,Afghanistan,2016,US$,525.188137
2,Afghanistan,2017,US$,533.339054
3,Afghanistan,2018,US$,513.194331
4,Afghanistan,2019,US$,511.736489
...,...,...,...,...
1689,Zimbabwe,2017,US$,1487.978497
1690,Zimbabwe,2018,US$,1572.738770
1691,Zimbabwe,2019,US$,1479.534798
1692,Zimbabwe,2020,US$,1395.305090


## Preprocess Data

### Unicorn Startups

#### Drop wanted columns

In [5]:
unicorn_startups_df = unicorn_startups_df.drop(columns=["Investors", "month", "day", "year"])
unicorn_startups_df

Unnamed: 0,Company,Valuation,Date,Country,City,Industry
0,Bytedance,140.0,4/7/2017,China,Beijing,Artificial intelligence
1,SpaceX,100.3,12/1/2012,United States,Hawthorne,Other
2,Stripe,95.0,1/23/2014,United States,San Francisco,Fintech
3,Klarna,45.6,12/12/2011,Sweden,Stockholm,Fintech
4,Canva,40.0,1/8/2018,Australia,Surry Hills,Internet software & services
...,...,...,...,...,...,...
931,YipitData,1.0,12/6/2021,United States,New York,Internet software & services
932,Anyscale,1.0,12/7/2021,United States,Berkeley,Artificial Intelligence
933,Iodine Software,1.0,12/1/2021,United States,Austin,Data management & analytics
934,ReliaQuest,1.0,12/1/2021,United States,Tampa,Cybersecurity


#### 1. Change date format into yyyy
#### 2. Change Date column name into Year
#### 3. Sort by (year, valuation), in ascending and descending orders respectivly

In [6]:
unicorn_startups_df["Date"] = unicorn_startups_df["Date"].apply(lambda x: int(x[-4:]))
unicorn_startups_df = unicorn_startups_df.rename(columns={'Date': 'Year'})
unicorn_startups_df = unicorn_startups_df.sort_values(by=["Year", "Valuation"], ascending=[True, False]).reset_index(drop=True)
unicorn_startups_df

Unnamed: 0,Company,Valuation,Year,Country,City,Industry
0,Veepee,1.38,2007,France,La Plaine Saint-Denis,E-commerce & direct-to-consumer
1,VANCL,3.00,2010,China,Beijing,E-commerce & direct-to-consumer
2,Klarna,45.60,2011,Sweden,Stockholm,Fintech
3,Vice Media,5.70,2011,United States,Brooklyn,Internet software & services
4,SpaceX,100.30,2012,United States,Hawthorne,Other
...,...,...,...,...,...,...
931,YipitData,1.00,2021,United States,New York,Internet software & services
932,Anyscale,1.00,2021,United States,Berkeley,Artificial Intelligence
933,Iodine Software,1.00,2021,United States,Austin,Data management & analytics
934,ReliaQuest,1.00,2021,United States,Tampa,Cybersecurity


#### Clean data
1. Fill missing data (singapore and Hong Kong's city column is missing, take the capital by default)

In [7]:
mask = unicorn_startups_df['City'].isna()
unicorn_startups_df.loc[mask, 'City'] = unicorn_startups_df.loc[mask, 'Country']
unicorn_startups_df.isna().sum()

Company      0
Valuation    0
Year         0
Country      0
City         0
Industry     0
dtype: int64

#### Extract year range and existing countries

In [8]:
unicorn_startups_years = set(unicorn_startups_df["Year"])
unicorn_startups_countries = set(unicorn_startups_df["Country"])
print(unicorn_startups_years)
print(unicorn_startups_countries)

{2016, 2017, 2018, 2019, 2020, 2021, 2007, 2010, 2011, 2012, 2013, 2014, 2015}
{'Brazil', 'Czech Republic', 'Argentina', 'Malaysia', 'Switzerland', 'Denmark', 'Norway', 'Mexico', 'Nigeria', 'Australia', 'Canada', 'South Africa', 'Chile', 'United Arab Emirates', 'United States', 'Senegal', 'Israel', 'Philippines', 'India', 'United States,', 'Finland', 'Spain', 'Luxembourg', 'Japan', 'Lithuania', 'Santa Clara', 'Belgium', 'Austria', 'Indonesia,', 'Colombia', 'South Korea', 'United Kingdom', 'Sweden', 'Indonesia', 'Turkey', 'China', 'Thailand', 'Netherlands', 'Estonia', 'Croatia', 'France', 'Singapore', 'Bermuda', 'Vietnam', 'Germany', 'Ireland', 'Hong Kong'}


### Quality of Life

#### Drop wanted columns

In [9]:
qol_df = qol_df.drop(columns=["Rank", "Safety Index", "Health Care Index", "Traffic Commute Time Index", "Pollution Index", "Climate Index"])
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year
0,Switzerland,190.8,111.0,131.7,8.4,2021
1,Denmark,190.0,94.7,91.7,6.7,2021
2,Netherlands,183.3,83.9,78.6,7.4,2021
3,Finland,182.8,89.1,77.5,8.6,2021
4,Austria,182.4,78.2,75.5,10.4,2021
...,...,...,...,...,...,...
498,Bangladesh,-5.4,33.4,39.2,10.9,2015
499,Egypt,-7.1,25.8,37.2,11.3,2015
500,Vietnam,-19.5,21.3,41.9,23.9,2015
501,Mongolia,-35.7,23.5,52.3,18.6,2015


#### Sort by (year, country) in ascending order

In [10]:
qol_df = qol_df.sort_values(by=["Year", "Country"]).reset_index(drop=True)
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year
0,Argentina,77.0,59.4,67.1,11.6,2015
1,Armenia,49.1,27.7,40.7,13.4,2015
2,Australia,180.8,110.4,99.3,7.1,2015
3,Austria,182.6,104.6,76.9,9.6,2015
4,Bahrain,84.1,59.7,56.2,7.9,2015
...,...,...,...,...,...,...
498,United Arab Emirates,156.0,85.7,61.7,4.9,2021
499,United Kingdom,159.0,82.6,71.0,9.6,2021
500,United States,167.0,102.6,71.9,4.0,2021
501,Uruguay,124.6,30.6,51.1,17.3,2021


#### Extract year range and existing countries

In [11]:
qol_years = set(qol_df["Year"])
qol_countries = set(qol_df["Country"])
print(qol_years)
print(qol_countries)

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Malaysia', 'Switzerland', 'Nigeria', 'Pakistan', 'Canada', 'United Arab Emirates', 'Bulgaria', 'United States', 'North Macedonia', 'India', 'Finland', 'Bahrain', 'Spain', 'Lithuania', 'Slovenia', 'Ecuador', 'Kuwait', 'Poland', 'Morocco', 'Romania', 'Sri Lanka', 'China', 'Thailand', 'Qatar', 'Kazakhstan', 'Russia', 'Argentina', 'Bosnia And Herzegovina', 'Mexico', 'Uruguay', 'Puerto Rico', 'Israel', 'Latvia', 'Costa Rica', 'Cyprus', 'Iceland', 'Belgium', 'Armenia', 'Egypt', 'Bolivia', 'Indonesia', 'Estonia', 'Germany', 'Ireland', 'Hong Kong', 'Norway', 'Hungary', 'Australia', 'Bangladesh', 'South Africa', 'Turkmenistan', 'Saudi Arabia', 'Georgia', 'Philippines', 'Serbia', 'Kenya', 'Venezuela', 'South Korea', 'United Kingdom', 'Sweden', 'Turkey', 'Lebanon', 'Netherlands', 'Belarus', 'Peru', 'Greece', 'Moldova', 'Singapore', 'Mongolia', 'Brazil', 'Czech Republic', 'Dominican Republic', 'Denmark', 'Oman', 'Chile', 'Ukraine', 'Portugal', 'Azerbaij

### GDP

#### Drop unwanted columns

In [12]:
gdp_df = gdp_df.drop(columns=["Unit"])
gdp_df

Unnamed: 0,Country/Area,Year,"GDP, Per Capita GDP - US Dollars"
0,Afghanistan,2015,552.722397
1,Afghanistan,2016,525.188137
2,Afghanistan,2017,533.339054
3,Afghanistan,2018,513.194331
4,Afghanistan,2019,511.736489
...,...,...,...
1689,Zimbabwe,2017,1487.978497
1690,Zimbabwe,2018,1572.738770
1691,Zimbabwe,2019,1479.534798
1692,Zimbabwe,2020,1395.305090


In [13]:
gdp_df = gdp_df.rename(columns={'GDP, Per Capita GDP - US Dollars': 'GDP Per Capita', "Country/Area": "Country"})
gdp_df

Unnamed: 0,Country,Year,GDP Per Capita
0,Afghanistan,2015,552.722397
1,Afghanistan,2016,525.188137
2,Afghanistan,2017,533.339054
3,Afghanistan,2018,513.194331
4,Afghanistan,2019,511.736489
...,...,...,...
1689,Zimbabwe,2017,1487.978497
1690,Zimbabwe,2018,1572.738770
1691,Zimbabwe,2019,1479.534798
1692,Zimbabwe,2020,1395.305090


#### Sort by (year, Country) in ascending order

In [14]:
gdp_df = gdp_df.sort_values(by=["Year", "Country"]).reset_index(drop=True)
gdp_df

Unnamed: 0,Country,Year,GDP Per Capita
0,Afghanistan,2015,552.722397
1,Africa,2015,1983.750763
2,Albania,2015,3928.362400
3,Algeria,2015,4147.453064
4,Americas,2015,25903.772991
...,...,...,...
1689,Western Europe,2021,51910.529680
1690,World,2021,12313.588733
1691,Yemen,2021,243.188488
1692,Zambia,2021,1127.523151


#### Extract year range and existing countries

In [15]:
gdp_years = set(gdp_df["Year"])
gdp_countries = set(gdp_df["Country"])
print(gdp_years)
print(gdp_countries)

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Mauritania', 'Malaysia', 'Mozambique', 'Switzerland', 'Uzbekistan', 'Africa', 'Nigeria', 'Pakistan', 'Canada', 'Antigua and Barbuda', 'United Arab Emirates', 'Bulgaria', 'Senegal', 'Haiti', 'Maldives', 'United States', 'Jamaica', 'Timor-Leste', 'Namibia', 'Kosovo', 'Gabon', 'India', 'Finland', 'Bahrain', 'Luxembourg', 'Saint Lucia', 'Afghanistan', 'French Polynesia', 'Eastern Africa', 'Kingdom of Eswatini', 'Lithuania', 'Slovenia', 'Spain', 'Ecuador', 'Kuwait', 'Poland', 'Algeria', 'Tajikistan', 'Morocco', 'San Marino', 'Yemen', 'Northern America', 'Caribbean', 'Southern Asia', 'Malta', 'Romania', 'Bosnia and Herzegovina', 'Sri Lanka', 'Equatorial Guinea', 'Thailand', 'Guatemala', 'Guinea', 'Trinidad and Tobago', 'Fiji', 'El Salvador', 'Republic of Korea', 'Qatar', 'Latin America and the Caribbean', 'Kazakhstan', 'Congo', 'Montserrat', 'Guinea-Bissau', 'Northern Africa', 'Polynesia', 'Argentina', 'Central America', 'Brunei Darussalam', 'Beni

## Find commun years and countries among the 3 datasets and reduce them accordingly

In [16]:
common_years = gdp_years.intersection(qol_years.intersection(unicorn_startups_years))
common_countries = gdp_countries.intersection(qol_countries.intersection(unicorn_startups_countries))

print(common_years)
print(common_countries)
print(len(common_years))
print(len(common_countries))

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Brazil', 'Argentina', 'Malaysia', 'Switzerland', 'Denmark', 'Norway', 'Mexico', 'Nigeria', 'Australia', 'Canada', 'South Africa', 'Chile', 'United Arab Emirates', 'United States', 'Israel', 'Philippines', 'India', 'Finland', 'Spain', 'Japan', 'Lithuania', 'Belgium', 'Austria', 'Colombia', 'Sweden', 'Indonesia', 'Thailand', 'Netherlands', 'Estonia', 'Croatia', 'France', 'Singapore', 'Germany', 'Ireland'}
7
34


In [17]:
unicorn_startups_df = unicorn_startups_df[unicorn_startups_df["Year"].isin(common_years) & unicorn_startups_df["Country"].isin(common_countries)].reset_index(drop=True)
qol_df = qol_df[qol_df["Year"].isin(common_years) & qol_df["Country"].isin(common_countries)].reset_index(drop=True)
gdp_df = gdp_df[gdp_df["Year"].isin(common_years) & gdp_df["Country"].isin(common_countries)].reset_index(drop=True)
print(set(unicorn_startups_df["Year"]))
print(set(qol_df["Year"]))
print(set(gdp_df["Year"]))
print(set(unicorn_startups_df["Country"]))
print(set(qol_df["Country"]))
print(set(gdp_df["Country"]))
print(len(set(unicorn_startups_df["Country"])))
print(len(set(qol_df["Country"])))
print(len(set(gdp_df["Country"])))

{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{2016, 2017, 2018, 2019, 2020, 2021, 2015}
{'Brazil', 'Argentina', 'Malaysia', 'Switzerland', 'Denmark', 'Norway', 'Mexico', 'Nigeria', 'Australia', 'Canada', 'South Africa', 'Chile', 'United Arab Emirates', 'United States', 'Israel', 'Philippines', 'India', 'Finland', 'Spain', 'Japan', 'Lithuania', 'Belgium', 'Austria', 'Colombia', 'Sweden', 'Indonesia', 'Thailand', 'Netherlands', 'Estonia', 'Croatia', 'France', 'Singapore', 'Germany', 'Ireland'}
{'Brazil', 'Argentina', 'Malaysia', 'Denmark', 'Norway', 'Switzerland', 'Mexico', 'Nigeria', 'Australia', 'Canada', 'Chile', 'South Africa', 'United Arab Emirates', 'United States', 'Israel', 'Philippines', 'India', 'Finland', 'Spain', 'Japan', 'Lithuania', 'Belgium', 'Austria', 'Colombia', 'Sweden', 'Indonesia', 'Thailand', 'Netherlands', 'Estonia', 'Croatia', 'France', 'Singapore', 'Germany', 'Ireland'}
{'Brazil', 'Argentina', 'Malaysia', 'Denmark', 'Norwa

## Merge GDP and Quality of Life data into one dataset
#### The Quality of Life dataset now contains the GDP per Capita data

In [18]:
qol_df["GDP Per Capita"] = gdp_df["GDP Per Capita"]
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year,GDP Per Capita
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.199680
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.673570
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589
...,...,...,...,...,...,...,...
228,Sweden,171.4,90.6,79.2,8.6,2021,93076.833587
229,Switzerland,190.8,111.0,131.7,8.4,2021,3484.385958
230,Thailand,100.3,31.4,49.3,22.2,2021,78270.600992
231,United Arab Emirates,156.0,85.7,61.7,4.9,2021,6830.894725


### Drop countries that are not present in all years

In [19]:
year_counts = qol_df.groupby("Country")["Year"].nunique()
countries = year_counts[year_counts == len(set(common_years))].index
qol_df = qol_df[qol_df["Country"].isin(countries)].reset_index(drop=True)
unicorn_startups_df = unicorn_startups_df[unicorn_startups_df["Country"].isin(countries)].reset_index(drop=True)
qol_df

Unnamed: 0,Country,Quality of Life Index,Purchasing Power Index,Cost of Living Index,Property Price to Income Ratio,Year,GDP Per Capita
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.199680
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.673570
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589
...,...,...,...,...,...,...,...
219,Sweden,171.4,90.6,79.2,8.6,2021,93076.833587
220,Switzerland,190.8,111.0,131.7,8.4,2021,3484.385958
221,Thailand,100.3,31.4,49.3,22.2,2021,78270.600992
222,United Arab Emirates,156.0,85.7,61.7,4.9,2021,6830.894725


## Standardizing column and countries (USA) names

In [20]:
unicorn_startups_df.columns = unicorn_startups_df.columns.str.strip().str.replace(' ', '_').str.replace('-', '_')
qol_df.columns = qol_df.columns.str.strip().str.replace(' ', '_').str.replace('-', '_')

def normalize_country(name):
    name = str(name).strip()
    name = name.replace('U.S.', 'United States').replace('USA', 'United States')
    return name.title()

unicorn_startups_df['Country'] = unicorn_startups_df['Country'].apply(normalize_country)
qol_df['Country'] = qol_df['Country'].apply(normalize_country)

## Merge with the ISO3 countries data

In [21]:
def country_to_iso3(name):
    try: return pycountry.countries.lookup(name).alpha_3
    except: return None

unicorn_startups_df['ISO3'] = unicorn_startups_df['Country'].apply(country_to_iso3)
qol_df["ISO3"] = qol_df["Country"].apply(country_to_iso3)

## Aggregate cities' data for unicorn startups

In [22]:
unicorn_startups_df = unicorn_startups_df.groupby(["ISO3", "Country", "Year"], as_index=False).agg(n_unicorns = ("ISO3", "size"), total_val = ("Valuation", "sum")).sort_values(by=["Country", "Year"])
unicorn_startups_df[["total_val_cumulative", "n_unicorns_cumulative"]] = unicorn_startups_df.groupby('Country').agg(n_unicorns = ("total_val", "cumsum"), total_val = ("n_unicorns", "cumsum"))
unicorn_startups_df = unicorn_startups_df.sort_values(by=["Year", "Country"]).reset_index(drop=True)
unicorn_startups_df

Unnamed: 0,ISO3,Country,Year,n_unicorns,total_val,total_val_cumulative,n_unicorns_cumulative
0,COL,Colombia,2015,1,1.15,1.15,1
1,FRA,France,2015,1,2.00,2.00,1
2,USA,United States,2015,14,44.17,44.17,14
3,FRA,France,2016,1,1.10,3.10,2
4,ISR,Israel,2016,1,1.50,1.50,1
...,...,...,...,...,...,...,...
78,SWE,Sweden,2021,2,3.40,12.48,3
79,CHE,Switzerland,2021,1,1.10,5.60,4
80,THA,Thailand,2021,2,2.50,2.50,2
81,ARE,United Arab Emirates,2021,1,1.00,4.50,3


## Fill missing data in Unicorn Startups data

In [23]:
df = unicorn_startups_df.copy()

# 1) Determine the full list of years and countries
years     = df['Year'].sort_values().unique()
countries = df['Country'].unique()

# 2) Build a full MultiIndex of Country × Year
full_idx = pd.MultiIndex.from_product(
    [countries, years],
    names=['Country','Year']
)

# 3) Re‑index your DataFrame onto that full grid
df = (
    df
    .set_index(['Country','Year'])
    .reindex(full_idx)
    .reset_index()
)

# 4) Bring ISO3 back (it’s constant within each Country)
#    you can create a mapping before the reindex step:
iso_map = unicorn_startups_df.drop_duplicates('Country').set_index('Country')['ISO3']
df['ISO3'] = df['Country'].map(iso_map)

# 5) Fill “flow” columns with zeros
df['n_unicorns'] = df['n_unicorns'].fillna(0)
df['total_val']  = df['total_val'].fillna(0.0)

# 6) Sort, then recompute your cumulative sums by country
df = df.sort_values(['Country','Year'])
df['n_unicorns_cumulative'] = df.groupby('Country')['n_unicorns'].cumsum()
df['total_val_cumulative']  = df.groupby('Country')['total_val'].cumsum()

unicorn_startups_df = df.reset_index(drop=True)
unicorn_startups_df

Unnamed: 0,Country,Year,ISO3,n_unicorns,total_val,total_val_cumulative,n_unicorns_cumulative
0,Argentina,2015,ARG,0.0,0.00,0.00,0.0
1,Argentina,2016,ARG,0.0,0.00,0.00,0.0
2,Argentina,2017,ARG,0.0,0.00,0.00,0.0
3,Argentina,2018,ARG,0.0,0.00,0.00,0.0
4,Argentina,2019,ARG,0.0,0.00,0.00,0.0
...,...,...,...,...,...,...,...
219,United States,2017,USA,12.0,48.27,99.44,30.0
220,United States,2018,USA,42.0,222.82,322.26,72.0
221,United States,2019,USA,48.0,231.00,553.26,120.0
222,United States,2020,USA,64.0,224.53,777.79,184.0


## Merge countries with their geo data

In [24]:
world = gpd.read_file("../data/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp").to_crs(4326)
countries_gdf = world[['ISO_A3','geometry']].rename(columns={'ISO_A3':'ISO3', "geometry":"Country_Geom"})

unicorn_startups_df = pd.merge(unicorn_startups_df, countries_gdf, on='ISO3', how='left')
unicorn_startups_df = gpd.GeoDataFrame(unicorn_startups_df, geometry='Country_Geom').to_crs(4326)
qol_df = pd.merge(qol_df, countries_gdf, on='ISO3', how='left')
qol_df = gpd.GeoDataFrame(qol_df, geometry='Country_Geom').to_crs(4326)

unicorn_startups_df.head()

Unnamed: 0,Country,Year,ISO3,n_unicorns,total_val,total_val_cumulative,n_unicorns_cumulative,Country_Geom
0,Argentina,2015,ARG,0.0,0.0,0.0,0.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25 -5..."
1,Argentina,2016,ARG,0.0,0.0,0.0,0.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25 -5..."
2,Argentina,2017,ARG,0.0,0.0,0.0,0.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25 -5..."
3,Argentina,2018,ARG,0.0,0.0,0.0,0.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25 -5..."
4,Argentina,2019,ARG,0.0,0.0,0.0,0.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25 -5..."


In [25]:
qol_df.head()

Unnamed: 0,Country,Quality_of_Life_Index,Purchasing_Power_Index,Cost_of_Living_Index,Property_Price_to_Income_Ratio,Year,GDP_Per_Capita,ISO3,Country_Geom
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.19968,ARG,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25 -5..."
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759,AUS,"MULTIPOLYGON (((147.68926 -40.80826, 148.28907..."
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277,AUT,"POLYGON ((16.97967 48.1235, 16.90375 47.71487,..."
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.67357,BEL,"POLYGON ((6.15666 50.80372, 6.04307 50.12805, ..."
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589,BRA,"POLYGON ((-53.37366 -33.76838, -53.65054 -33.2..."


## Save Data in CSV format

In [26]:
filename = "World_Wide_Unicorn_Startups.csv"
unicorn_startups_df.to_csv(posixpath.join(PREPROCESSED_DATA_PATH, filename), index=False)
unicorn_startups_df = pd.read_csv(posixpath.join(PREPROCESSED_DATA_PATH, filename))
unicorn_startups_df

Unnamed: 0,Country,Year,ISO3,n_unicorns,total_val,total_val_cumulative,n_unicorns_cumulative,Country_Geom
0,Argentina,2015,ARG,0.0,0.00,0.00,0.0,MULTIPOLYGON (((-68.63401022758323 -52.6363704...
1,Argentina,2016,ARG,0.0,0.00,0.00,0.0,MULTIPOLYGON (((-68.63401022758323 -52.6363704...
2,Argentina,2017,ARG,0.0,0.00,0.00,0.0,MULTIPOLYGON (((-68.63401022758323 -52.6363704...
3,Argentina,2018,ARG,0.0,0.00,0.00,0.0,MULTIPOLYGON (((-68.63401022758323 -52.6363704...
4,Argentina,2019,ARG,0.0,0.00,0.00,0.0,MULTIPOLYGON (((-68.63401022758323 -52.6363704...
...,...,...,...,...,...,...,...,...
219,United States,2017,USA,12.0,48.27,99.44,30.0,MULTIPOLYGON (((-122.84000000000003 49.0000000...
220,United States,2018,USA,42.0,222.82,322.26,72.0,MULTIPOLYGON (((-122.84000000000003 49.0000000...
221,United States,2019,USA,48.0,231.00,553.26,120.0,MULTIPOLYGON (((-122.84000000000003 49.0000000...
222,United States,2020,USA,64.0,224.53,777.79,184.0,MULTIPOLYGON (((-122.84000000000003 49.0000000...


In [27]:
filename = "QOL.csv"
qol_df.to_csv(posixpath.join(PREPROCESSED_DATA_PATH, filename), index=False)
qol_df = pd.read_csv(posixpath.join(PREPROCESSED_DATA_PATH, filename))
qol_df

Unnamed: 0,Country,Quality_of_Life_Index,Purchasing_Power_Index,Cost_of_Living_Index,Property_Price_to_Income_Ratio,Year,GDP_Per_Capita,ISO3,Country_Geom
0,Argentina,77.0,59.4,67.1,11.6,2015,14833.199680,ARG,MULTIPOLYGON (((-68.63401022758323 -52.6363704...
1,Australia,180.8,110.4,99.3,7.1,2015,52009.802759,AUS,MULTIPOLYGON (((147.68925947488418 -40.8082581...
2,Austria,182.6,104.6,76.9,9.6,2015,43908.420277,AUT,POLYGON ((16.979666782304037 48.12349701597630...
3,Belgium,136.0,86.2,87.2,6.5,2015,40889.673570,BEL,"POLYGON ((6.15665815595878 50.80372101501058, ..."
4,Brazil,29.8,41.2,55.3,16.7,2015,8936.195589,BRA,POLYGON ((-53.373661668498244 -33.768377780900...
...,...,...,...,...,...,...,...,...,...
219,Sweden,171.4,90.6,79.2,8.6,2021,93076.833587,SWE,POLYGON ((11.027368605196868 58.85614940045936...
220,Switzerland,190.8,111.0,131.7,8.4,2021,3484.385958,CHE,"POLYGON ((9.59422610844635 47.52505809182027, ..."
221,Thailand,100.3,31.4,49.3,22.2,2021,78270.600992,THA,POLYGON ((105.21877689007889 14.27321177821069...
222,United Arab Emirates,156.0,85.7,61.7,4.9,2021,6830.894725,ARE,POLYGON ((51.57951867046327 24.245497137951105...


In [28]:
unicorn_startups_df[unicorn_startups_df["Country_Geom"].isna()]

Unnamed: 0,Country,Year,ISO3,n_unicorns,total_val,total_val_cumulative,n_unicorns_cumulative,Country_Geom
77,France,2015,FRA,1.0,2.0,2.0,1.0,
78,France,2016,FRA,1.0,1.1,3.1,2.0,
79,France,2017,FRA,0.0,0.0,3.1,2.0,
80,France,2018,FRA,1.0,1.4,4.5,3.0,
81,France,2019,FRA,3.0,3.14,7.64,6.0,
82,France,2020,FRA,2.0,5.81,13.45,8.0,
83,France,2021,FRA,10.0,20.78,34.23,18.0,
154,Norway,2015,NOR,0.0,0.0,0.0,0.0,
155,Norway,2016,NOR,0.0,0.0,0.0,0.0,
156,Norway,2017,NOR,0.0,0.0,0.0,0.0,
