In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

In [2]:
# Read the csv data file as data_df
data_df = pd.read_csv("data/Billionaires Statistics Dataset.csv")
data_df.head()

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211000,Fashion & Retail,Bernard Arnault & family,74.0,France,Paris,LVMH,Fashion & Retail,France,...,1.1,"$2,715,518,274,227",65.6,102.5,82.5,24.2,60.7,67059887.0,46.227638,2.213749
1,2,180000,Automotive,Elon Musk,51.0,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891
2,3,114000,Technology,Jeff Bezos,59.0,United States,Medina,Amazon,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891
3,4,107000,Technology,Larry Ellison,78.0,United States,Lanai,Oracle,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891
4,5,106000,Finance & Investments,Warren Buffett,92.0,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891


In [3]:
# Check out the data types
data_df.dtypes

rank                                            int64
finalWorth                                      int64
category                                       object
personName                                     object
age                                           float64
country                                        object
city                                           object
source                                         object
industries                                     object
countryOfCitizenship                           object
organization                                   object
selfMade                                         bool
status                                         object
gender                                         object
birthDate                                      object
lastName                                       object
firstName                                      object
title                                          object
date                        

In [4]:
# Convert gdp_country to float
data_df['gdp_country']= data_df['gdp_country'].replace('[\$,]', '', regex=True).astype(float)
# Convert unit of measurement
data_df['finalWorth'] = data_df['finalWorth']/1000
data_df['gdp_country'] = round(data_df['gdp_country']/1000000000,1)
data_df['population_country'] = round(data_df['population_country']/1000000,1)
data_df.head()

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211.0,Fashion & Retail,Bernard Arnault & family,74.0,France,Paris,LVMH,Fashion & Retail,France,...,1.1,2715.52,65.6,102.5,82.5,24.2,60.7,67.06,46.227638,2.213749
1,2,180.0,Automotive,Elon Musk,51.0,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.24,37.09024,-95.712891
2,3,114.0,Technology,Jeff Bezos,59.0,United States,Medina,Amazon,Technology,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.24,37.09024,-95.712891
3,4,107.0,Technology,Larry Ellison,78.0,United States,Lanai,Oracle,Technology,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.24,37.09024,-95.712891
4,5,106.0,Finance & Investments,Warren Buffett,92.0,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.24,37.09024,-95.712891


In [5]:
# Checking rows that contain nan value
#data_df[data_df.isna().any(axis=1)]
data_df.isnull().sum()

rank                                             0
finalWorth                                       0
category                                         0
personName                                       0
age                                             65
country                                         38
city                                            72
source                                           0
industries                                       0
countryOfCitizenship                             0
organization                                  2315
selfMade                                         0
status                                           0
gender                                           0
birthDate                                       76
lastName                                         0
firstName                                        3
title                                         2301
date                                             0
state                          

In [6]:
# fill missing values in the 'country' column with values from 'country of citizen'
data_df['country'].fillna(data_df['countryOfCitizenship'], inplace=True)

In [7]:
# Cleaning out the 'State' and 'Residence of State' which are not needed
full_df = data_df[['rank', 'finalWorth', 'category', 'personName', 'age', 'country',
    'source', 'industries', 'countryOfCitizenship','selfMade', 'status', 'gender', 'lastName', 'firstName',
       'cpi_country', 
       'gdp_country', 'gross_tertiary_education_enrollment',
       'gross_primary_education_enrollment_country', 'life_expectancy_country',
       'tax_revenue_country_country', 'total_tax_rate_country',
       'population_country', 'latitude_country', 'longitude_country']]
full_df.isnull().sum()

rank                                            0
finalWorth                                      0
category                                        0
personName                                      0
age                                            65
country                                         0
source                                          0
industries                                      0
countryOfCitizenship                            0
selfMade                                        0
status                                          0
gender                                          0
lastName                                        0
firstName                                       3
cpi_country                                   184
gdp_country                                   164
gross_tertiary_education_enrollment           182
gross_primary_education_enrollment_country    181
life_expectancy_country                       182
tax_revenue_country_country                   183


In [8]:
# Kevin fills nans in age with median age
agedata = full_df.notna()
agedata = full_df[agedata]
median_age = agedata["age"].median()
median_age
full_df['age'] = full_df['age'].fillna(value = median_age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df['age'] = full_df['age'].fillna(value = median_age)


In [9]:
full_df.isnull().sum()

rank                                            0
finalWorth                                      0
category                                        0
personName                                      0
age                                             0
country                                         0
source                                          0
industries                                      0
countryOfCitizenship                            0
selfMade                                        0
status                                          0
gender                                          0
lastName                                        0
firstName                                       3
cpi_country                                   184
gdp_country                                   164
gross_tertiary_education_enrollment           182
gross_primary_education_enrollment_country    181
life_expectancy_country                       182
tax_revenue_country_country                   183


In [10]:
# filling out the gross_tertiary_education_enrollment for country
full_df = full_df[full_df["gdp_country"].isnull()]
full_df.country.unique()


array(['Hong Kong', 'Germany', 'Taiwan', 'Eswatini (Swaziland)', 'India',
       'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'], dtype=object)

In [15]:
missing_value = {'country':['Hong Kong', 'Germany', 'Taiwan', 'Eswatini (Swaziland)', 'India',
       'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'],
                'gdp': [372.8, 3845.6, 761.7, 4.9, 2611.0, 12.7, 5.9, 2827.1, 7.6, 1.1, 1414.2, 579.1, 533.6, 
                        1862, 0.004, 2778.1, 1.1, 295.6, 21427.7, 2029.0, 226.8, 364.7, 446.3, 76.5, 5081.7]}
counter = 0
for i in missing_value['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "gdp_country"] = missing_value['gdp'][counter]
    counter = counter + 1
  

In [18]:
full_df

Unnamed: 0,rank,finalWorth,category,personName,age,country,source,industries,countryOfCitizenship,selfMade,...,cpi_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
32,33,38.0,Diversified,Li Ka-shing,94.0,Hong Kong,Diversified,Diversified,Hong Kong,True,...,,372.8,,,,,,,,
46,47,29.5,Real Estate,Lee Shau Kee,95.0,Hong Kong,Real estate,Real Estate,Hong Kong,True,...,,372.8,,,,,,,,
108,108,15.8,Fashion & Retail,Beate Heister,65.0,Germany,Supermarkets,Fashion & Retail,Germany,False,...,,3845.6,,,,,,,,
110,108,15.8,Real Estate,Peter Woo,76.0,Hong Kong,Real estate,Real Estate,Hong Kong,False,...,,372.8,,,,,,,,
125,126,14.1,Real Estate,Kwong Siu-hing,93.0,Hong Kong,Real estate,Real Estate,Hong Kong,False,...,,372.8,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2559,2540,1.0,Finance & Investments,Pollyanna Chu,64.0,Hong Kong,"Financial services, property",Finance & Investments,Hong Kong,True,...,,372.8,,,,,,,,
2607,2540,1.0,Finance & Investments,Stanley Motta,77.0,Panama,Finance,Finance & Investments,Panama,False,...,,76.5,,,,,,,,
2610,2540,1.0,Food & Beverage,Vera Rechulski Santo Domingo,74.0,Bermuda,Beer,Food & Beverage,Brazil,False,...,,7.6,,,,,,,,
2626,2540,1.0,Logistics,Masaru Wasami,77.0,Japan,Logistics,Logistics,Japan,True,...,,5081.7,,,,,,,,


In [17]:
full_df.isnull().sum()

rank                                            0
finalWorth                                      0
category                                        0
personName                                      0
age                                             0
country                                         0
source                                          0
industries                                      0
countryOfCitizenship                            0
selfMade                                        0
status                                          0
gender                                          0
lastName                                        0
firstName                                       0
cpi_country                                   164
gdp_country                                     0
gross_tertiary_education_enrollment           164
gross_primary_education_enrollment_country    164
life_expectancy_country                       164
tax_revenue_country_country                   164


# General Informations About Billionaires Distributions

In [None]:
# Gender demographic of billionaires

# Gender proportion pie chart

# Gender with rank and wealth


In [None]:
# Age distribution of billionaires by gender

# histogram

# boxplot to see the distribution within gender


In [None]:
# Billionaires by industries

# Histogram

In [None]:
# Billionaires by countries

# histogram

In [None]:
# Geographical distribution

# Top 10 countries with most billionaires

# Use API geoapify to map the countries of billionaires


In [None]:
# Selfmade billionaire vs. Inherited billionaire


# Correlation Analysis of Billionaires

In [None]:
# Billionaire vs. Countries GDP


In [None]:
# Billionaire vs. Countries Life Expectancies


In [None]:
# Billionaires vs. Countries Population


In [None]:
# Billionaire vs. Countries Tax Revenue


In [None]:
# Billionaire vs. Countries CPI


In [None]:
# Billionaire vs. Education


# Bonus: The chances of you becoming a billionaire

In [None]:
# We are only interested in selfmade billionaire for this part
# Y = billionaire/countries population %
# X = gender + country + age + industries

