In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

In [2]:
# Read the csv data file as data_df
data_df = pd.read_csv("data/Billionaires Statistics Dataset.csv")
data_df.head()

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211000,Fashion & Retail,Bernard Arnault & family,74.0,France,Paris,LVMH,Fashion & Retail,France,...,1.1,"$2,715,518,274,227",65.6,102.5,82.5,24.2,60.7,67059887.0,46.227638,2.213749
1,2,180000,Automotive,Elon Musk,51.0,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891
2,3,114000,Technology,Jeff Bezos,59.0,United States,Medina,Amazon,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891
3,4,107000,Technology,Larry Ellison,78.0,United States,Lanai,Oracle,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891
4,5,106000,Finance & Investments,Warren Buffett,92.0,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,328239523.0,37.09024,-95.712891


In [3]:
# Convert gdp_country to float
data_df['gdp_country']= data_df['gdp_country'].replace('[\$,]', '', regex=True).astype(float)
# Convert unit of measurement
data_df['finalWorth'] = data_df['finalWorth']/1000
data_df['gdp_country'] = round(data_df['gdp_country']/1000000000,1)
data_df['population_country'] = round(data_df['population_country']/1000000,1)
data_df['age'] = round(data_df['age'],0)
data_df.head()

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211.0,Fashion & Retail,Bernard Arnault & family,74.0,France,Paris,LVMH,Fashion & Retail,France,...,1.1,2715.5,65.6,102.5,82.5,24.2,60.7,67.1,46.227638,2.213749
1,2,180.0,Automotive,Elon Musk,51.0,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.2,37.09024,-95.712891
2,3,114.0,Technology,Jeff Bezos,59.0,United States,Medina,Amazon,Technology,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.2,37.09024,-95.712891
3,4,107.0,Technology,Larry Ellison,78.0,United States,Lanai,Oracle,Technology,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.2,37.09024,-95.712891
4,5,106.0,Finance & Investments,Warren Buffett,92.0,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,21427.7,88.2,101.8,78.5,9.6,36.6,328.2,37.09024,-95.712891


In [4]:
# Checking rows that contain nan value
#data_df[data_df.isna().any(axis=1)]
data_df.isnull().sum()

rank                                             0
finalWorth                                       0
category                                         0
personName                                       0
age                                             65
country                                         38
city                                            72
source                                           0
industries                                       0
countryOfCitizenship                             0
organization                                  2315
selfMade                                         0
status                                           0
gender                                           0
birthDate                                       76
lastName                                         0
firstName                                        3
title                                         2301
date                                             0
state                          

In [39]:
# Selecting features that are needed
full_df = data_df[['rank', 'finalWorth', 'category', 'personName', 'age', 'country',
    'source', 'industries', 'countryOfCitizenship','selfMade', 'status', 'gender', 'lastName', 'firstName',
       'cpi_country', 
       'gdp_country', 'gross_tertiary_education_enrollment',
       'gross_primary_education_enrollment_country', 'life_expectancy_country',
       'tax_revenue_country_country', 'total_tax_rate_country',
       'population_country', 'latitude_country', 'longitude_country']]
full_df.columns = ['rank', 'finalWorth', 'category', 'personName', 'age', 'country',
    'source', 'industries', 'countryOfCitizenship','selfMade', 'status', 'gender', 'lastName', 'firstName',
       'cpi_country', 'gdp_country', 'gross_tertiary_education_enrollment',
       'gross_primary_education_enrollment_country', 'life_expectancy_country',
       'tax_revenue_country', 'total_tax_rate_country',
       'population_country', 'latitude_country', 'longitude_country']

#Checking null cells
full_df.isnull().sum()

rank                                            0
finalWorth                                      0
category                                        0
personName                                      0
age                                            65
country                                        38
source                                          0
industries                                      0
countryOfCitizenship                            0
selfMade                                        0
status                                          0
gender                                          0
lastName                                        0
firstName                                       3
cpi_country                                   184
gdp_country                                   164
gross_tertiary_education_enrollment           182
gross_primary_education_enrollment_country    181
life_expectancy_country                       182
tax_revenue_country                           183


In [40]:
# Fill in missing age with median age
agedata = full_df.notna()
agedata = full_df[agedata]
median_age = agedata["age"].median()
median_age
full_df['age'] = full_df['age'].fillna(value = median_age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df['age'] = full_df['age'].fillna(value = median_age)


In [41]:
# Fill in null country with country of citizenship
full_df['country'].fillna(full_df['countryOfCitizenship'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df['country'].fillna(full_df['countryOfCitizenship'], inplace=True)


In [42]:
# Fill in null cpi_country with real-time value
for index, row in full_df.iterrows():
    if(row["country"]=='Hong Kong'):
        full_df.at[index, "cpi_country"] = 105.3
    if(row["country"]=='Uzbekistan'):
        full_df.at[index, "cpi_country"] = 209.90 
    if(row["country"]=='Taiwan'):
        full_df.at[index, "cpi_country"] = 105.63
    if(row["country"]=='Eswatini (Swaziland)'):
        full_df.at[index, "cpi_country"] = 174.42
    if(row["country"]=='Bahamas'):
        full_df.at[index, "cpi_country"] = 121.79
    if(row["country"]=='Cayman Islands'):
        full_df.at[index, "cpi_country"] = 130.6
    if(row["country"]=='Bermuda'):
        full_df.at[index, "cpi_country"] = 105.3
    if(row["country"]=='Ireland'):
        full_df.at[index, "cpi_country"] = 121.1
    if(row["country"]=='Liechtenstein'):
        full_df.at[index, "cpi_country"] = 91.8
    if(row["country"]=='Andorra'):
        full_df.at[index, "cpi_country"] = 112.92
    if(row["country"]=='Monaco'):
        full_df.at[index, "cpi_country"] = 122.1
#     if(row["country"]=='Guernsey'):
#         full_df.at[index, "cpi_country"] = 122.1
#     if(row["country"]=='British Virgin Islands'):
#         full_df.at[index, "cpi_country"] = 122.1
#     if(row["country"]=='Turks and Caicos Island'):
#         full_df.at[index, "cpi_country"] = 122.1

In [43]:
# Find the missing values for gdp_country
missing_gdp = {'country':['Hong Kong', 'Germany', 'Taiwan', 'Eswatini (Swaziland)', 'India',
       'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'],
                'gdp': [372.8, 3845.6, 761.7, 4.9, 2611.0, 12.7, 5.9, 2827.1, 7.6, 1.1, 1414.2, 579.1, 533.6, 
                        1862, 0.004, 2778.1, 1.1, 295.6, 21427.7, 2029.0, 226.8, 364.7, 446.3, 76.5, 5081.7]}

# Fill in null gdp_country with real-time value
counter = 0
for i in missing_value['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "gdp_country"] = missing_value['gdp'][counter]
    counter = counter + 1

In [44]:
# Find the missing values for gross_tertiary_education_enrollment
missing_tertiary = {'country':['Hong Kong', 'Monaco', 'Germany', 'Taiwan', 'Eswatini (Swaziland)',
       'India', 'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Andorra', 'Malaysia',
       'Austria', 'Panama', 'Japan'],
              'gross_tertiary_education_enrollment': [50.6, 100, 70.2, 48, 6.75, 28.1, 15.1,
                    25.9, 65.8, 19.0, 39.9, 40.2, 79.7, 75.2, 61.9, 1.0, 65.6, 1.0, 64.1, 88.2,
                    94.3, 70.7, 1, 45.1, 85.1, 47.8, 63.2]
                   }
# Fill in null gross_tertiary_education_enrollment with real-time value
counter = 0
for i in missing_tertiary['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "gross_tertiary_education_enrollment"] = missing_tertiary['gross_tertiary_education_enrollment'][counter]
    counter = counter + 1

In [45]:
# Find the missing values for gross_primary_education_enrollment_country
missing_primary = {'country':['Hong Kong', 'Monaco', 'Germany', 'Taiwan', 'Eswatini (Swaziland)',
       'India', 'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'],
              'gross_primary_education_enrollment_country': [107.8, 130.9, 104.0, 100, 114.5, 113, 92.0, 
                                                      93.2, 101.2, 101.0, 93.5, 105.8, 103.9, 101.4,
                                                      101.9, 100, 102.5, 122.34, 100.7, 101.8,
                                                      98.1, 106.9, 105.3, 103.1, 101.5, 98.8]}

# Fill in null gross_primary_education_enrollment_country with real-time value
counter = 0
for i in missing_primary['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "gross_primary_education_enrollment_country"] = missing_primary['gross_primary_education_enrollment_country'][counter]
    counter = counter + 1

In [46]:
# Find the missing values for life_expectancy_country
missing_life = {'country':['Hong Kong', 'Monaco', 'Germany', 'Taiwan', 'Eswatini (Swaziland)',
       'India', 'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Andorra', 'Malaysia',
       'Austria', 'Panama', 'Japan'],
              'life_expectancy_country': [85.4, 89.5, 80.9, 81.0, 59.7, 69.4, 72.7, 82.2, 81.3, 
                                          82.1, 75.9, 75.0, 81.6, 82.2, 82.9, 82.9, 82.5, 75,79,
                                          78.5, 82.6, 76.5, 81.2, 76, 81.6, 76.7, 84.2]}

# Fill in null life_expectancy_country with real-time value
counter = 0
for i in missing_life['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "life_expectancy_country"] = missing_life['life_expectancy_country'][counter]
    counter = counter + 1

In [50]:
# Find the missing values for population_country
missing_population = {'country':['Hong Kong', 'Germany', 'Taiwan', 'Eswatini (Swaziland)', 'India',
       'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'],
              'population_country': [7.4, 8.3, 23.6, 1.2, 136.6, 0.04, 0.006, 6.7, 0.006, 0.003,
                                    12.6, 1.1, 5.0, 6.0, 0.006, 6.7, 0.005, 1.1, 32.8, 5.2, 3.3,
                                    3.2, 0.9, 4.4, 12.6]}

# Fill in null population_country with real-time value
counter = 0
for i in missing_population['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "population_country"] = missing_population['population_country'][counter]
    counter = counter + 1

In [None]:
# Find the missing values for latitude_country
missing_latitude = {'country':['Hong Kong', 'Germany', 'Taiwan', 'Eswatini (Swaziland)', 'India',
       'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'],
              'latitude_country': [,51.165691,]
              'longitude_country':[,10.451526,]
                   }

# Fill in null latitude_country with real-time value
counter = 0
for i in missing_population['country']:
    for index, row in full_df.iterrows():
        if (row["country"]==i):
            full_df.at[index, "latitude_country"] = missing_population['latitude_country'][counter]
    counter = counter + 1

In [54]:
# Fill in null gdp_country with real-time value
df = full_df[full_df["latitude_country"].isnull()]
df.country.unique()        

array(['Hong Kong', 'Germany', 'Taiwan', 'Eswatini (Swaziland)', 'India',
       'Bahamas', 'Cayman Islands', 'United Kingdom', 'Bermuda',
       'British Virgin Islands', 'Mexico', 'Belgium', 'Ireland', 'Italy',
       'Guernsey', 'France', 'Turks and Caicos Islands', 'Czech Republic',
       'United States', 'South Korea', 'Peru', 'Malaysia', 'Austria',
       'Panama', 'Japan'], dtype=object)

In [53]:
full_df

Unnamed: 0,rank,finalWorth,category,personName,age,country,source,industries,countryOfCitizenship,selfMade,...,cpi_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211.0,Fashion & Retail,Bernard Arnault & family,74.0,France,LVMH,Fashion & Retail,France,False,...,110.05,2778.1,65.6,102.5,82.5,24.2,60.7,6.7,46.227638,2.213749
1,2,180.0,Automotive,Elon Musk,51.0,United States,"Tesla, SpaceX",Automotive,United States,True,...,117.24,21427.7,88.2,101.8,78.5,9.6,36.6,32.8,37.090240,-95.712891
2,3,114.0,Technology,Jeff Bezos,59.0,United States,Amazon,Technology,United States,True,...,117.24,21427.7,88.2,101.8,78.5,9.6,36.6,32.8,37.090240,-95.712891
3,4,107.0,Technology,Larry Ellison,78.0,United States,Oracle,Technology,United States,True,...,117.24,21427.7,88.2,101.8,78.5,9.6,36.6,32.8,37.090240,-95.712891
4,5,106.0,Finance & Investments,Warren Buffett,92.0,United States,Berkshire Hathaway,Finance & Investments,United States,True,...,117.24,21427.7,88.2,101.8,78.5,9.6,36.6,32.8,37.090240,-95.712891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,2540,1.0,Healthcare,Yu Rong,51.0,China,Health clinics,Healthcare,China,True,...,125.08,19910.0,50.6,100.2,77.0,9.4,59.2,1397.7,35.861660,104.195397
2636,2540,1.0,Food & Beverage,"Richard Yuengling, Jr.",80.0,United States,Beer,Food & Beverage,United States,False,...,117.24,21427.7,88.2,101.8,78.5,9.6,36.6,32.8,37.090240,-95.712891
2637,2540,1.0,Manufacturing,Zhang Gongyun,60.0,China,Tyre manufacturing machinery,Manufacturing,China,True,...,125.08,19910.0,50.6,100.2,77.0,9.4,59.2,1397.7,35.861660,104.195397
2638,2540,1.0,Real Estate,Zhang Guiping & family,71.0,China,Real estate,Real Estate,China,True,...,125.08,19910.0,50.6,100.2,77.0,9.4,59.2,1397.7,35.861660,104.195397


# General Informations About Billionaires Distributions

In [None]:
# Gender demographic of billionaires

# Gender proportion pie chart

# Gender with rank and wealth


In [None]:
# Age distribution of billionaires by gender

# histogram

# boxplot to see the distribution within gender


In [None]:
# Billionaires by industries

# Histogram

In [None]:
# Billionaires by countries

# histogram

In [None]:
# Geographical distribution

# Top 10 countries with most billionaires

# Use API geoapify to map the countries of billionaires


In [None]:
# Selfmade billionaire vs. Inherited billionaire


# Correlation Analysis of Billionaires

In [None]:
# Billionaire vs. Countries GDP


In [None]:
# Billionaire vs. Countries Life Expectancies


In [None]:
# Billionaires vs. Countries Population


In [None]:
# Billionaire vs. Countries CPI


In [None]:
# Billionaire vs. Education


In [None]:
# Billionaire vs. Gender

In [None]:
# Billionaire vs. Age

In [None]:
# Billionaire vs. Industries

# Bonus: The chances of you becoming a billionaire

In [None]:
# We are only interested in selfmade billionaire for this part
# Y = billionaire/countries population %
# X = gender + country_gdp + age + industries

