In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data
df_gdp_mainlandchina = pd.read_csv('raw_data/data-gdp-mainlandchina-2019.csv')  
df_gdp_usa = pd.read_csv('raw_data/data-gdp-usa-2019.csv')  
data_gdp_world = pd.read_csv('raw_data/WEO_Data.csv')  

df_pop_china = pd.read_csv('raw_data/data-pop-china-2017.csv')  
df_pop_usa = pd.read_csv('raw_data/data-pop-usa-2019.csv')  
data_pop_world = pd.read_csv('raw_data/WPP2019_TotalPopulationBySex.csv')

df_region_china = pd.read_csv('raw_data/data-economyregion-china.csv')  
df_continent = pd.read_csv('raw_data/country-and-continent-codes-list.csv') 

# Get data of China

In [3]:
# Merge dataframe for China
df_china = pd.DataFrame.merge(df_gdp_mainlandchina,df_pop_china,how='right',left_on='Provinces',right_on='Administrative Division')
df_china = pd.DataFrame.merge(df_china,df_region_china,how='left',left_on='Administrative Division',right_on='Provinces')

# Don't show Unnamed columns
df_china = df_china.loc[:, ~df_china.columns.str.contains('^Unnamed')] 

# Drop column
df_china = df_china.drop(['Provinces_x'], axis=1)
df_china = df_china.drop(['Provinces_y'], axis=1)

# Add column
df_china['Country'] = 'China'
df_china['Year'] = '2019'

# Concat 'CN' to all value in[Region]
df_china['Region'] = df_china['Region'].map(str) + '-CN'

df_china.tail()

Unnamed: 0,Nominal GDP (Billion),Administrative Division,Population,Region,Economic Zone #,Economic Zone,经济区,Characteristic,Country,Year
29,43.0,Qinghai,5980000,West-CN,10,Qinghai-Tibet Plateau,青藏高原经济区,"Natural gas, salt lake resources, non-ferrous ...",China,2019
30,25.0,Tibet,3370000,West-CN,10,Qinghai-Tibet Plateau,青藏高原经济区,"Natural gas, salt lake resources, non-ferrous ...",China,2019
31,,Hong Kong,7335384,Hong Kong-CN,11,Hong Kong,香港特别行政区,"Trade and logistics industry (21.2%), financia...",China,2019
32,,Macau,644900,Macau-CN,12,Macau,澳门港特别行政区,Tourism and apparel industry (41%),China,2019
33,,Taiwan,23562318,Taiwan-CN,13,Taiwan,台澎金马个别关税领域,Agricultural products and OEM,China,2019


In [4]:
# Get GDP value for Hong Kong, Macau and Taiwan
df_hongkong = data_gdp_world[data_gdp_world['Country'].str.contains('hong kong', case=False, na=False)]
hongkong_gdp = df_hongkong.loc[df_hongkong['Subject Descriptor'] == 'Gross domestic product, current prices', '2019'].values[0]

df_macau = data_gdp_world[data_gdp_world['Country'].str.contains('maca', case=False, na=False)]
macau_gdp = df_macau.loc[df_macau['Subject Descriptor'] == 'Gross domestic product, current prices', '2019'].values[0]

df_taiwan = data_gdp_world[data_gdp_world['Country'].str.contains('taiwan', case=False, na=False)]
taiwan_gdp = df_taiwan.loc[df_taiwan['Subject Descriptor'] == 'Gross domestic product, current prices', '2019'].values[0]

print('Hong Kong GDP: ',hongkong_gdp)
print('Macau GDP: ',macau_gdp)
print('Taiwan GDP: ',taiwan_gdp)

Hong Kong GDP:  372.99
Macau GDP:  55.14
Taiwan GDP:  586.1


In [5]:
# Assign value
## to Hong Kong
hongkong_index = df_china.loc[df_china['Administrative Division'] == 'Hong Kong'].index[0]
df_china.at[hongkong_index, 'Nominal GDP (Billion)'] = hongkong_gdp

## to Macau
macau_index = df_china.loc[df_china['Administrative Division'] == 'Macau'].index[0]
df_china.at[macau_index, 'Nominal GDP (Billion)'] = macau_gdp

## to Taiwan 
taiwan_index = df_china.loc[df_china['Administrative Division'] == 'Taiwan'].index[0]
df_china.at[taiwan_index, 'Nominal GDP (Billion)'] = taiwan_gdp

# Calculate GDP per Capita
df_china['GDP per Capita'] = (df_china['Nominal GDP (Billion)'] / df_china['Population'])*1000000000

# Re-order Columns
df_china = df_china[['Year','Country','Region','Economic Zone #','Economic Zone','Administrative Division','Nominal GDP (Billion)','Population','GDP per Capita','Characteristic']]

# Set 0 decimal places
df_china['GDP per Capita'] = df_china['GDP per Capita'].round(0)

df_china.head()

Unnamed: 0,Year,Country,Region,Economic Zone #,Economic Zone,Administrative Division,Nominal GDP (Billion),Population,GDP per Capita,Characteristic
0,2019,China,East-CN,4,Southeast Coast,Guangdong,1561.0,111690000,13976.0,"Export-oriented, digesting foreign advanced te..."
1,2019,China,East-CN,3,East Coast,Jiangsu,1444.0,80290000,17985.0,Multifunctional manufacturing: Light industria...
2,2019,China,East-CN,2,North Coast,Shandong,1030.0,100060000,10294.0,High-tech R & D and manufacturing
3,2019,China,East-CN,3,East Coast,Zhejiang,904.0,56570000,15980.0,Multifunctional manufacturing: Light industria...
4,2019,China,Middle-CN,5,Yellow River Upper and Middle Reaches,Henan,787.0,95590000,8233.0,"Coal mining and coal deep processing, natural ..."


# Get data of USA

In [6]:
df_pop_usa.head()

Unnamed: 0,State,Population
0,California,39512223
1,Texas,28995881
2,Florida,21477737
3,New York,19453561
4,Pennsylvania,12801989


In [7]:
# Strip leading and trailng white space in State column
df_gdp_usa['State'] = df_gdp_usa['State'].str.strip()
df_pop_usa['State'] = df_pop_usa['State'].str.strip()

# Merge dataframe for China
df_usa = pd.DataFrame.merge(df_gdp_usa,df_pop_usa,how='left',on='State')

# Add column
df_usa['Country'] = 'United States'
df_usa['Year'] = '2019'

# Rename column
df_usa.rename(columns={'State':'Administrative Division'}, inplace=True)

# Re-order Columns
df_usa = df_usa[['Year','Country','Region','Administrative Division','Nominal GDP (Billion)','Population']]

# Calculate GDP per Capita
df_usa['GDP per Capita'] = (df_usa['Nominal GDP (Billion)'] / df_usa['Population'])*1000000000

# Set 0 decimal places
df_usa['GDP per Capita'] = df_usa['GDP per Capita'].round(0)

# Concat 'US' to all value in[Region]
df_usa['Region'] = df_usa['Region'].map(str) + '-US'

df_usa.head()

Unnamed: 0,Year,Country,Region,Administrative Division,Nominal GDP (Billion),Population,GDP per Capita
0,2019,United States,New England-US,Connecticut,288.99,3565287,81057.0
1,2019,United States,New England-US,Maine,68.44,1344212,50915.0
2,2019,United States,New England-US,Massachusetts,604.21,6949503,86943.0
3,2019,United States,New England-US,New Hampshire,89.84,1359711,66073.0
4,2019,United States,New England-US,Rhode Island,64.44,1059361,60829.0


# Concat China and USA data

In [8]:
# Concat data frame
clean_china_and_usa_df = pd.concat([df_china, df_usa])

# Reset index
clean_china_and_usa_df.reset_index(drop = True, inplace = True)

clean_china_and_usa_df.head()

Unnamed: 0,Administrative Division,Characteristic,Country,Economic Zone,Economic Zone #,GDP per Capita,Nominal GDP (Billion),Population,Region,Year
0,Guangdong,"Export-oriented, digesting foreign advanced te...",China,Southeast Coast,4.0,13976.0,1561.0,111690000,East-CN,2019
1,Jiangsu,Multifunctional manufacturing: Light industria...,China,East Coast,3.0,17985.0,1444.0,80290000,East-CN,2019
2,Shandong,High-tech R & D and manufacturing,China,North Coast,2.0,10294.0,1030.0,100060000,East-CN,2019
3,Zhejiang,Multifunctional manufacturing: Light industria...,China,East Coast,3.0,15980.0,904.0,56570000,East-CN,2019
4,Henan,"Coal mining and coal deep processing, natural ...",China,Yellow River Upper and Middle Reaches,5.0,8233.0,787.0,95590000,Middle-CN,2019


# Clear world population data

In [14]:
df_pop_world = data_pop_world[['Country','Population']][data_pop_world['Time'] == 2019]
df_pop_world['Population'] = df_pop_world['Population']*1000

# Replace Country Name
df_pop_world['Country'].replace({"Dem. People's Republic of Korea":'North Korea',
                                'Republic of Korea':'South Korea',
                                "Lao People's Democratic Republic":'Lao',
                                'Viet Nam':'Vietnam',
                                'Iran (Islamic Republic of)':'Iran'}, inplace=True)

# df_pop_world.head()
df_pop_world.loc[df_pop_world['Country'].str.contains('Libya')]

Unnamed: 0,Country,Population
143264,Libya,6777453.0


# Clear continent data

In [15]:
# Replace Country Name
df_continent['Country_Name'].replace({"Korea, Democratic People's Republic of":'North Korea',
                                      'Korea, Republic of':'South Korea',
                                      "Lao People's Democratic Republic":'Lao',
                                      'Vietnam, Socialist Republic of':'Vietnam',
                                      'Cuba, Republic of':'Cuba',
                                      'Iran, Islamic Republic of':'Iran',
                                      'Libyan Arab Jamahiriya':'Libya'}, inplace=True)

# Clean data in ['Country']
df_continent['Country'] = df_continent['Country_Name'].str.rsplit(',').str[0]

# df_continent.head()
df_continent.loc[df_continent['Country'].str.contains('Libya')]

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number,Country
129,Africa,AF,Libyan Arab Jamahiriya,LY,LBY,434.0,Libyan Arab Jamahiriya


# Clear world GDP data

In [11]:
# Select columns based on condition
df_gdp_world = data_gdp_world[['Country','Units','Scale','2019']][data_gdp_world['Subject Descriptor'] == 'Gross domestic product, current prices']

# Replace Country Name
df_gdp_world['Country'].replace({'Korea':'South Korea',
                                 'Lao P.D.R.':'Lao',
                                 'Islamic Republic of Iran':'Iran'}, inplace=True)

df_gdp_world.loc[df_gdp_world['Country'].str.contains('Syria')]

Unnamed: 0,Country,Units,Scale,2019
840,Syria,U.S. dollars,Billions,


In [12]:
# Merge dataframe 
df_gdp_world_continent = pd.DataFrame.merge(df_gdp_world,df_continent,how='left',on='Country')

# Merge dataframe 
df_world = pd.DataFrame.merge(df_gdp_world_continent,df_pop_world,how='left',on='Country')

# Add column
df_world['Year'] = '2019'

# Clean world data
df_world = df_world[['Year','Country','Continent_Name','2019','Population']]

# Rename column
df_world.rename(columns={'2019':'Nominal GDP (Billion)'}, inplace=True)

# Change data type
df_world['Nominal GDP (Billion)'] = pd.to_numeric(df_world['Nominal GDP (Billion)'], errors='coerce') # .astype() not working
df_world['Population'].astype(float)

# Calculate GDP per Capita
df_world['GDP per Capita'] = (df_world['Nominal GDP (Billion)'] / df_world['Population'])*1000000000

# Set 0 decimal places
df_world['GDP per Capita'] = df_world['GDP per Capita'].round(0)

# df_world.head()
df_world.loc[df_world['Country'].str.contains('Vietnam')]

Unnamed: 0,Year,Country,Continent_Name,Nominal GDP (Billion),Population,GDP per Capita
196,2019,Vietnam,Asia,261.64,96462108.0,2712.0


In [13]:
clean_china_and_usa_df.to_csv('clean_data/clean_china_and_usa.csv')
df_world.to_csv('clean_data/clean_world.csv')