In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data
df_gdp_mainlandchina = pd.read_csv('raw_data/data-gdp-mainlandchina-2019.csv')  
df_gdp_usa = pd.read_csv('raw_data/data-gdp-usa-2019.csv')  
df_pop_china = pd.read_csv('raw_data/data-pop-china-2017.csv')  
df_pop_usa = pd.read_csv('raw_data/data-pop-usa-2019.csv')  
df_WEO = pd.read_csv('raw_data/WEO_Data.csv')  

# China

In [3]:
# Merge dataframe for China
df_china = pd.DataFrame.merge(df_gdp_mainlandchina,df_pop_china,how='right',left_on='Provinces',right_on='Administrative Division')

# Don't show Unnamed columns
df_china = df_china.loc[:, ~df_china.columns.str.contains('^Unnamed')] 

# Drop column
df_china = df_china.drop(['Provinces'], axis=1)

# Add column
df_china['Country'] = 'China'
df_china['Year'] = '2019'

# Re-order Columns
df_china = df_china[['Year','Country','Administrative Division','Nominal GDP (Billion)','Population']]

# Calculate GDP per Capita
df_china['GDP per Capita'] = (df_china['Nominal GDP (Billion)'] / df_china['Population'])*1000000000

# Set 0 decimal places
df_china['GDP per Capita'] = df_china['GDP per Capita'].round(0)

df_china.head()

Unnamed: 0,Year,Country,Administrative Division,Nominal GDP (Billion),Population,GDP per Capita
0,2019,China,Guangdong,1561.0,111690000,13976.0
1,2019,China,Jiangsu,1444.0,80290000,17985.0
2,2019,China,Shandong,1030.0,100060000,10294.0
3,2019,China,Zhejiang,904.0,56570000,15980.0
4,2019,China,Henan,787.0,95590000,8233.0


In [4]:
# Review data
df_WEO.head()

Unnamed: 0,Country,Subject Descriptor,Units,Scale,2017,2018,2019,2020,2021,2022,2023,2024,Estimates Start After
0,Afghanistan,"Gross domestic product, current prices",U.S. dollars,Billions,20.235,19.63,18.734,18.861,19.998,21.54,23.237,25.19,2018.0
1,Afghanistan,"Gross domestic product per capita, current prices",U.S. dollars,Units,569.531,544.983,513.108,509.759,533.089,566.416,602.884,644.95,2016.0
2,Afghanistan,"Inflation, average consumer prices",Index,,110.998,111.693,114.583,119.742,125.427,131.699,138.284,145.198,2018.0
3,Afghanistan,Unemployment rate,Percent of total labor force,,,,,,,,,,
4,Afghanistan,Population,Persons,Millions,35.53,36.02,36.51,37.0,37.514,38.028,38.543,39.057,2016.0


In [5]:
# Get GDP value for Hong Kong, Macau and Taiwan
df_hongkong = df_WEO[df_WEO['Country'].str.contains('hong kong', case=False, na=False)]
hongkong_gdp = df_hongkong.loc[df_hongkong['Subject Descriptor'] == 'Gross domestic product, current prices', '2019'].values[0]

df_macau = df_WEO[df_WEO['Country'].str.contains('maca', case=False, na=False)]
macau_gdp = df_macau.loc[df_macau['Subject Descriptor'] == 'Gross domestic product, current prices', '2019'].values[0]

df_taiwan = df_WEO[df_WEO['Country'].str.contains('taiwan', case=False, na=False)]
taiwan_gdp = df_taiwan.loc[df_taiwan['Subject Descriptor'] == 'Gross domestic product, current prices', '2019'].values[0]

print('Hong Kong GDP: ',hongkong_gdp)
print('Macau GDP: ',macau_gdp)
print('Taiwan GDP: ',taiwan_gdp)

Hong Kong GDP:  372.989
Macau GDP:  55.136
Taiwan GDP:  586.104


In [6]:
# Assign value
## to Hong Kong
hongkong_index = df_china.loc[df_china['Administrative Division'] == 'Hong Kong'].index[0]
df_china.at[hongkong_index, 'Nominal GDP (Billion)'] = hongkong_gdp

## to Macau
macau_index = df_china.loc[df_china['Administrative Division'] == 'Macau'].index[0]
df_china.at[macau_index, 'Nominal GDP (Billion)'] = macau_gdp

## to Taiwan 
taiwan_index = df_china.loc[df_china['Administrative Division'] == 'Taiwan'].index[0]
df_china.at[taiwan_index, 'Nominal GDP (Billion)'] = taiwan_gdp

df_china

Unnamed: 0,Year,Country,Administrative Division,Nominal GDP (Billion),Population,GDP per Capita
0,2019,China,Guangdong,1561.0,111690000,13976.0
1,2019,China,Jiangsu,1444.0,80290000,17985.0
2,2019,China,Shandong,1030.0,100060000,10294.0
3,2019,China,Zhejiang,904.0,56570000,15980.0
4,2019,China,Henan,787.0,95590000,8233.0
5,2019,China,Sichuan,676.0,83020000,8143.0
6,2019,China,Hubei,664.0,59020000,11250.0
7,2019,China,Fujian,615.0,39110000,15725.0
8,2019,China,Hunan,576.0,68600000,8397.0
9,2019,China,Shanghai,553.0,24180000,22870.0


# USA

In [7]:
df_gdp_usa.head()

Unnamed: 0,State,Nominal GDP (Billion)
0,Connecticut,1142561
1,Maine,270074
2,Massachusetts,2382235
3,New Hampshire,354379
4,Rhode Island,254164


In [8]:
df_pop_usa.head()

Unnamed: 0,State,Population
0,California,39512223
1,Texas,28995881
2,Florida,21477737
3,New York,19453561
4,Pennsylvania,12801989


In [9]:
# Strip leading and trailng white space in State column
df_gdp_usa['State'] = df_gdp_usa['State'].str.strip()
df_pop_usa['State'] = df_pop_usa['State'].str.strip()

# Merge dataframe for China
df_usa = pd.DataFrame.merge(df_gdp_usa,df_pop_usa,how='left',on='State')

# Add column
df_usa['Country'] = 'United States'
df_usa['Year'] = '2019'

# Rename column
df_usa.rename(columns={'State':'Administrative Division'}, inplace=True)

# Re-order Columns
df_usa = df_usa[['Year','Country','Administrative Division','Nominal GDP (Billion)','Population']]

# Calculate GDP per Capita
df_usa['GDP per Capita'] = (df_usa['Nominal GDP (Billion)'] / df_usa['Population'])*1000000000

# Set 0 decimal places
df_usa['GDP per Capita'] = df_usa['GDP per Capita'].round(0)

df_usa

Unnamed: 0,Year,Country,Administrative Division,Nominal GDP (Billion),Population,GDP per Capita
0,2019,United States,Connecticut,1142561,3565287,320468170.0
1,2019,United States,Maine,270074,1344212,200916225.0
2,2019,United States,Massachusetts,2382235,6949503,342792139.0
3,2019,United States,New Hampshire,354379,1359711,260628178.0
4,2019,United States,Rhode Island,254164,1059361,239921991.0
5,2019,United States,Vermont,139140,623989,222984700.0
6,2019,United States,Delaware,301661,973764,309788614.0
7,2019,United States,District of Columbia,584778,705749,828592035.0
8,2019,United States,Maryland,1713354,6045680,283401371.0
9,2019,United States,New Jersey,2579366,8882190,290397526.0


# Concat

In [10]:
# Concat data frame
clean_df = pd.concat([df_china, df_usa])

# Reset index
clean_df.reset_index(inplace = True)

clean_df

Unnamed: 0,index,Year,Country,Administrative Division,Nominal GDP (Billion),Population,GDP per Capita
0,0,2019,China,Guangdong,1561.0,111690000,13976.0
1,1,2019,China,Jiangsu,1444.0,80290000,17985.0
2,2,2019,China,Shandong,1030.0,100060000,10294.0
3,3,2019,China,Zhejiang,904.0,56570000,15980.0
4,4,2019,China,Henan,787.0,95590000,8233.0
5,5,2019,China,Sichuan,676.0,83020000,8143.0
6,6,2019,China,Hubei,664.0,59020000,11250.0
7,7,2019,China,Fujian,615.0,39110000,15725.0
8,8,2019,China,Hunan,576.0,68600000,8397.0
9,9,2019,China,Shanghai,553.0,24180000,22870.0


In [11]:
clean_df.to_csv('clean_data/clean_china_and_usa.csv')