In [104]:
import pandas as pd
import numpy as np

df_gdp = pd.read_csv('E:\\Career\\Projects\\Data-Driven GDP Analysis\\data\\gdp.csv', encoding='ISO-8859-1')

df_gdp.head()

Unnamed: 0,Region/Country/Area,Unnamed: 1,Year,Series,Value,Footnotes,Source
0,1,"Total, all countries or areas",1995,GDP in current prices (millions of US dollars),31269041,,"United Nations Statistics Division, New York, ..."
1,1,"Total, all countries or areas",2005,GDP in current prices (millions of US dollars),47775404,,"United Nations Statistics Division, New York, ..."
2,1,"Total, all countries or areas",2010,GDP in current prices (millions of US dollars),66578017,,"United Nations Statistics Division, New York, ..."
3,1,"Total, all countries or areas",2015,GDP in current prices (millions of US dollars),75283835,,"United Nations Statistics Division, New York, ..."
4,1,"Total, all countries or areas",2019,GDP in current prices (millions of US dollars),87728744,,"United Nations Statistics Division, New York, ..."


In [105]:
# Dropping Columns
df_gdp = df_gdp.drop(columns = ['Footnotes', 'Source', 'Region/Country/Area'])
df_gdp

Unnamed: 0,Unnamed: 1,Year,Series,Value
0,"Total, all countries or areas",1995,GDP in current prices (millions of US dollars),31269041
1,"Total, all countries or areas",2005,GDP in current prices (millions of US dollars),47775404
2,"Total, all countries or areas",2010,GDP in current prices (millions of US dollars),66578017
3,"Total, all countries or areas",2015,GDP in current prices (millions of US dollars),75283835
4,"Total, all countries or areas",2019,GDP in current prices (millions of US dollars),87728744
...,...,...,...,...
6770,Zimbabwe,2010,GDP real rates of growth (percent),19.7
6771,Zimbabwe,2015,GDP real rates of growth (percent),1.8
6772,Zimbabwe,2019,GDP real rates of growth (percent),-6.1
6773,Zimbabwe,2020,GDP real rates of growth (percent),-5.3


In [106]:
# Renaming Unamed: 1 

df_gdp = df_gdp.rename(columns = {'Unnamed: 1':'country',
                                 'Year': 'year'})
df_gdp

Unnamed: 0,country,year,Series,Value
0,"Total, all countries or areas",1995,GDP in current prices (millions of US dollars),31269041
1,"Total, all countries or areas",2005,GDP in current prices (millions of US dollars),47775404
2,"Total, all countries or areas",2010,GDP in current prices (millions of US dollars),66578017
3,"Total, all countries or areas",2015,GDP in current prices (millions of US dollars),75283835
4,"Total, all countries or areas",2019,GDP in current prices (millions of US dollars),87728744
...,...,...,...,...
6770,Zimbabwe,2010,GDP real rates of growth (percent),19.7
6771,Zimbabwe,2015,GDP real rates of growth (percent),1.8
6772,Zimbabwe,2019,GDP real rates of growth (percent),-6.1
6773,Zimbabwe,2020,GDP real rates of growth (percent),-5.3


In [107]:
# renaming values

df_gdp['Series'] = df_gdp['Series'].replace({
    'GDP in current prices (millions of US dollars)': 'gdp',
    'GDP real rates of growth (percent)': 'gdp_growth_rate'
})

df_gdp

Unnamed: 0,country,year,Series,Value
0,"Total, all countries or areas",1995,gdp,31269041
1,"Total, all countries or areas",2005,gdp,47775404
2,"Total, all countries or areas",2010,gdp,66578017
3,"Total, all countries or areas",2015,gdp,75283835
4,"Total, all countries or areas",2019,gdp,87728744
...,...,...,...,...
6770,Zimbabwe,2010,gdp_growth_rate,19.7
6771,Zimbabwe,2015,gdp_growth_rate,1.8
6772,Zimbabwe,2019,gdp_growth_rate,-6.1
6773,Zimbabwe,2020,gdp_growth_rate,-5.3


In [108]:
# Removing the region observations, all before Afghanistan

afghan_index = df_gdp[df_gdp['country'] == 'Afghanistan'].index[0] # first index that matches the condition

df_gdp = df_gdp.loc[afghan_index: ]
df_gdp

Unnamed: 0,country,year,Series,Value
840,Afghanistan,1995,gdp,2757
841,Afghanistan,2005,gdp,6221
842,Afghanistan,2010,gdp,14699
843,Afghanistan,2015,gdp,18713
844,Afghanistan,2019,gdp,18904
...,...,...,...,...
6770,Zimbabwe,2010,gdp_growth_rate,19.7
6771,Zimbabwe,2015,gdp_growth_rate,1.8
6772,Zimbabwe,2019,gdp_growth_rate,-6.1
6773,Zimbabwe,2020,gdp_growth_rate,-5.3


In [109]:
# Checking NAs
df_gdp.isna().sum()

country    0
year       0
Series     0
Value      0
dtype: int64

In [114]:
# Pivoting so that gdp and gdp_growth_rate are their own columns
df_gdp_hierarchical = df_gdp.pivot_table(index = ['country', 'year'], columns = 'Series', values = 'Value', aggfunc = 'first')
#df_gdp_pivot.reset_index(inplace = True)
df_gdp_hierarchical

Unnamed: 0_level_0,Series,GDP in constant 2015 prices (millions of US dollars),GDP per capita (US dollars),gdp,gdp_growth_rate
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,1995,6335,168,2757,30.5
Afghanistan,2005,9596,255,6221,7.5
Afghanistan,2010,14744,521,14699,5.2
Afghanistan,2015,18713,554,18713,-1.4
Afghanistan,2019,20510,501,18904,3.9
...,...,...,...,...,...
Zimbabwe,2010,14100,938,12042,19.7
Zimbabwe,2015,19963,1410,19963,1.8
Zimbabwe,2019,20705,1472,22595,-6.1
Zimbabwe,2020,19608,1383,21665,-5.3


In [120]:
# dropping the constant 2015 prices
df_gdp_hierarchical.rename(columns = {'GDP in constant 2015 prices (millions of US dollars)':'gdp_usa_price_2015',
                              'GDP per capita (US dollars)': 'gdp_per_capita'},
                          inplace = True)

df_gdp_hierarchical

Unnamed: 0_level_0,Series,gdp_usa_price_2015,gdp_per_capita,gdp,gdp_growth_rate
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,1995,6335,168,2757,30.5
Afghanistan,2005,9596,255,6221,7.5
Afghanistan,2010,14744,521,14699,5.2
Afghanistan,2015,18713,554,18713,-1.4
Afghanistan,2019,20510,501,18904,3.9
...,...,...,...,...,...
Zimbabwe,2010,14100,938,12042,19.7
Zimbabwe,2015,19963,1410,19963,1.8
Zimbabwe,2019,20705,1472,22595,-6.1
Zimbabwe,2020,19608,1383,21665,-5.3


In [122]:
# Re-converting back to a flat dataframe instead of hierarchical indexing for easier analysis in the future with machine learning
df_gdp_final = df_gdp_hierarchical.reset_index()
df_gdp_final

Series,country,year,gdp_usa_price_2015,gdp_per_capita,gdp,gdp_growth_rate
0,Afghanistan,1995,6335,168,2757,30.5
1,Afghanistan,2005,9596,255,6221,7.5
2,Afghanistan,2010,14744,521,14699,5.2
3,Afghanistan,2015,18713,554,18713,-1.4
4,Afghanistan,2019,20510,501,18904,3.9
...,...,...,...,...,...,...
1480,Zimbabwe,2010,14100,938,12042,19.7
1481,Zimbabwe,2015,19963,1410,19963,1.8
1482,Zimbabwe,2019,20705,1472,22595,-6.1
1483,Zimbabwe,2020,19608,1383,21665,-5.3


In [170]:
import pycountry_convert as pc

def get_continent(country_name):
    try:
        # Make sure country_name is a string, not a Series or DataFrame
        assert isinstance(country_name, str), f"Input must be str, got {type(country_name)}"
        
        country_code = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        continent_name = pc.convert_continent_code_to_continent_name(continent_code)
        return continent_name
    except Exception as e:
        print(f"Error processing country: {country_name} - {e}")
        # Return None or you could specify a default value, e.g., 'Unknown'
        return None

# Apply get_continent function row-wise using apply with axis=1
df_test = df_gdp_final
df_test['continent'] = df_gdp_final['country'].apply(get_continent)


Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Lao People's Dem. Rep. - "Invalid Country Name: 'Lao People's Dem. Rep.'"
Error processing country: Lao People's Dem. Rep. - "Invalid Country Name: 'Lao People's Dem. Rep.'"
Error processing country: Lao People's Dem. Rep. - "Invalid Country Name: 'Lao People's Dem. Rep.'"
Error processing country: Lao People's Dem. Rep. - "Invalid Country Name: 'Lao People's Dem. Rep.'"
Error processing country: Lao People's Dem. Rep. - "Invalid Country Name: 'Lao People's Dem. Rep.'"
Error processing country

In [171]:
# Renaming and Removing Countries

df_gdp_final['country'] = df_gdp_final['country'].replace('Bolivia (Plurin. State of)', 'Bolivia')
df_gdp_final['country'] = df_gdp_final['country'].replace('China, Hong Kong SAR', 'Hong Kong')
df_gdp_final['country'] = df_gdp_final['country'].replace('China, Macao SAR', 'Macao')
df_gdp_final['country'] = df_gdp_final['country'].replace('Côte d\x92Ivoire', 'Ivory Coast')
df_gdp_final['country'] = df_gdp_final['country'].replace("Dem. People's Rep. Korea", 'North Korea')
df_gdp_final['country'] = df_gdp_final['country'].replace("Dem. Rep. of the Congo", 'Democratic Republic of the Congo')
df_gdp_final['country'] = df_gdp_final['country'].replace('Iran (Islamic Republic of)', 'Iran')
df_gdp_final['country'] = df_gdp_final['country'].replace('Micronesia (Fed. States of)', 'Micronesia')
df_gdp_final['country'] = df_gdp_final['country'].replace('Netherlands (Kingdom of the)', 'Netherlands')
df_gdp_final['country'] = df_gdp_final['country'].replace('Republic of Korea', 'South Korea')
df_gdp_final['country'] = df_gdp_final['country'].replace('United Rep. of Tanzania', 'Tanzania')
df_gdp_final['country'] = df_gdp_final['country'].replace('Venezuela (Boliv. Rep. of)', 'Venezuela')

countries_to_remove = ["Lao People's Dem. Rep.", 'Netherlands Antilles [former]', 'Sint Maarten (Dutch part)']
df_removal = df_gdp_final[~df_gdp_final['country'].isin(countries_to_remove)]
df_removal

Series,country,year,gdp_usa_price_2015,gdp_per_capita,gdp,gdp_growth_rate,continent
0,Afghanistan,1995,6335,168,2757,30.5,Asia
1,Afghanistan,2005,9596,255,6221,7.5,Asia
2,Afghanistan,2010,14744,521,14699,5.2,Asia
3,Afghanistan,2015,18713,554,18713,-1.4,Asia
4,Afghanistan,2019,20510,501,18904,3.9,Asia
...,...,...,...,...,...,...,...
1480,Zimbabwe,2010,14100,938,12042,19.7,Africa
1481,Zimbabwe,2015,19963,1410,19963,1.8,Africa
1482,Zimbabwe,2019,20705,1472,22595,-6.1,Africa
1483,Zimbabwe,2020,19608,1383,21665,-5.3,Africa


In [172]:
df_test = df_removal
df_test['continent'] = df_test['country'].apply(get_continent)

Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Kosovo - "Invalid Country Name: 'Kosovo'"
Error processing country: Saint Vincent & Grenadines - "Invalid Country Name: 'Saint Vincent & Grenadines'"
Error processing country: Saint Vincent & Grenadines - "Invalid Country Name: 'Saint Vincent & Grenadines'"
Error processing country: Saint Vincent & Grenadines - "Invalid Country Name: 'Saint Vincent & Grenadines'"
Error processing country: Saint Vincent & Grenadines - "Invalid Country Name: 'Saint Vincent & Grenadines'"
Error processing country: Saint Vincent & Grenadines - "Invalid Country Name: 'Saint Vincent

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['continent'] = df_test['country'].apply(get_continent)


In [183]:
# Inserting continent for some countries manually
df_test.loc[df_test['country'] == 'State of Palestine', 'continent'] = 'Africa'
df_test.loc[df_test['country'] == 'Kosovo', 'continent'] = 'Europe'
df_test.loc[df_test['country'] == 'Timor-Leste', 'continent'] = 'Asia'
df_test.loc[df_test['country'] == 'Saint Vincent & Grenadines', 'continent'] = 'North America'
df_test.loc[df_test['country'] == 'Zanzibar', 'continent'] = 'Africa'

# Replacing 'Sudan [former]' with Sudan before 2010
df_test.loc[(df_test['country'] == 'Sudan [former]') & (df_test['year'] < 2010), 'country'] = 'Sudan' 
df_test = df_test[df_test['country'] != 'Sudan [former]']
df_test.loc[df_test['country'] == 'Sudan', 'continent'] = 'Africa'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [191]:
# Re-arranging columns and producing final clean DataFrame

column_order = [ 'country', 'continent', 'year', 'gdp', 'gdp_usa_price_2015', 'gdp_per_capita', 'gdp_growth_rate']

df_gdp_clean = df_test
df_gdp_clean = df_gdp_clean[column_order]

df_gdp_clean

Series,country,continent,year,gdp,gdp_usa_price_2015,gdp_per_capita,gdp_growth_rate
0,Afghanistan,Asia,1995,2757,6335,168,30.5
1,Afghanistan,Asia,2005,6221,9596,255,7.5
2,Afghanistan,Asia,2010,14699,14744,521,5.2
3,Afghanistan,Asia,2015,18713,18713,554,-1.4
4,Afghanistan,Asia,2019,18904,20510,501,3.9
...,...,...,...,...,...,...,...
1480,Zimbabwe,Africa,2010,12042,14100,938,19.7
1481,Zimbabwe,Africa,2015,19963,19963,1410,1.8
1482,Zimbabwe,Africa,2019,22595,20705,1472,-6.1
1483,Zimbabwe,Africa,2020,21665,19608,1383,-5.3


In [194]:
# Ensuring correct column types

df_gdp_clean['gdp_usa_price_2015'] = df_gdp_clean['gdp_usa_price_2015'].str.replace(',', '').astype(float)
df_gdp_clean['gdp'] = df_gdp_clean['gdp'].str.replace(',', '').astype(float)
df_gdp_clean['gdp_per_capita'] = df_gdp_clean['gdp_per_capita'].str.replace(',', '').astype(float)
df_gdp_clean['gdp_growth_rate'] = df_gdp_clean['gdp_growth_rate'].str.replace(',', '').astype(float)

df_gdp_clean.dtypes

Series
country                object
continent              object
year                    int64
gdp                   float64
gdp_usa_price_2015    float64
gdp_per_capita        float64
gdp_growth_rate       float64
dtype: object

In [196]:
# Saving the DataFrame to csv

df_gdp_clean.to_csv('E:\\Career\\Projects\\Data-Driven GDP Analysis\\dataframes\\gdp_df.csv', index = False)