# Data Cleaning

In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## North American Global Crisis Data

In [156]:
# Use this cell if grabbing data from the internet
url="https://www.hbs.edu/faculty/Documents/ChartData/MapCharts/20160923_global_crisis_data.xlsx"
data=pd.read_excel(url)

In [157]:
# Use this cell if you have a local copy of the data
# filepath = '/20160923_global_crisis_data.csv'     # Use your own filepath
# data = pd.read_excel(filepath)

In [158]:
# Confirm that we have properly loaded the data into a dataframe
type(data)

pandas.core.frame.DataFrame

In [159]:
# Look at first 5 rows
data.head(5)

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,"SOVEREIGN EXTERNAL DEBT 1: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom and post-1975 defaults on Official External Creditors","SOVEREIGN EXTERNAL DEBT 2: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom but includes post-1975 defaults on Official External Creditors",Defaults_External_Notes,GDP_Weighted_default,<,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises
0,,,,,x,,x,x,,,...,,x,,,x,x,,x,x,x
1,1.0,DZA,Algeria,1800.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
2,1.0,DZA,Algeria,1801.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
3,1.0,DZA,Algeria,1802.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
4,1.0,DZA,Algeria,1803.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0


In [160]:
# Look at column names and types
data.columns

Index(['Case', 'CC3', 'Country', 'Year', 'Banking Crisis ',
       'Banking_Crisis_Notes', 'Systemic Crisis', 'Gold Standard', 'exch_usd',
       'exch_usd_alt1', 'exch_usd_alt2', 'exch_usd_alt3', 'conversion_notes',
       'national currency', 'exch_primary source code', 'exch_sources',
       'Domestic_Debt_In_Default', 'Domestic_Debt_ Notes/Sources',
       'SOVEREIGN EXTERNAL DEBT 1: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom and post-1975 defaults on Official External Creditors',
       'SOVEREIGN EXTERNAL DEBT 2: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom but includes post-1975 defaults on Official External Creditors',
       'Defaults_External_Notes', 'GDP_Weighted_default', '<',
       'Inflation, Annual percentages of average consumer prices',
       'Independence', 'Currency Crises', 'Inflation Crises'],
      dtype='object')

In [161]:
# Drop the first row because it is not actual data
data.drop(index=0)

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,"SOVEREIGN EXTERNAL DEBT 1: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom and post-1975 defaults on Official External Creditors","SOVEREIGN EXTERNAL DEBT 2: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom but includes post-1975 defaults on Official External Creditors",Defaults_External_Notes,GDP_Weighted_default,<,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises
1,1.0,DZA,Algeria,1800.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
2,1.0,DZA,Algeria,1801.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
3,1.0,DZA,Algeria,1802.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
4,1.0,DZA,Algeria,1803.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
5,1.0,DZA,Algeria,1804.0,0,,0,0,,,...,,0,0.0,,0,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15186,70.0,ZWE,Zimbabwe,2012.0,0,,0,,361.9,,...,,1,1.0,,0,0,3.72,1,0,0
15187,70.0,ZWE,Zimbabwe,2013.0,0,,0,,361.9,,...,,1,1.0,,0,0,1.632,1,0,0
15188,70.0,ZWE,Zimbabwe,2014.0,0,,0,,,,...,,1,1.0,,0,0,-0.213,1,0,0
15189,70.0,ZWE,Zimbabwe,2015.0,,,0,,,,...,,1,1.0,,0,,-2.399,1,0,0


In [162]:
# Examine datatypes of the columns
data.dtypes

Case                                                                                                                                                                                                          float64
CC3                                                                                                                                                                                                            object
Country                                                                                                                                                                                                        object
Year                                                                                                                                                                                                          float64
Banking Crisis                                                                                                                                  

In [163]:
# Create a dataframe that only contains data from the US and Canada (North America, essentially)
north_american_data = pd.concat([
    data.loc[data['Country'] == 'Canada'],
    data.loc[data['Country'] == 'Mexico'], 
    data.loc[data['Country'] == 'United States']])
north_american_data.head(10)

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,"SOVEREIGN EXTERNAL DEBT 1: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom and post-1975 defaults on Official External Creditors","SOVEREIGN EXTERNAL DEBT 2: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom but includes post-1975 defaults on Official External Creditors",Defaults_External_Notes,GDP_Weighted_default,<,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises
1737,9.0,CAN,Canada,1800.0,0,,0,0,0.2441,,...,,0,0.0,,0,,,0,0,0
1738,9.0,CAN,Canada,1801.0,0,,0,0,0.2379,,...,,0,0.0,,0,,,0,0,0
1739,9.0,CAN,Canada,1802.0,0,,0,0,0.2553,,...,,0,0.0,,0,,,0,0,0
1740,9.0,CAN,Canada,1803.0,0,,0,0,0.2527,,...,,0,0.0,,0,,,0,0,0
1741,9.0,CAN,Canada,1804.0,0,,0,0,0.243,,...,,0,0.0,,0,,,0,0,0
1742,9.0,CAN,Canada,1805.0,0,,0,0,0.2528,,...,,0,0.0,,0,,,0,0,0
1743,9.0,CAN,Canada,1806.0,0,,0,0,0.2478,,...,,0,0.0,,0,,,0,0,0
1744,9.0,CAN,Canada,1807.0,0,,0,0,0.2534,,...,,0,0.0,,0,,,0,0,0
1745,9.0,CAN,Canada,1808.0,0,,0,0,0.2418,,...,,0,0.0,,0,,,0,0,0
1746,9.0,CAN,Canada,1809.0,0,,0,0,0.2478,,...,,0,0.0,,0,,,0,0,0


In [164]:
# Drop unneccesary columns 
north_american_data = north_american_data.drop(columns=[
    'exch_usd_alt2', 
    'exch_usd_alt3',
    'conversion_notes',
    'SOVEREIGN EXTERNAL DEBT 2: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom but includes post-1975 defaults on Official External Creditors',
    '<'])

In [165]:
# Rename that one SOVEREIGN EXTERNAL DEBT column
north_american_data = north_american_data.rename(columns={"SOVEREIGN EXTERNAL DEBT 1: DEFAULT and RESTRUCTURINGS, 1800-2012--Does not include defaults on WWI debt to United States and United Kingdom and post-1975 defaults on Official External Creditors": "Sovereign External Debt"})

In [166]:
# Describe quantatative data
north_american_data.describe()

Unnamed: 0,Case,Year,exch_usd_alt1
count,651.0,651.0,271.0
mean,38.0,1908.0,1.26906
std,23.298795,62.690007,3.06304
min,9.0,1800.0,0.0
25%,9.0,1854.0,0.0
50%,39.0,1908.0,0.00688
75%,66.0,1962.0,1.0761
max,66.0,2016.0,16.49


In [167]:
# Check for NANs (Spoiler Alert: There are a lot)
north_american_data.isna().any()

Case                                                        False
CC3                                                         False
Country                                                     False
Year                                                        False
Banking Crisis                                               True
Banking_Crisis_Notes                                         True
Systemic Crisis                                              True
Gold Standard                                               False
exch_usd                                                     True
exch_usd_alt1                                                True
national currency                                            True
exch_primary source code                                     True
exch_sources                                                False
Domestic_Debt_In_Default                                     True
Domestic_Debt_ Notes/Sources                                 True
Sovereign 

## Unemployment Data

In [168]:
# US
us_url = 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=LRUN64TTUSA156S&scale=left&cosd=1960-01-01&coed=2019-01-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-10-29&revision_date=2020-10-29&nd=1960-01-01'
us_unemployment=pd.read_csv(us_url)

# Mexico
mx_url = 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=LRUNTTTTMXA156N&scale=left&cosd=1987-01-01&coed=2019-01-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-10-30&revision_date=2020-10-30&nd=1987-01-01'
mx_unemployment=pd.read_csv(mx_url)

# Canada
ca_url = 'https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=LRUNTTTTCAA156S&scale=left&cosd=1960-01-01&coed=2019-01-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-10-29&revision_date=2020-10-29&nd=1960-01-01'
ca_unemployment=pd.read_csv(ca_url)

In [169]:
us_unemployment.head(10)

Unnamed: 0,DATE,LRUN64TTUSA156S
0,1960-01-01,5.617115
1,1961-01-01,6.770271
2,1962-01-01,5.59621
3,1963-01-01,5.739126
4,1964-01-01,5.246175
5,1965-01-01,4.579955
6,1966-01-01,3.832191
7,1967-01-01,3.891057
8,1968-01-01,3.608531
9,1969-01-01,3.557987


## GDP & Per Capita GDP Data

In [170]:
# Aquire GDP and per capita GDP data
gdp_url = 'http://www.ggdc.net/MADDISON/Historical_Statistics/horizontal-file_02-2010.xls'

gdp_data = pd.read_excel(gdp_url, sheet_name='GDP')
percapita_gdp_data = pd.read_excel(gdp_url, sheet_name='PerCapita GDP')

In [171]:
# Confirm that we have properly loaded the data into a dataframe
print(type(gdp_data))
print(type(percapita_gdp_data))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [172]:
# Check out the dataframes
gdp_data.head(5)

Unnamed: 0,GDP Levels,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 190,Unnamed: 191,Unnamed: 192,Unnamed: 193,Unnamed: 194,Unnamed: 195,Unnamed: 196,Unnamed: 197,Unnamed: 198,Unnamed: 199
0,million 1990 International Geary-Khamis dollars,,,,,,,,,,...,,,,,,,,,,
1,,1.0,,1000.0,,1500.0,,1600.0,,1700.0,...,1999.0,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0
2,Western Europe,,,,,,,,,,...,,,,,,,,,,
3,Austria,212.5,,297.5,,1414.0,,2093.0,,2483.0,...,162410.0,167878.0,169244.0,170755.0,172763.0,176795.0,181215.0,187557.0,194122.0,198004.0
4,Belgium,135.0,,170.0,,1225.0,,1561.0,,2288.0,...,204349.0,212010.0,213668.0,216891.0,219074.0,225609.0,230572.0,237490.0,243666.0,246103.0


In [173]:
# Check out the dataframes
percapita_gdp_data.head(5)

Unnamed: 0,Per Capita GDP,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 190,Unnamed: 191,Unnamed: 192,Unnamed: 193,Unnamed: 194,Unnamed: 195,Unnamed: 196,Unnamed: 197,Unnamed: 198,Unnamed: 199
0,(1990 International Geary-Khamis dollars),,,,,,,,,,...,,,,,,,,,,
1,,1.0,,1000.0,,1500.0,,1600.0,,1700.0,...,1999.0,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0
2,Western Europe,,,,,,,,,,...,,,,,,,,,,
3,Austria,425.0,,425.0,,707.0,,837.2,,993.2,...,20065.093878,20691.415561,20812.893753,20955.874051,21165.047259,21626.929322,22140.725899,22892.682427,23674.04113,24130.547035
4,Belgium,450.0,,425.0,,875.0,,975.625,,1144.0,...,19964.428266,20656.45857,20761.238278,21032.935511,21205.859281,21801.602508,22246.561977,22881.63281,23446.949672,23654.763464


Stitching Unemployment Rate Data

In [174]:
north_american_data = north_american_data.reset_index(drop=True)

In [175]:
north_american_data

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,exch_sources,Domestic_Debt_In_Default,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises
0,9.0,CAN,Canada,1800.0,0,,0,0,0.2441,,...,Primary source is GFD market eop; alt1 series ...,0,,0,,0,,0,0,0
1,9.0,CAN,Canada,1801.0,0,,0,0,0.2379,,...,Primary source is GFD market eop; alt1 series ...,0,,0,,0,,0,0,0
2,9.0,CAN,Canada,1802.0,0,,0,0,0.2553,,...,Primary source is GFD market eop; alt1 series ...,0,,0,,0,,0,0,0
3,9.0,CAN,Canada,1803.0,0,,0,0,0.2527,,...,Primary source is GFD market eop; alt1 series ...,0,,0,,0,,0,0,0
4,9.0,CAN,Canada,1804.0,0,,0,0,0.243,,...,Primary source is GFD market eop; alt1 series ...,0,,0,,0,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,66.0,USA,United States,2012.0,0,,0,0,1.6251,,...,Primary source is UK pound level,0,,0,,0,2.078,1,0,0
647,66.0,USA,United States,2013.0,0,,0,0,1.6556,,...,Primary source is UK pound level,0,,0,,0,1.467,1,0,0
648,66.0,USA,United States,2014.0,0,,0,0,1.5573,,...,Primary source is UK pound level,0,,0,,0,1.61,1,0,0
649,66.0,USA,United States,2015.0,,,0,0,1.4738,,...,Primary source is UK pound level,0,,0,,0,0.118,1,0,


In [176]:
north_american_data["Unemployment Rate"] = 0

In [177]:
north_american_data

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_In_Default,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate
0,9.0,CAN,Canada,1800.0,0,,0,0,0.2441,,...,0,,0,,0,,0,0,0,0
1,9.0,CAN,Canada,1801.0,0,,0,0,0.2379,,...,0,,0,,0,,0,0,0,0
2,9.0,CAN,Canada,1802.0,0,,0,0,0.2553,,...,0,,0,,0,,0,0,0,0
3,9.0,CAN,Canada,1803.0,0,,0,0,0.2527,,...,0,,0,,0,,0,0,0,0
4,9.0,CAN,Canada,1804.0,0,,0,0,0.243,,...,0,,0,,0,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,66.0,USA,United States,2012.0,0,,0,0,1.6251,,...,0,,0,,0,2.078,1,0,0,0
647,66.0,USA,United States,2013.0,0,,0,0,1.6556,,...,0,,0,,0,1.467,1,0,0,0
648,66.0,USA,United States,2014.0,0,,0,0,1.5573,,...,0,,0,,0,1.61,1,0,0,0
649,66.0,USA,United States,2015.0,,,0,0,1.4738,,...,0,,0,,0,0.118,1,0,,0


In [178]:
us_unemployment = us_unemployment.rename(columns={"DATE": "Year", "LRUN64TTUSA156S": "Unemployment Rate"})

In [179]:
us_unemployment_list = us_unemployment.loc[0:56, "Unemployment Rate"].tolist()

In [180]:
north_american_data.loc[594:650, "Unemployment Rate"] = us_unemployment_list

In [181]:
north_american_data

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_In_Default,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate
0,9.0,CAN,Canada,1800.0,0,,0,0,0.2441,,...,0,,0,,0,,0,0,0,0.000000
1,9.0,CAN,Canada,1801.0,0,,0,0,0.2379,,...,0,,0,,0,,0,0,0,0.000000
2,9.0,CAN,Canada,1802.0,0,,0,0,0.2553,,...,0,,0,,0,,0,0,0,0.000000
3,9.0,CAN,Canada,1803.0,0,,0,0,0.2527,,...,0,,0,,0,,0,0,0,0.000000
4,9.0,CAN,Canada,1804.0,0,,0,0,0.243,,...,0,,0,,0,,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,66.0,USA,United States,2012.0,0,,0,0,1.6251,,...,0,,0,,0,2.078,1,0,0,8.165597
647,66.0,USA,United States,2013.0,0,,0,0,1.6556,,...,0,,0,,0,1.467,1,0,0,7.485224
648,66.0,USA,United States,2014.0,0,,0,0,1.5573,,...,0,,0,,0,1.61,1,0,0,6.255356
649,66.0,USA,United States,2015.0,,,0,0,1.4738,,...,0,,0,,0,0.118,1,0,,5.366365


In [182]:
ca_unemployment = ca_unemployment.rename(columns={"DATE": "Year", "LRUNTTTTCAA156S": "Unemployment Rate"})

In [183]:
ca_unemployment_list = ca_unemployment.loc[0:56, "Unemployment Rate"].tolist()

In [184]:
north_american_data.loc[160:216, "Unemployment Rate"] = ca_unemployment_list

In [185]:
north_american_data.loc[160:216]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_In_Default,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate
160,9.0,CAN,Canada,1960.0,0.0,,0.0,0,0.996,0.996,...,0.0,,0,,0,0.952614,1,0,0,6.991667
161,9.0,CAN,Canada,1961.0,0.0,,0.0,0,1.0431,1.0434,...,0.0,,0,,0,0.629164,1,0,0,7.183333
162,9.0,CAN,Canada,1962.0,0.0,,0.0,0,1.0772,1.0772,...,0.0,,0,,0,1.72003,1,0,0,5.958333
163,9.0,CAN,Canada,1963.0,0.0,,0.0,0,1.0806,1.0806,...,0.0,,0,,0,1.8953,1,0,0,5.583333
164,9.0,CAN,Canada,1964.0,0.0,,0.0,0,1.0738,1.0738,...,0.0,,0,,0,1.56935,1,0,0,4.708333
165,9.0,CAN,Canada,1965.0,0.0,,0.0,0,1.075,1.075,...,0.0,,0,,0,2.5945,1,0,0,3.95
166,9.0,CAN,Canada,1966.0,0.0,,0.0,0,1.0838,1.0838,...,0.0,,0,,0,3.68509,1,0,0,3.375
167,9.0,CAN,Canada,1967.0,0.0,,0.0,0,1.0808,1.0806,...,0.0,,0,,0,3.22191,1,0,0,3.833333
168,9.0,CAN,Canada,1968.0,0.0,,0.0,0,1.0728,1.0728,...,0.0,,0,,0,3.66681,1,0,0,4.525
169,9.0,CAN,Canada,1969.0,0.0,,0.0,0,1.0728,1.0728,...,0.0,,0,,0,4.67554,1,0,0,4.425


In [186]:
north_american_data

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_In_Default,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate
0,9.0,CAN,Canada,1800.0,0,,0,0,0.2441,,...,0,,0,,0,,0,0,0,0.000000
1,9.0,CAN,Canada,1801.0,0,,0,0,0.2379,,...,0,,0,,0,,0,0,0,0.000000
2,9.0,CAN,Canada,1802.0,0,,0,0,0.2553,,...,0,,0,,0,,0,0,0,0.000000
3,9.0,CAN,Canada,1803.0,0,,0,0,0.2527,,...,0,,0,,0,,0,0,0,0.000000
4,9.0,CAN,Canada,1804.0,0,,0,0,0.243,,...,0,,0,,0,,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,66.0,USA,United States,2012.0,0,,0,0,1.6251,,...,0,,0,,0,2.078,1,0,0,8.165597
647,66.0,USA,United States,2013.0,0,,0,0,1.6556,,...,0,,0,,0,1.467,1,0,0,7.485224
648,66.0,USA,United States,2014.0,0,,0,0,1.5573,,...,0,,0,,0,1.61,1,0,0,6.255356
649,66.0,USA,United States,2015.0,,,0,0,1.4738,,...,0,,0,,0,0.118,1,0,,5.366365


In [187]:
mx_unemployment

Unnamed: 0,DATE,LRUNTTTTMXA156N
0,1987-01-01,3.925
1,1988-01-01,3.6
2,1989-01-01,3.0
3,1990-01-01,2.75
4,1991-01-01,2.625
5,1992-01-01,2.825
6,1993-01-01,3.425
7,1994-01-01,3.7
8,1995-01-01,6.225
9,1996-01-01,5.45


In [188]:
mx_unemployment = mx_unemployment.rename(columns={"DATE": "Year", "LRUNTTTTMXA156N": "Unemployment Rate"})

In [189]:
mx_unemployment_list = mx_unemployment.loc[0:29, "Unemployment Rate"].tolist()

In [190]:
north_american_data.loc[404:433, "Unemployment Rate"] = mx_unemployment_list

In [191]:
north_american_data[404:433]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_In_Default,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate
404,39.0,MEX,Mexico,1987.0,0.0,,0.0,0,0.0,2.2097,...,0,,1,Interest rate shocks,0.95,131.962,1,1,1,3.925
405,39.0,MEX,Mexico,1988.0,0.0,,0.0,0,0.0,2.281,...,0,,1,Interest rate shocks,0.95,113.501,1,0,1,3.6
406,39.0,MEX,Mexico,1989.0,0.0,,0.0,0,0.0,2.641,...,0,,1,Interest rate shocks,0.95,19.918,1,1,1,3.0
407,39.0,MEX,Mexico,1990.0,0.0,,0.0,0,0.0,2.9454,...,0,,1,Interest rate shocks,0.95,26.65,1,0,1,2.75
408,39.0,MEX,Mexico,1991.0,0.0,Government took over banking system.,1.0,0,0.0,3.071,...,0,,0,,0.0,22.608,1,0,1,2.625
409,39.0,MEX,Mexico,1992.0,0.0,,0.0,0,0.0,3.1154,...,0,,0,,0.0,15.517,1,0,0,2.825
410,39.0,MEX,Mexico,1993.0,1.0,Several financial institutions that held Ajust...,1.0,0,3.1059,3.1059,...,0,,0,,0.0,9.763,1,0,0,3.425
411,39.0,MEX,Mexico,1994.0,1.0,Several financial institutions that held Ajust...,1.0,0,5.325,5.325,...,0,,0,,0.0,6.986,1,1,0,3.7
412,39.0,MEX,Mexico,1995.0,1.0,Several financial institutions that held Ajust...,1.0,0,7.6425,7.6425,...,0,,0,,0.0,35.062,1,1,1,6.225
413,39.0,MEX,Mexico,1996.0,1.0,Several financial institutions that held Ajust...,1.0,0,7.8509,7.8509,...,0,,0,,0.0,34.35,1,0,1,5.45


In [192]:
gdp_data.columns

Index(['GDP Levels', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       ...
       'Unnamed: 190', 'Unnamed: 191', 'Unnamed: 192', 'Unnamed: 193',
       'Unnamed: 194', 'Unnamed: 195', 'Unnamed: 196', 'Unnamed: 197',
       'Unnamed: 198', 'Unnamed: 199'],
      dtype='object', length=200)

In [193]:
# Aquire GDP from vertical file
gdp_url = 'http://www.ggdc.net/MADDISON/Historical_Statistics/vertical-file_02-2010.xls'
#cols = ['A', 'W', 'X', 'BK']
gdp_data = pd.read_excel(gdp_url, sheet_name='GDP', usecols="A, W, X, BK")

In [194]:
gdp_data = gdp_data.rename(columns={"GDP": "Year", "Unnamed: 22": "Canada", "Unnamed: 23": "USA", "Unnamed: 62": "Mexico"})

In [195]:
gdp_data[12:]

Unnamed: 0,Year,Canada,USA,Mexico
12,1820,738,12548,5000
13,1821,,,
14,1822,,,
15,1823,,,
16,1824,,,
...,...,...,...,...
196,2004,769405,8738865,772208
197,2005,792487,9009770,797691
198,2006,815469,9253034,837576
199,2007,835856,9447347,866576


In [196]:
canada_gdp_list = gdp_data.loc[12:200, "Canada"].tolist()

In [197]:
north_american_data.loc[20:208, "GDP"] = canada_gdp_list

In [198]:
north_american_data[20:208]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP
20,9.0,CAN,Canada,1820.0,0,,0,0,0.2627,,...,,0,,0,,0,0,0,0.000000,738.0
21,9.0,CAN,Canada,1821.0,0,,0,0,0.2585,,...,,0,,0,,0,0,0,0.000000,
22,9.0,CAN,Canada,1822.0,0,,0,0,0.25,,...,,0,,0,,0,0,0,0.000000,
23,9.0,CAN,Canada,1823.0,0,,0,0,0.2551,,...,,0,,0,,0,0,0,0.000000,
24,9.0,CAN,Canada,1824.0,0,,0,0,0.2582,,...,,0,,0,,0,0,0,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,9.0,CAN,Canada,2003.0,0,,0,0,1.2965,1.2924,...,,0,,0,2.742,1,0,0,7.575000,746491.0
204,9.0,CAN,Canada,2004.0,0,,0,0,1.2028,1.2036,...,,0,,0,1.841,1,0,0,7.191667,769405.0
205,9.0,CAN,Canada,2005.0,0,,0,0,1.1623,1.1645,...,,0,,0,2.23,1,0,0,6.758333,792487.0
206,9.0,CAN,Canada,2006.0,0,,0,0,1.165,1.1653,...,,0,,0,2.018,1,0,0,6.333333,815469.0


In [199]:
north_american_data[454:645]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP
454,66.0,USA,United States,1820.0,0,,0,0,4.62321,,...,,0,,0,-7.84157,1,0,0,0.000000,
455,66.0,USA,United States,1821.0,0,,0,0,4.92854,,...,,0,,0,-3.90402,1,0,0,0.000000,
456,66.0,USA,United States,1822.0,0,,0,0,4.98504,,...,,0,,0,3.52719,1,0,0,0.000000,
457,66.0,USA,United States,1823.0,0,,0,0,4.78927,,...,,0,,0,-8.65493,1,0,0,0.000000,
458,66.0,USA,United States,1824.0,0,,0,0,4.86145,,...,,0,,0,-7.47805,1,0,0,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,66.0,USA,United States,2006.0,0,,0,0,1.963,,...,,0,,0,3.222,1,0,0,4.688935,
641,66.0,USA,United States,2007.0,1,,0,0,1.9847,,...,,0,,0,2.871,1,0,0,4.675080,
642,66.0,USA,United States,2008.0,1,,0,0,1.4626,,...,,0,,0,3.815,1,0,0,5.847526,
643,66.0,USA,United States,2009.0,1,,0,0,1.6154,,...,,0,,0,-0.32,1,0,0,9.377906,


In [200]:
usa_gdp_list = gdp_data.loc[12:200, "USA"].tolist()

In [201]:
north_american_data.loc[454:642, "GDP"] = usa_gdp_list

In [202]:
north_american_data[454:642]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP
454,66.0,USA,United States,1820.0,0,,0,0,4.62321,,...,,0,,0,-7.84157,1,0,0,0.000000,12548.0
455,66.0,USA,United States,1821.0,0,,0,0,4.92854,,...,,0,,0,-3.90402,1,0,0,0.000000,
456,66.0,USA,United States,1822.0,0,,0,0,4.98504,,...,,0,,0,3.52719,1,0,0,0.000000,
457,66.0,USA,United States,1823.0,0,,0,0,4.78927,,...,,0,,0,-8.65493,1,0,0,0.000000,
458,66.0,USA,United States,1824.0,0,,0,0,4.86145,,...,,0,,0,-7.47805,1,0,0,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,66.0,USA,United States,2003.0,0,,0,0,1.785,,...,,0,,0,2.298,1,1,0,6.061592,8431121.0
638,66.0,USA,United States,2004.0,0,,0,0,1.931,,...,,0,,0,2.668,1,0,0,5.597952,8738865.0
639,66.0,USA,United States,2005.0,0,,0,0,1.722,,...,,0,,0,3.366,1,0,0,5.143966,9009770.0
640,66.0,USA,United States,2006.0,0,,0,0,1.963,,...,,0,,0,3.222,1,0,0,4.688935,9253034.0


In [203]:
north_american_data[237:425]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP
237,39.0,MEX,Mexico,1820.0,0,,0,0,0.9988,0.0000,...,,0,,0,,0,0,0,0.000000,
238,39.0,MEX,Mexico,1821.0,0,,0,0,0.9901,0.0000,...,,0,,0,-13.608,1,0,0,0.000000,
239,39.0,MEX,Mexico,1822.0,0,,0,0,0.9926,0.0000,...,,0,,0,,1,0,0,0.000000,
240,39.0,MEX,Mexico,1823.0,0,,0,0,0.995,0.0000,...,,0,,0,,1,0,0,0.000000,
241,39.0,MEX,Mexico,1824.0,0,,0,0,0.9911,0.0000,...,,0,,0,,1,0,0,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,39.0,MEX,Mexico,2003.0,0,,0,0,11.236,11.2360,...,,0,,0,4.551,1,0,0,3.405833,
421,39.0,MEX,Mexico,2004.0,0,,0,0,11.2648,11.2648,...,,0,,0,4.694,1,0,0,3.915833,
422,39.0,MEX,Mexico,2005.0,0,,0,0,10.7777,10.7777,...,,0,,0,3.99,1,0,0,3.559639,
423,39.0,MEX,Mexico,2006.0,0,,0,0,10.881,10.8810,...,,0,,0,3.633,1,0,0,3.563787,


In [204]:
mexico_gdp_list = gdp_data.loc[12:200, "Mexico"].tolist()

In [205]:
north_american_data.loc[237:425, "GDP"] = mexico_gdp_list

In [206]:
north_american_data[237:425]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Domestic_Debt_ Notes/Sources,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP
237,39.0,MEX,Mexico,1820.0,0,,0,0,0.9988,0.0000,...,,0,,0,,0,0,0,0.000000,5000.0
238,39.0,MEX,Mexico,1821.0,0,,0,0,0.9901,0.0000,...,,0,,0,-13.608,1,0,0,0.000000,
239,39.0,MEX,Mexico,1822.0,0,,0,0,0.9926,0.0000,...,,0,,0,,1,0,0,0.000000,
240,39.0,MEX,Mexico,1823.0,0,,0,0,0.995,0.0000,...,,0,,0,,1,0,0,0.000000,
241,39.0,MEX,Mexico,1824.0,0,,0,0,0.9911,0.0000,...,,0,,0,,1,0,0,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,39.0,MEX,Mexico,2003.0,0,,0,0,11.236,11.2360,...,,0,,0,4.551,1,0,0,3.405833,742508.0
421,39.0,MEX,Mexico,2004.0,0,,0,0,11.2648,11.2648,...,,0,,0,4.694,1,0,0,3.915833,772208.0
422,39.0,MEX,Mexico,2005.0,0,,0,0,10.7777,10.7777,...,,0,,0,3.99,1,0,0,3.559639,797691.0
423,39.0,MEX,Mexico,2006.0,0,,0,0,10.881,10.8810,...,,0,,0,3.633,1,0,0,3.563787,837576.0


In [207]:
# Aquire GDP from vertical file
gdp_url = 'http://www.ggdc.net/MADDISON/Historical_Statistics/vertical-file_02-2010.xls'
#cols = ['A', 'W', 'X', 'BK']
gdp_percapita_data = pd.read_excel(gdp_url, sheet_name='PerCapita GDP', usecols="A, W, X, BK")

In [208]:
gdp_percapita_data = gdp_percapita_data.rename(columns={"GDP per capita": "Year", "Unnamed: 22": "Canada", "Unnamed: 23": "USA", "Unnamed: 62": "Mexico"})

In [209]:
gdp_percapita_data[12:]

Unnamed: 0,Year,Canada,USA,Mexico
12,1820,904.412,1257.25,759.071
13,1821,,,
14,1822,,,
15,1823,,,
16,1824,,,
...,...,...,...,...
196,2004,23942.9,29845.2,7357.19
197,2005,24470,30481.4,7511.01
198,2006,24970.7,31004.5,7795.06
199,2007,25809.2,31357.4,7972.11


In [210]:
canada_gdp_percapita_list = gdp_percapita_data.loc[12:200, "Canada"].tolist()

In [211]:
north_american_data.loc[20:208, "GDP PerCapita"] = canada_gdp_percapita_list

In [212]:
north_american_data[20:208]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP,GDP PerCapita
20,9.0,CAN,Canada,1820.0,0,,0,0,0.2627,,...,0,,0,,0,0,0,0.000000,738.0,904.411765
21,9.0,CAN,Canada,1821.0,0,,0,0,0.2585,,...,0,,0,,0,0,0,0.000000,,
22,9.0,CAN,Canada,1822.0,0,,0,0,0.25,,...,0,,0,,0,0,0,0.000000,,
23,9.0,CAN,Canada,1823.0,0,,0,0,0.2551,,...,0,,0,,0,0,0,0.000000,,
24,9.0,CAN,Canada,1824.0,0,,0,0,0.2582,,...,0,,0,,0,0,0,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,9.0,CAN,Canada,2003.0,0,,0,0,1.2965,1.2924,...,0,,0,2.742,1,0,0,7.575000,746491.0,23409.043871
204,9.0,CAN,Canada,2004.0,0,,0,0,1.2028,1.2036,...,0,,0,1.841,1,0,0,7.191667,769405.0,23942.897153
205,9.0,CAN,Canada,2005.0,0,,0,0,1.1623,1.1645,...,0,,0,2.23,1,0,0,6.758333,792487.0,24470.048787
206,9.0,CAN,Canada,2006.0,0,,0,0,1.165,1.1653,...,0,,0,2.018,1,0,0,6.333333,815469.0,24970.726031


In [213]:
usa_gdp_percapita_list = gdp_percapita_data.loc[12:200, "USA"].tolist()

In [214]:
north_american_data.loc[454:642, "GDP PerCapita"] = usa_gdp_percapita_list

In [215]:
north_american_data[454:642]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP,GDP PerCapita
454,66.0,USA,United States,1820.0,0,,0,0,4.62321,,...,0,,0,-7.84157,1,0,0,0.000000,12548.0,1257.250356
455,66.0,USA,United States,1821.0,0,,0,0,4.92854,,...,0,,0,-3.90402,1,0,0,0.000000,,
456,66.0,USA,United States,1822.0,0,,0,0,4.98504,,...,0,,0,3.52719,1,0,0,0.000000,,
457,66.0,USA,United States,1823.0,0,,0,0,4.78927,,...,0,,0,-8.65493,1,0,0,0.000000,,
458,66.0,USA,United States,1824.0,0,,0,0,4.86145,,...,0,,0,-7.47805,1,0,0,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,66.0,USA,United States,2003.0,0,,0,0,1.785,,...,0,,0,2.298,1,1,0,6.061592,8431121.0,29074.234618
638,66.0,USA,United States,2004.0,0,,0,0,1.931,,...,0,,0,2.668,1,0,0,5.597952,8738865.0,29845.238827
639,66.0,USA,United States,2005.0,0,,0,0,1.722,,...,0,,0,3.366,1,0,0,5.143966,9009770.0,30481.353799
640,66.0,USA,United States,2006.0,0,,0,0,1.963,,...,0,,0,3.222,1,0,0,4.688935,9253034.0,31004.463179


In [216]:
mexico_gdp_percapita_list = gdp_percapita_data.loc[12:200, "Mexico"].tolist()

In [217]:
north_american_data.loc[237:425, "GDP PerCapita"] = mexico_gdp_percapita_list

In [218]:
north_american_data[237:425]

Unnamed: 0,Case,CC3,Country,Year,Banking Crisis,Banking_Crisis_Notes,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,...,Sovereign External Debt,Defaults_External_Notes,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP,GDP PerCapita
237,39.0,MEX,Mexico,1820.0,0,,0,0,0.9988,0.0000,...,0,,0,,0,0,0,0.000000,5000.0,759.070897
238,39.0,MEX,Mexico,1821.0,0,,0,0,0.9901,0.0000,...,0,,0,-13.608,1,0,0,0.000000,,
239,39.0,MEX,Mexico,1822.0,0,,0,0,0.9926,0.0000,...,0,,0,,1,0,0,0.000000,,
240,39.0,MEX,Mexico,1823.0,0,,0,0,0.995,0.0000,...,0,,0,,1,0,0,0.000000,,
241,39.0,MEX,Mexico,1824.0,0,,0,0,0.9911,0.0000,...,0,,0,,1,0,0,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,39.0,MEX,Mexico,2003.0,0,,0,0,11.236,11.2360,...,0,,0,4.551,1,0,0,3.405833,742508.0,7158.907385
421,39.0,MEX,Mexico,2004.0,0,,0,0,11.2648,11.2648,...,0,,0,4.694,1,0,0,3.915833,772208.0,7357.193093
422,39.0,MEX,Mexico,2005.0,0,,0,0,10.7777,10.7777,...,0,,0,3.99,1,0,0,3.559639,797691.0,7511.009374
423,39.0,MEX,Mexico,2006.0,0,,0,0,10.881,10.8810,...,0,,0,3.633,1,0,0,3.563787,837576.0,7795.064706


In [220]:
#checking dimensions of the data set
north_american_data.shape

(651, 25)

In [223]:
#Dropping some unnecessary text data from the dataset and creating a copy of it
df = north_american_data.drop(['Case','CC3','Banking_Crisis_Notes','national currency','exch_primary source code',
                          'exch_sources','Domestic_Debt_ Notes/Sources','Defaults_External_Notes'],axis=1)

In [224]:
df.columns

Index(['Country', 'Year', 'Banking Crisis ', 'Systemic Crisis',
       'Gold Standard', 'exch_usd', 'exch_usd_alt1',
       'Domestic_Debt_In_Default', 'Sovereign External Debt',
       'GDP_Weighted_default',
       'Inflation, Annual percentages of average consumer prices',
       'Independence', 'Currency Crises', 'Inflation Crises',
       'Unemployment Rate', 'GDP', 'GDP PerCapita'],
      dtype='object')

In [225]:
#Adding the new target column, Crisis, so that the models we build can be focused on only predicting if there was a crisis or not
df['Crisis'] = [0 for _ in range(651)]
for i in range(651):
    df.loc[i,'Crisis'] = 1 if (df.loc[i,'Banking Crisis '] or df.loc[i,'Currency Crises'] or df.loc[i,'Inflation Crises']
                      or df.loc[i,'Systemic Crisis']) else 0
df

Unnamed: 0,Country,Year,Banking Crisis,Systemic Crisis,Gold Standard,exch_usd,exch_usd_alt1,Domestic_Debt_In_Default,Sovereign External Debt,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Currency Crises,Inflation Crises,Unemployment Rate,GDP,GDP PerCapita,Crisis
0,Canada,1800.0,0,0,0,0.2441,,0,0,0,,0,0,0,0.000000,,,0
1,Canada,1801.0,0,0,0,0.2379,,0,0,0,,0,0,0,0.000000,,,0
2,Canada,1802.0,0,0,0,0.2553,,0,0,0,,0,0,0,0.000000,,,0
3,Canada,1803.0,0,0,0,0.2527,,0,0,0,,0,0,0,0.000000,,,0
4,Canada,1804.0,0,0,0,0.243,,0,0,0,,0,0,0,0.000000,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,United States,2012.0,0,0,0,1.6251,,0,0,0,2.078,1,0,0,8.165597,,,0
647,United States,2013.0,0,0,0,1.6556,,0,0,0,1.467,1,0,0,7.485224,,,0
648,United States,2014.0,0,0,0,1.5573,,0,0,0,1.61,1,0,0,6.255356,,,0
649,United States,2015.0,,0,0,1.4738,,0,0,0,0.118,1,0,,5.366365,,,1


In [226]:
df.isna().sum()

Country                                                       0
Year                                                          0
Banking Crisis                                                6
Systemic Crisis                                               4
Gold Standard                                                 0
exch_usd                                                     14
exch_usd_alt1                                               380
Domestic_Debt_In_Default                                      3
Sovereign External Debt                                       0
GDP_Weighted_default                                          0
Inflation, Annual percentages of average consumer prices     68
Independence                                                  0
Currency Crises                                               0
Inflation Crises                                              2
Unemployment Rate                                             0
GDP                                     

In [227]:
#Dropping the other crisis columns
df = df.drop(['Banking Crisis ','Systemic Crisis','Currency Crises','Inflation Crises'],axis=1)

In [228]:
df.isna().sum()

Country                                                       0
Year                                                          0
Gold Standard                                                 0
exch_usd                                                     14
exch_usd_alt1                                               380
Domestic_Debt_In_Default                                      3
Sovereign External Debt                                       0
GDP_Weighted_default                                          0
Inflation, Annual percentages of average consumer prices     68
Independence                                                  0
Unemployment Rate                                             0
GDP                                                         250
GDP PerCapita                                               250
Crisis                                                        0
dtype: int64

In [229]:
df.dtypes

Country                                                      object
Year                                                        float64
Gold Standard                                                object
exch_usd                                                     object
exch_usd_alt1                                               float64
Domestic_Debt_In_Default                                     object
Sovereign External Debt                                      object
GDP_Weighted_default                                         object
Inflation, Annual percentages of average consumer prices     object
Independence                                                 object
Unemployment Rate                                           float64
GDP                                                         float64
GDP PerCapita                                               float64
Crisis                                                        int64
dtype: object

In [230]:
#changing the exchange rate type to be float instead of string
df['exch_usd'] = df['exch_usd'].astype('float64')

In [231]:
#tried to use exch_usd_alt1 to fill in for some NaN values in exch_usd, but turns out those were also NaN for the rows we wanted to fix
for i in range(651):
    df.loc[i,'exch_usd'] = df.loc[i,'exch_usd_alt1'] if np.isnan(df.loc[i,'exch_usd']) else df.loc[i,'exch_usd']

df.isna().sum()

Country                                                       0
Year                                                          0
Gold Standard                                                 0
exch_usd                                                     14
exch_usd_alt1                                               380
Domestic_Debt_In_Default                                      3
Sovereign External Debt                                       0
GDP_Weighted_default                                          0
Inflation, Annual percentages of average consumer prices     68
Independence                                                  0
Unemployment Rate                                             0
GDP                                                         250
GDP PerCapita                                               250
Crisis                                                        0
dtype: int64

In [232]:
#Since I couldn't find the exchange rate for these 14 rows, so I dropped them
df.dropna(subset=['exch_usd'],inplace=True)
df

Unnamed: 0,Country,Year,Gold Standard,exch_usd,exch_usd_alt1,Domestic_Debt_In_Default,Sovereign External Debt,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Unemployment Rate,GDP,GDP PerCapita,Crisis
0,Canada,1800.0,0,0.2441,,0,0,0,,0,0.000000,,,0
1,Canada,1801.0,0,0.2379,,0,0,0,,0,0.000000,,,0
2,Canada,1802.0,0,0.2553,,0,0,0,,0,0.000000,,,0
3,Canada,1803.0,0,0.2527,,0,0,0,,0,0.000000,,,0
4,Canada,1804.0,0,0.2430,,0,0,0,,0,0.000000,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,United States,2012.0,0,1.6251,,0,0,0,2.078,1,8.165597,,,0
647,United States,2013.0,0,1.6556,,0,0,0,1.467,1,7.485224,,,0
648,United States,2014.0,0,1.5573,,0,0,0,1.61,1,6.255356,,,0
649,United States,2015.0,0,1.4738,,0,0,0,0.118,1,5.366365,,,1


In [234]:
#dropped the ech_usd_alt1 column since it wasn't really useful
df.drop(['exch_usd_alt1'],axis=1,inplace=True)

In [237]:
df.isna().sum()

Country                                                       0
Year                                                          0
Gold Standard                                                 0
exch_usd                                                      0
Domestic_Debt_In_Default                                      0
Sovereign External Debt                                       0
GDP_Weighted_default                                          0
Inflation, Annual percentages of average consumer prices     68
Independence                                                  0
Unemployment Rate                                             0
GDP                                                         233
GDP PerCapita                                               233
Crisis                                                        0
dtype: int64

In [238]:
#seeing which rows have the inflation data missing.
#canada is missing inflation values from 1800-1867
df[df['Inflation, Annual percentages of average consumer prices'].isnull()]

Unnamed: 0,Country,Year,Gold Standard,exch_usd,Domestic_Debt_In_Default,Sovereign External Debt,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Unemployment Rate,GDP,GDP PerCapita,Crisis
0,Canada,1800.0,0,0.2441,0,0,0,,0,0.0,,,0
1,Canada,1801.0,0,0.2379,0,0,0,,0,0.0,,,0
2,Canada,1802.0,0,0.2553,0,0,0,,0,0.0,,,0
3,Canada,1803.0,0,0.2527,0,0,0,,0,0.0,,,0
4,Canada,1804.0,0,0.2430,0,0,0,,0,0.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Canada,1863.0,1,0.6669,0,0,0,,0,0.0,,,0
64,Canada,1864.0,1,0.5575,0,0,0,,0,0.0,,,0
65,Canada,1865.0,1,0.6797,0,0,0,,0,0.0,,,0
66,Canada,1866.0,1,0.7851,0,0,0,,0,0.0,,,1


In [246]:
#trying to see for which years we don't have Mexico's GDP data (1814-1899,2009-2015)
df[df['GDP'].isnull()][df['Country']=='Mexico']['Year'].values

  df[df['GDP'].isnull()][df['Country']=='Mexico']['Year'].values


array([1814., 1815., 1816., 1817., 1818., 1819., 1821., 1822., 1823.,
       1824., 1825., 1826., 1827., 1828., 1829., 1830., 1831., 1832.,
       1833., 1834., 1835., 1836., 1837., 1838., 1839., 1840., 1841.,
       1842., 1843., 1844., 1845., 1846., 1847., 1848., 1849., 1850.,
       1851., 1852., 1853., 1854., 1855., 1856., 1857., 1858., 1859.,
       1860., 1861., 1862., 1863., 1864., 1865., 1866., 1867., 1868.,
       1869., 1871., 1872., 1873., 1874., 1875., 1876., 1877., 1878.,
       1879., 1880., 1881., 1882., 1883., 1884., 1885., 1886., 1887.,
       1888., 1889., 1891., 1892., 1893., 1894., 1896., 1897., 1898.,
       1899., 2009., 2010., 2011., 2012., 2013., 2014., 2015.])

In [247]:
#trying to see for which years we don't have Canada's GDP data (1800-1869,2009-2015)
df[df['GDP'].isnull()][df['Country']=='Canada']['Year'].values

  df[df['GDP'].isnull()][df['Country']=='Canada']['Year'].values


array([1800., 1801., 1802., 1803., 1804., 1805., 1806., 1807., 1808.,
       1809., 1810., 1811., 1812., 1813., 1814., 1815., 1816., 1817.,
       1818., 1819., 1821., 1822., 1823., 1824., 1825., 1826., 1827.,
       1828., 1829., 1831., 1832., 1833., 1834., 1835., 1836., 1837.,
       1838., 1839., 1841., 1842., 1843., 1844., 1845., 1846., 1847.,
       1848., 1849., 1851., 1852., 1853., 1854., 1855., 1856., 1857.,
       1858., 1859., 1861., 1862., 1863., 1864., 1865., 1866., 1867.,
       1868., 1869., 2009., 2010., 2011., 2012., 2013., 2014., 2015.])

In [248]:
#trying to see for which years we don't have USA's GDP data (1800-1869,2009-2015)
df[df['GDP'].isnull()][df['Country']=='United States']['Year'].values

  df[df['GDP'].isnull()][df['Country']=='United States']['Year'].values


array([1800., 1801., 1802., 1803., 1804., 1805., 1806., 1807., 1808.,
       1809., 1810., 1811., 1812., 1813., 1814., 1815., 1816., 1817.,
       1818., 1819., 1821., 1822., 1823., 1824., 1825., 1826., 1827.,
       1828., 1829., 1831., 1832., 1833., 1834., 1835., 1836., 1837.,
       1838., 1839., 1841., 1842., 1843., 1844., 1845., 1846., 1847.,
       1848., 1849., 1851., 1852., 1853., 1854., 1855., 1856., 1857.,
       1858., 1859., 1861., 1862., 1863., 1864., 1865., 1866., 1867.,
       1868., 1869., 2009., 2010., 2011., 2012., 2013., 2014., 2015.])

In [257]:
#finding out which rows correspond to USA, to see where to update the GDP values for the years 2009-2015
df[df['GDP'].isnull()][df['Country']=='United States']

  df[df['GDP'].isnull()][df['Country']=='United States']


Unnamed: 0,Country,Year,Gold Standard,exch_usd,Domestic_Debt_In_Default,Sovereign External Debt,GDP_Weighted_default,"Inflation, Annual percentages of average consumer prices",Independence,Unemployment Rate,GDP,GDP PerCapita,Crisis
434,United States,1800.0,0,4.462294,0,0,0,2.02703,1,0.000000,,,0
435,United States,1801.0,0,4.363002,0,0,0,1.3245,1,0.000000,,,0
436,United States,1802.0,0,4.474273,0,0,0,-15.6863,1,0.000000,,,0
437,United States,1803.0,0,4.662005,0,0,0,5.42636,1,0.000000,,,0
438,United States,1804.0,0,4.528986,0,0,0,4.41176,1,0.000000,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,United States,2011.0,0,1.553500,0,0,0,3.142,1,9.066351,,,0
646,United States,2012.0,0,1.625100,0,0,0,2.078,1,8.165597,,,0
647,United States,2013.0,0,1.655600,0,0,0,1.467,1,7.485224,,,0
648,United States,2014.0,0,1.557300,0,0,0,1.61,1,6.255356,,,0


In [254]:
#updating Canada's GDP from 2009-2015
df.loc[215,'GDP'] = 1556.13
df.loc[214, 'GDP'] = 1803.53
df.loc[213,'GDP'] = 1847.21
df.loc[212,'GDP'] = 1828.69
df.loc[211,'GDP'] = 1788.65
df.loc[210,'GDP'] = 1613.46
df.loc[209,'GDP'] = 1371.15

In [256]:
#updating Mexico's GDP from 2009-2015
df.loc[432,'GDP'] = 1170.56
df.loc[431,'GDP'] = 1314.56
df.loc[430,'GDP'] = 1274.44
df.loc[429,'GDP'] = 1201.09
df.loc[428,'GDP'] = 1180.49
df.loc[427,'GDP'] = 1057.8
df.loc[426,'GDP'] = 900.05

In [258]:
#updating USA's GDP from 2009-2015
df.loc[649,'GDP'] = 18238.301
df.loc[648,'GDP'] = 17527.258
df.loc[647,'GDP'] = 16784.851
df.loc[646,'GDP'] = 16197.007
df.loc[645,'GDP'] = 15542.582
df.loc[644,'GDP'] = 14992.052
df.loc[643,'GDP'] = 14448.93

In [259]:
df.isna().sum()

Country                                                       0
Year                                                          0
Gold Standard                                                 0
exch_usd                                                      0
Domestic_Debt_In_Default                                      0
Sovereign External Debt                                       0
GDP_Weighted_default                                          0
Inflation, Annual percentages of average consumer prices     68
Independence                                                  0
Unemployment Rate                                             0
GDP                                                         212
GDP PerCapita                                               233
Crisis                                                        0
dtype: int64

In [279]:
#writing this somewhat cleaned data to a CSV file
df.to_csv('somewhat_cleaned.csv')