### Load Data

In [1]:
#%matplotlib inline

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Files to Load 
MPI = "MPI_USInflowLPRs.csv"
UNODC = "UNODC_Homicides.csv"
CEPII = "CEPII_Distance.csv"
WEOGDP = "WEO_GDP.csv"
WEOPOP = "WEO_Population.csv"
WEOURATE = "WEO_Unemployment.csv"

# Read Data
df_Inflows = pd.read_csv(MPI)
df_Homicides = pd.read_csv(UNODC)
df_Distance = pd.read_csv(CEPII)
df_Income = pd.read_csv(WEOGDP)
df_Population = pd.read_csv(WEOPOP)
df_Unemployment = pd.read_csv(WEOURATE)

### Reshape Data

In [2]:
# Reshape data from wide to long
df_Inflows_Long = pd.melt(df_Inflows,id_vars=['Country','ISO'],var_name='Year', value_name='Inflows')
df_Homicides_Long = pd.melt(df_Homicides,id_vars=['Country','ISO'],var_name='Year', value_name='Homicides')
df_Income_Long = pd.melt(df_Income,id_vars=['Country','ISO'],var_name='Year', value_name='NGDPPC')
df_Population_Long = pd.melt(df_Population,id_vars=['Country','ISO'],var_name='Year', value_name='POP')
df_Unemployment_Long = pd.melt(df_Unemployment,id_vars=['Country','ISO'],var_name='Year', value_name='URATE')

In [3]:
# Convert Year to integer
df_Inflows_Long['Year'] = df_Inflows_Long['Year'].astype(np.int)
df_Homicides_Long['Year'] = df_Homicides_Long['Year'].astype(np.int)
df_Income_Long['Year'] = df_Income_Long['Year'].astype(np.int)
df_Population_Long['Year'] = df_Population_Long['Year'].astype(np.int)
df_Unemployment_Long['Year'] = df_Unemployment_Long['Year'].astype(np.int)

### US Data

In [4]:
# Distance between capitals
df_Dist_US = df_Distance[(df_Distance['iso_d'] == 'USA')].reset_index(drop=True)
df_Dist_US = df_Dist_US[['iso_o','contig','distcap']]
df_Dist_US.columns = ['ISO','Contiguous','Distance'] 

# US data
df_US_Inc = df_Income_Long[(df_Income_Long['ISO'] == 'USA')].reset_index(drop=True)
df_US_Pop = df_Population_Long[(df_Population_Long['ISO'] == 'USA')].reset_index(drop=True)
df_US_Unemp = df_Unemployment_Long[(df_Unemployment_Long['ISO'] == 'USA')].reset_index(drop=True)
df_US_Crime = df_Homicides_Long[(df_Homicides_Long['ISO'] == 'USA')].reset_index(drop=True)

# Keep selected columns for US data
df_US_Inc = df_US_Inc[['Year','NGDPPC']]
df_US_Pop = df_US_Pop[['Year','POP']]
df_US_Unemp = df_US_Unemp[['Year','URATE']]
df_US_Crime = df_US_Crime[['Year','Homicides']]

# Rename columns for US data
df_US_Inc.columns = ['Year','USNGDPPC'] 
df_US_Pop.columns = ['Year','USPOP'] 
df_US_Unemp.columns = ['Year','USURATE'] 
df_US_Crime.columns = ['Year','USHomicides'] 

# Merge US data
df_US = pd.merge(df_US_Inc, df_US_Pop, how='left', on='Year')
df_US = pd.merge(df_US, df_US_Unemp, how='left', on='Year')
df_US = pd.merge(df_US, df_US_Crime, how='left', on='Year')

### Merge Data

In [5]:
# Merge all data
df_Long = pd.merge(df_Inflows_Long, df_Homicides_Long, how='left', on=('Country','ISO','Year'))
df_Long = pd.merge(df_Long, df_Income_Long, how='left', on=('Country','ISO','Year'))
df_Long = pd.merge(df_Long, df_Unemployment_Long, how='left', on=('Country','ISO','Year'))
df_Long = pd.merge(df_Long, df_Population_Long, how='left', on=('Country','ISO','Year'))
df_Long = pd.merge(df_Long, df_Dist_US, how='left', on='ISO')
df_Long = pd.merge(df_Long, df_US, how='left', on='Year')

# Display the data frame for preview
df_Long.head()

Unnamed: 0,Country,ISO,Year,Inflows,Homicides,NGDPPC,URATE,POP,Contiguous,Distance,USNGDPPC,USPOP,USURATE,USHomicides
0,Afghanistan,AFG,1999,877.0,,,,,0.0,11155.07,34494.539,279.195,4.217,15522.0
1,Albania,ALB,1999,3695.0,532.0,1032.264,18.4,3.109,0.0,7770.42,34494.539,279.195,4.217,15522.0
2,Algeria,DZA,1999,789.0,,1630.071,29.293,29.965,0.0,6792.216,34494.539,279.195,4.217,15522.0
3,American Samoa,,1999,11.0,,,,,,,34494.539,279.195,4.217,15522.0
4,Angola,AGO,1999,57.0,,452.677,,16.625,0.0,10653.89,34494.539,279.195,4.217,15522.0


In [6]:
# Take Natural log
df_Long['Ln_Inflows'] = np.log(df_Long['Inflows'])
df_Long['Ln_Homicides'] = np.log(df_Long['Homicides'])
df_Long['Ln_Income'] = np.log(df_Long['NGDPPC'])
df_Long['Ln_Unemp'] = np.log(df_Long['URATE'])
df_Long['Ln_Pop'] = np.log(df_Long['POP'])
df_Long['Ln_Distance'] = np.log(df_Long['Distance'])

df_Long['Ln_USHomicides'] = np.log(df_Long['USHomicides'])
df_Long['Ln_USIncome'] = np.log(df_Long['USNGDPPC'])
df_Long['Ln_USUnemp'] = np.log(df_Long['USURATE'])
df_Long['Ln_USPop'] = np.log(df_Long['USPOP'])

In [7]:
#df_Long.head()

In [8]:
# List countries
df_Long = df_Long.sort_values(['Country','Year'])
df_Long.Country.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Angola',
       'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Cayman Islands', 'Central African Republic', 'Chad',
       'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cuba', 'Curacao', 'Cyprus',
       'Czech Republic', 'Democratic Republic of the Congo', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'French Guiana', 'French

In [9]:
# Count number of countries left in the dataset
len(df_Long['Country'].unique())

207

In [10]:
# Number of non-missing values of each column
df_Long.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3952 entries, 0 to 3951
Data columns (total 24 columns):
Country           3952 non-null object
ISO               3534 non-null object
Year              3952 non-null int64
Inflows           3774 non-null float64
Homicides         2378 non-null float64
NGDPPC            3483 non-null float64
URATE             2017 non-null float64
POP               3484 non-null float64
Contiguous        3401 non-null float64
Distance          3401 non-null float64
USNGDPPC          3952 non-null float64
USPOP             3952 non-null float64
USURATE           3952 non-null float64
USHomicides       3952 non-null float64
Ln_Inflows        3774 non-null float64
Ln_Homicides      2378 non-null float64
Ln_Income         3483 non-null float64
Ln_Unemp          2017 non-null float64
Ln_Pop            3484 non-null float64
Ln_Distance       3401 non-null float64
Ln_USHomicides    3952 non-null float64
Ln_USIncome       3952 non-null float64
Ln_USUnemp       

In [11]:
# Number of missing values for each column
df_Long.isnull().sum()

Country              0
ISO                418
Year                 0
Inflows            178
Homicides         1574
NGDPPC             469
URATE             1935
POP                468
Contiguous         551
Distance           551
USNGDPPC             0
USPOP                0
USURATE              0
USHomicides          0
Ln_Inflows         178
Ln_Homicides      1574
Ln_Income          469
Ln_Unemp          1935
Ln_Pop             468
Ln_Distance        551
Ln_USHomicides       0
Ln_USIncome          0
Ln_USUnemp           0
Ln_USPop             0
dtype: int64

In [12]:
# Percentage of missing values for each column
df_Long.isnull().mean().round(4) * 100

Country            0.00
ISO               10.58
Year               0.00
Inflows            4.50
Homicides         39.83
NGDPPC            11.87
URATE             48.96
POP               11.84
Contiguous        13.94
Distance          13.94
USNGDPPC           0.00
USPOP              0.00
USURATE            0.00
USHomicides        0.00
Ln_Inflows         4.50
Ln_Homicides      39.83
Ln_Income         11.87
Ln_Unemp          48.96
Ln_Pop            11.84
Ln_Distance       13.94
Ln_USHomicides     0.00
Ln_USIncome        0.00
Ln_USUnemp         0.00
Ln_USPop           0.00
dtype: float64

### Clean Data

In [13]:
# Remove rows with NAs for Homicides
df_clean = df_Long.dropna(subset=['Ln_Inflows'], how='any')
df_clean = df_Long.dropna(subset=['Homicides'], how='any')

# Percentage of missing values for each column
df_clean.isnull().mean().round(4) * 100

Country            0.00
ISO                0.00
Year               0.00
Inflows            0.97
Homicides          0.00
NGDPPC             0.08
URATE             27.04
POP                0.08
Contiguous         3.41
Distance           3.41
USNGDPPC           0.00
USPOP              0.00
USURATE            0.00
USHomicides        0.00
Ln_Inflows         0.97
Ln_Homicides       0.00
Ln_Income          0.08
Ln_Unemp          27.04
Ln_Pop             0.08
Ln_Distance        3.41
Ln_USHomicides     0.00
Ln_USIncome        0.00
Ln_USUnemp         0.00
Ln_USPop           0.00
dtype: float64

In [14]:
# Count number of countries left in the dataset
len(df_clean['Country'].unique())

162

In [15]:
# List of countries
df_clean = df_clean.sort_values(['Country','Year'])
df_clean.Country.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia',
       'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia',
       'Cameroon', 'Canada', 'Central African Republic', 'Chile', 'China',
       'Colombia', 'Costa Rica', 'Croatia', 'Cyprus', 'Denmark',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Estonia', 'Eswatini', 'Fiji', 'Finland', 'France', 'Georgia',
       'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala',
       'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong SAR',
       'Hungary', 'Iceland', 'India', 'Indonesia', 'Iraq', 'Ireland',
       'Islamic Republic of Iran', 'Israel', 'Italy', 'Jamaica', 'Japan',
       'Jordan', 'Kaz

In [16]:
# Summary Statistics
df_clean.describe()

Unnamed: 0,Year,Inflows,Homicides,NGDPPC,URATE,POP,Contiguous,Distance,USNGDPPC,USPOP,...,Ln_Inflows,Ln_Homicides,Ln_Income,Ln_Unemp,Ln_Pop,Ln_Distance,Ln_USHomicides,Ln_USIncome,Ln_USUnemp,Ln_USPop
count,2378.0,2355.0,2378.0,2376.0,1735.0,2376.0,2297.0,2297.0,2378.0,2378.0,...,2355.0,2378.0,2376.0,1735.0,2376.0,2297.0,2378.0,2378.0,2378.0,2378.0
mean,2008.023549,6779.284926,2424.586627,14580.359832,8.586199,44.031398,0.016543,8325.12817,47012.79259,303.736462,...,-inf,-inf,8.692331,1.9706,1.925797,8.884487,9.679818,10.746649,1.778002,5.715191
std,5.185766,18034.159581,7297.969866,18968.775959,5.35563,158.313745,0.12758,3858.466693,7034.233849,13.331977,...,,,1.471631,0.618631,2.027975,0.590591,0.075079,0.153156,0.27191,0.044117
min,1999.0,0.0,0.0,159.417,0.488,0.045,0.0,737.0425,34494.539,279.195,...,-inf,-inf,5.071523,-0.71744,-3.101093,6.602646,9.558459,10.448556,1.37801,5.63191
25%,2004.0,455.0,57.25,1997.0965,4.972,2.318,0.0,5901.343,41629.858,293.389,...,6.120297,4.047399,7.599449,1.603822,0.840702,8.682935,9.606159,10.636573,1.556459,5.681499
50%,2008.0,1377.0,254.5,5845.675,7.3,7.516,0.0,7770.42,47869.24,304.718,...,7.227662,5.539299,8.673457,1.987874,2.017034,8.958079,9.689551,10.776228,1.754923,5.719387
75%,2012.0,5336.0,878.75,20905.58325,10.892,27.2865,0.0,11448.37,51556.171,314.163,...,8.582232,6.7785,9.947772,2.388028,3.306392,9.345603,9.725556,10.850427,1.995788,5.749912
max,2017.0,218822.0,63895.0,120449.605,38.4,1390.08,1.0,16371.12,59894.998,325.326,...,12.296014,11.064996,11.698987,3.648057,7.237117,9.703274,9.85393,11.000348,2.262596,5.784828


In [17]:
# Replace Inf, -Inf with NaN, then delete NaN
df_clean['Ln_Inflows'] = df_clean['Ln_Inflows'][df_clean['Ln_Inflows'].replace([np.inf, -np.inf], np.nan).notnull()]
df_clean['Ln_Homicides'] = df_clean['Ln_Homicides'][df_clean['Ln_Homicides'].replace([np.inf, -np.inf], np.nan).notnull()]
df_clean.describe()

Unnamed: 0,Year,Inflows,Homicides,NGDPPC,URATE,POP,Contiguous,Distance,USNGDPPC,USPOP,...,Ln_Inflows,Ln_Homicides,Ln_Income,Ln_Unemp,Ln_Pop,Ln_Distance,Ln_USHomicides,Ln_USIncome,Ln_USUnemp,Ln_USPop
count,2378.0,2355.0,2378.0,2376.0,1735.0,2376.0,2297.0,2297.0,2378.0,2378.0,...,2353.0,2370.0,2376.0,1735.0,2376.0,2297.0,2378.0,2378.0,2378.0,2378.0
mean,2008.023549,6779.284926,2424.586627,14580.359832,8.586199,44.031398,0.016543,8325.12817,47012.79259,303.736462,...,7.214817,5.519236,8.692331,1.9706,1.925797,8.884487,9.679818,10.746649,1.778002,5.715191
std,5.185766,18034.159581,7297.969866,18968.775959,5.35563,158.313745,0.12758,3858.466693,7034.233849,13.331977,...,1.982076,2.226015,1.471631,0.618631,2.027975,0.590591,0.075079,0.153156,0.27191,0.044117
min,1999.0,0.0,0.0,159.417,0.488,0.045,0.0,737.0425,34494.539,279.195,...,1.098612,0.0,5.071523,-0.71744,-3.101093,6.602646,9.558459,10.448556,1.37801,5.63191
25%,2004.0,455.0,57.25,1997.0965,4.972,2.318,0.0,5901.343,41629.858,293.389,...,6.120297,4.077537,7.599449,1.603822,0.840702,8.682935,9.606159,10.636573,1.556459,5.681499
50%,2008.0,1377.0,254.5,5845.675,7.3,7.516,0.0,7770.42,47869.24,304.718,...,7.229114,5.541264,8.673457,1.987874,2.017034,8.958079,9.689551,10.776228,1.754923,5.719387
75%,2012.0,5336.0,878.75,20905.58325,10.892,27.2865,0.0,11448.37,51556.171,314.163,...,8.582419,6.778785,9.947772,2.388028,3.306392,9.345603,9.725556,10.850427,1.995788,5.749912
max,2017.0,218822.0,63895.0,120449.605,38.4,1390.08,1.0,16371.12,59894.998,325.326,...,12.296014,11.064996,11.698987,3.648057,7.237117,9.703274,9.85393,11.000348,2.262596,5.784828


In [18]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378 entries, 2080 to 2911
Data columns (total 24 columns):
Country           2378 non-null object
ISO               2378 non-null object
Year              2378 non-null int64
Inflows           2355 non-null float64
Homicides         2378 non-null float64
NGDPPC            2376 non-null float64
URATE             1735 non-null float64
POP               2376 non-null float64
Contiguous        2297 non-null float64
Distance          2297 non-null float64
USNGDPPC          2378 non-null float64
USPOP             2378 non-null float64
USURATE           2378 non-null float64
USHomicides       2378 non-null float64
Ln_Inflows        2353 non-null float64
Ln_Homicides      2370 non-null float64
Ln_Income         2376 non-null float64
Ln_Unemp          1735 non-null float64
Ln_Pop            2376 non-null float64
Ln_Distance       2297 non-null float64
Ln_USHomicides    2378 non-null float64
Ln_USIncome       2378 non-null float64
Ln_USUnemp    

In [19]:
len(df_clean['Country'].unique())

162

In [20]:
# List of countries
df_clean = df_clean.sort_values(['Country','Year'])
df_clean.Country.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia',
       'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia',
       'Cameroon', 'Canada', 'Central African Republic', 'Chile', 'China',
       'Colombia', 'Costa Rica', 'Croatia', 'Cyprus', 'Denmark',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Estonia', 'Eswatini', 'Fiji', 'Finland', 'France', 'Georgia',
       'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala',
       'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hong Kong SAR',
       'Hungary', 'Iceland', 'India', 'Indonesia', 'Iraq', 'Ireland',
       'Islamic Republic of Iran', 'Israel', 'Italy', 'Jamaica', 'Japan',
       'Jordan', 'Kaz

### Save Data

In [21]:
# Export city data into a .csv
df_Long.to_csv("Working_Data_Long.csv", index=False)
df_clean.to_csv("Working_Data_Clean.csv", index=False)

In [23]:
len(df_Long['Country'].unique())

207