In [33]:
import pandas as pd
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

In [34]:
files = !ls data
frames = [pd.ExcelFile(f'data/{i}').parse().ix[1:] for i in files]
data = pd.concat(frames)

In [35]:
# Preview: first row of the combined raw frame.
data.head(1)

Unnamed: 0,Country,Region,Sub Region,Road Maintenance,Road Network,Road maintenance per network,Net Energy Imports,Trade as a % of GDP,Access to Electricity,Quality of Electricity Supply,...,FDI inflows,RnD Expenditure as a % of GDP,Payments for IP use,Charges for IP use,Charges for IP use as % GDP,Hi Tech Exports as % of manufactured exports,GDP Growth,GDP Growth per capita,Population,GDP
1,Afghanistan,South Asia,South Asia,,42150,0,,,81.1357,,...,,,,,,,,,18034130,


In [36]:
# Discard the columns that are not used as data further down; everything
# else is carried over unchanged.
dropped_columns = ['Country', 'Region', 'Sub Region', 'Proximity to conflict ']
data_no_text = data.drop(labels=dropped_columns, axis='columns')


In [37]:
# Keep only the rows that have a GDP value. The mask is built from the
# original frame, which has the same row index as `data_no_text`.
has_gdp = data['GDP'].notnull()
data_no_nan_row = data_no_text[has_gdp]

In [38]:
# Impute: each remaining gap is replaced by the mean of its own column.
column_means = data_no_nan_row.mean()
data_full = data_no_nan_row.fillna(column_means)

In [39]:
# Mean-normalise column by column: subtract the column mean, then divide by
# the column's spread (max - min).
centered = data_full - data_full.mean()
value_range = data_full.max() - data_full.min()
df_norm = centered / value_range

In [40]:
# Preview: first row of the fully preprocessed frame.
df_norm.head(1)

Unnamed: 0,Road Maintenance,Road Network,Road maintenance per network,Net Energy Imports,Trade as a % of GDP,Access to Electricity,Quality of Electricity Supply,Employment to Population Ratio,Labor Productivity per Person Employed,Human Development Index,...,FDI inflows,RnD Expenditure as a % of GDP,Payments for IP use,Charges for IP use,Charges for IP use as % GDP,Hi Tech Exports as % of manufactured exports,GDP Growth,GDP Growth per capita,Population,GDP
2,-0.062951,-0.027152,-0.006941,0.027489,-0.081668,0.260117,-1.324477e-14,-0.094036,-0.156948,-0.011923,...,2.8568130000000005e-17,1.140136e-15,1.11393e-17,1.08297e-16,-0.009226,-0.129431,0.078597,0.113948,-0.023219,-0.015822


In [41]:
# Column names in order; the feature split below selects columns by position.
[column for column in df_norm.columns]

['Road Maintenance',
 'Road Network',
 'Road maintenance per network',
 'Net Energy Imports',
 'Trade as a % of GDP',
 'Access to Electricity',
 'Quality of Electricity Supply',
 'Employment to Population Ratio',
 'Labor Productivity per Person Employed',
 'Human Development Index',
 'Population Growth ',
 'Civil Liberty and Political Freedom',
 'Youth Unemployment',
 'Media Freedom',
 'Corruption Perceptions Index ',
 ' Journal Articles ',
 'Journal Articles per capita',
 'FDI inflows ',
 ' RnD Expenditure as a % of GDP',
 'Payments for IP use',
 'Charges for IP use',
 'Charges for IP use as % GDP',
 'Hi Tech Exports as % of manufactured exports',
 'GDP Growth',
 'GDP Growth per capita',
 'Population',
 'GDP']

In [47]:
# Build the feature matrices and the target vector from the normalised frame.
# Columns are picked by position; the names below come from the column
# listing of `df_norm`.

# Journal Articles, Journal Articles per capita, FDI inflows, RnD Expenditure,
# Payments for IP use, Charges for IP use, Hi Tech Exports (position 21,
# 'Charges for IP use as % GDP', is skipped).
tlf_positions = [15, 16, 17, 18, 19, 20, 22]
# Road maintenance per network, Trade as a % of GDP, Quality of Electricity
# Supply, Employment to Population Ratio, Labor Productivity, HDI.
esf_positions = [2, 4, 6, 7, 8, 9]
# Population Growth, Civil Liberty and Political Freedom, Youth Unemployment,
# Media Freedom, Corruption Perceptions Index.
psf_positions = [10, 11, 12, 13, 14]

# Target: the normalised GDP column.
y = df_norm['GDP'].values.astype(float)

# First 23 columns, i.e. everything before 'GDP Growth' (the last four
# columns -- GDP Growth, GDP Growth per capita, Population, GDP -- are left out).
X = df_norm.iloc[:, :23].values.astype(float)
X_TLF = df_norm.iloc[:, tlf_positions].values.astype(float)
X_ESF = df_norm.iloc[:, esf_positions].values.astype(float)
X_PSF = df_norm.iloc[:, psf_positions].values.astype(float)

In [48]:
# Bayesian Ridge: mean cross-validated score for each feature set.
estimator = linear_model.BayesianRidge()

# Each feature set is evaluated exactly once (the original scored and printed
# X_ESF twice through a copy-pasted pair of lines).
feature_sets = [("X", X), ("X_TLF", X_TLF), ("X_ESF", X_ESF), ("X_PSF", X_PSF)]

for label, features in feature_sets:
    # cv=3 pins the fold count: scikit-learn's default moved from 3 to 5 folds
    # in version 0.22, so relying on the default makes scores version-dependent.
    score = cross_val_score(estimator, features, y, cv=3).mean()
    print("%s Score = %.2f" % (label, score))

X Score = 0.93
X_TLF Score = 0.92
X_ESF Score = 0.05
X_ESF Score = 0.05
X_PSF Score = -0.06


In [49]:
# Random forest, 3-fold cross validation: mean score for each feature set.
# random_state is fixed so the scores are reproducible from run to run; an
# unseeded forest yields different numbers on every execution.
estimator = RandomForestRegressor(random_state=0)

# Each feature set is evaluated exactly once (the original scored and printed
# X_ESF twice through a copy-pasted pair of lines).
feature_sets = [("X", X), ("X_TLF", X_TLF), ("X_ESF", X_ESF), ("X_PSF", X_PSF)]

for label, features in feature_sets:
    # cv=3 makes the 3-fold split explicit instead of depending on the library
    # default, which changed from 3 to 5 folds in scikit-learn 0.22.
    score = cross_val_score(estimator, features, y, cv=3).mean()
    print("%s Score = %.2f" % (label, score))

X Score = 0.83
X_TLF Score = 0.86
X_ESF Score = 0.48
X_ESF Score = 0.48
X_PSF Score = -0.67


In [None]:

# PCA followed by ordinary linear regression, scored on the full feature matrix.
pca = decomposition.PCA()
linear = linear_model.LinearRegression()
# The pipeline refits PCA inside every training fold before the regression.
pipe = Pipeline(steps=[('pca', pca), ('linear', linear)])

# cv=3 pins the fold count rather than depending on the library default
# (3 folds before scikit-learn 0.22, 5 folds from 0.22 on).
score = cross_val_score(pipe, X, y, cv=3).mean()
print("Score = %.2f" % score)

In [45]:
# Manual sanity check: fit on the rows before `holdout`, then show the
# prediction for row `holdout` next to its actual (normalised) GDP value.
holdout = 928
estimator = linear_model.BayesianRidge()
estimator.fit(X[:holdout], y[:holdout])
# predict() expects a 2-D (n_samples, n_features) array. `X[928]` is 1-D and
# is rejected by current scikit-learn, so the row is taken as a one-row slice.
estimator.predict(X[holdout:holdout + 1]), y[holdout]



(array([-0.04303388]), -0.015161349071091344)