In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory holding the GreenLeaders CSV exports. Change this single constant to
# run the cell on another machine instead of editing each path below.
DATA_DIR = "/Users/FelixHoffmann/Desktop/GreenLeaders"

# Labelled sample used to fit the model; the first CSV column becomes the row index.
training_frame = pd.read_csv(f"{DATA_DIR}/Final_Training_Data_65515.csv", index_col=0)
# New observations to be scored, loaded with the same index convention.
application_frame = pd.read_csv(f"{DATA_DIR}/Final_Unlabelled_Data_7729.csv", index_col=0)

In [3]:
import imblearn
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [8]:
# Choose F2-leader as final pipeline (determined in Grid_Search.ipynb)
# SMOTE and RandomUnderSampler both draw random samples; a fixed seed makes the
# fitted model, and therefore the predictions below, repeatable across re-runs.
SEED = 42
TARGET = 'GreenLeaderBinary'

pipeline = imblearn.pipeline.Pipeline(steps=[
    ('t1', RobustScaler()),
    ('over', SMOTE(random_state=SEED)),
    ('under', RandomUnderSampler(random_state=SEED)),
    ('m', QuadraticDiscriminantAnalysis()),
])

# get y and X for training and new application
feature_frame = training_frame.drop(columns=[TARGET])
y_train = np.array(training_frame[TARGET])
X_train = np.array(feature_frame)

# Select the application features by the training column names so both matrices
# share the same column order; a missing feature raises a KeyError here rather
# than producing predictions from misaligned columns.
X_apply = np.array(application_frame[feature_frame.columns])

# fit pipeline on full labelled sample (use all available information)
pipeline.fit(X_train, y_train)

# predict labels for new observations
y_apply = pipeline.predict(X_apply)

# in sample predictions
y_insample = pipeline.predict(X_train)


## Choropleth for visualization on country level

In [11]:
# Attach each observation's country, looked up by row index in the full dataset

full_frame_countries = pd.read_csv(
    "/Users/FelixHoffmann/Desktop/GreenLeaders/FullDataset.csv",
    index_col=0,
)
country_column = full_frame_countries['Country']

# Left joins keep every row of the modelling frames, matched on the index
training_frame_incl_country = training_frame.join(country_column, how='left')
application_frame_incl_country = application_frame.join(country_column, how='left')

In [30]:
# Store the model output next to the features of each observation
training_frame_incl_country['Prediction'] = y_insample
application_frame_incl_country['Prediction'] = y_apply

# Stack both samples into a single frame with all observations, country, predictions
full_frame = pd.concat([application_frame_incl_country, training_frame_incl_country])

# Relabel the four UK constituent countries as GreatBritain; every other value is kept
uk_nations = ['England', 'Scotland', 'Wales', 'NorthernIreland']
full_frame['Country'] = np.where(full_frame['Country'].isin(uk_nations),
                                 'GreatBritain', full_frame['Country'])



In [31]:
# get averages by countries
# numeric_only=True restricts the aggregation to numeric columns; pandas 2.x no
# longer drops non-numeric columns by default and raises a TypeError instead.
country_grouping = full_frame.groupby('Country').mean(numeric_only=True)

In [28]:
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import plotly.graph_objs as go 

# Set up plotly's offline notebook rendering so iplot() can draw figures inline.
# NOTE(review): per the plotly offline docs, connected=True loads plotly.js from
# a CDN, so viewing the figure presumably needs network access — confirm.
init_notebook_mode(connected=True)

In [32]:
# Trace definition: one region per country, coloured by its mean prediction
data = {
    'type': 'choropleth',
    'colorscale': 'Portland',
    'locations': country_grouping.index,
    'locationmode': 'country names',
    'z': country_grouping['Prediction'],
    # white borders of width 1 between countries
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 1}},
    'colorbar': dict(title='Proportion'),
}

# Figure-level settings: title and a map restricted to Europe
layout = {
    'title': "Predicted Proportion of GreenLeaders",
    'geo': {'scope': 'europe'},
}

choromap = go.Figure(data=[data], layout=layout)

# render inline in the notebook
iplot(choromap)