In [1]:
# this workbook demonstrates how to use scikit-learn with PySpark to do single variable binary regression
# 
# it's here as an example of how to switch from SparkML to scikit-learn if you'd like to do your 
# data pipeline and formatting in spark, but switch to pandas and scikit-learn for
# building, fitting, and interpreting your model
# 
# this can be a good approach when the heavy lifting that benefits from the cluster is mainly
# limited to data preparation, but your ML model itself is relatively small and fits neatly into
# memory and/or runs on a single node, and you'd prefer to use pandas scikit-learn. 
#

In [2]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import is_string_dtype

In [3]:
# Read the gapminder CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="gapminder_all_binary.csv")

In [4]:
# Label encoder instance used by the string-column encoding loop in the next cell.
LE = LabelEncoder()

In [5]:
# Replace every string-typed column of `data` with the integer codes produced by LE.
# fit_transform refits LE on each call, so once the loop finishes LE only
# holds the classes of the last column it encoded.
string_cols = [name for name in data.columns if is_string_dtype(data[name])]
for name in string_cols:
    data[name] = LE.fit_transform(data[name])

In [6]:
# Target: the 2007 life-expectancy column.
y = data["lifeExp_2007"]

# Features: every column of `data` except the target itself and 'Over_65'.
# list.remove raises ValueError if either column is missing from the frame.
feature_cols = list(data.columns)
for excluded in ("lifeExp_2007", "Over_65"):
    feature_cols.remove(excluded)

#feature_cols

In [7]:
# Feature matrix: all rows of `data`, restricted to the selected feature columns.
X = data[feature_cols]

In [8]:
# Fit a decision-tree regressor on the feature matrix X against the target y.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; when several splits are equally good the chosen one
# can otherwise differ from run to run, changing the tree and the feature
# importances reported further down.
clf = tree.DecisionTreeRegressor(random_state=0).fit(X, y)

In [9]:
# Predict on the same X the tree was fitted on, so these are in-sample
# (training-set) predictions, not an estimate of out-of-sample accuracy.
predictions = clf.predict(X)

In [10]:
# Side-by-side table of the observed target and the model's predictions.
pd.DataFrame({"lifeExp_2007": data["lifeExp_2007"], "Pred": predictions})

Unnamed: 0,Pred,lifeExp_2007
0,72.301,72.301
1,42.731,42.731
2,56.728,56.728
3,50.728,50.728
4,52.295,52.295
5,49.580,49.580
6,50.430,50.430
7,44.741,44.741
8,50.651,50.651
9,65.152,65.152


In [11]:
# sensitivity analysis

In [12]:
# One row per feature column, paired with the importance the fitted tree assigned to it.
pd.DataFrame({"feature": feature_cols, "importance": clf.feature_importances_})

Unnamed: 0,feature,importance
0,continent,1.791713e-05
1,country,0.0005979724
2,gdpPercap_1952,2.226973e-05
3,gdpPercap_1957,0.0009904259
4,gdpPercap_1962,0.0
5,gdpPercap_1967,0.0002136422
6,gdpPercap_1972,0.0
7,gdpPercap_1977,1.610242e-05
8,gdpPercap_1982,6.228149e-09
9,gdpPercap_1987,2.673465e-06
