In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

In [2]:
# Location of the ESS practice extract used throughout the notebook.
ess_csv_url = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv"
)
# Read the file and keep only rows that have no missing values.
df = pd.read_csv(ess_csv_url).dropna()
df.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner
0,CH,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1.0
1,CH,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1.0
2,CH,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,2.0
3,CH,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1.0
4,CH,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1.0


In [3]:
# transform to ordinal based upon percentiles, n is ordinal range
def ord_by_percentile(srs, n):
    """Bucket the values of ``srs`` into ordinal codes using percentile cut points.

    The cut points are the (100/n, 200/n, ..., 100)th percentiles of ``srs``.
    Each value's code is the number of cut points it is not below, so codes run
    from 0 (below the first cut) up to ``n`` (only values at the maximum, since
    the last cut point is the 100th percentile).

    Parameters
    ----------
    srs : pandas.Series or array-like
        Numeric values to bucket.
    n : int
        Number of percentile cut points.

    Returns
    -------
    pandas.Series
        Integer codes carrying the same index as ``srs`` so the result can be
        assigned straight back as a column of the originating frame.
    """
    srs = pd.Series(srs)
    pce = np.percentile(srs, [(i * 100 / n) for i in range(1, n + 1)])
    vals = srs.to_numpy()
    codes = np.zeros(len(vals), dtype=int)
    for cut in pce:
        # Work positionally: comparing against a Series with a different index
        # (e.g. after dropna()) would align by label and corrupt the codes.
        # `~(vals < cut)` mirrors the original `where(srs < cut)` test exactly,
        # including for NaN.
        codes += ~(vals < cut)
    return pd.Series(codes, index=srs.index)

In [None]:
# Recode the target: 'partner' holds 1/2, shift it down so it becomes 0/1.
y = df['partner'].sub(1)

In [None]:
# Feature frame. This has to be a DataFrame, not the bare `agea` Series:
# later cells read `dfx.agea`, add columns (`age_z`, `age_ord`) and call
# `dfx.drop(..., axis=1)`, all of which fail on a Series.
# The target ('partner'), the country code ('cntry', one-hot encoded later)
# and the respondent id ('idno') are left out.
# NOTE(review): confirm this is the intended predictor set.
dfx = df.drop(columns=['partner', 'cntry', 'idno'])

In [None]:
# New feature: age standardised to zero mean and unit standard deviation.
dfx['age_z'] = (dfx['agea'] - dfx['agea'].mean()) / dfx['agea'].std()
# New feature: age bucketed into ordinal codes at quintile cut points.
dfx['age_ord'] = ord_by_percentile(dfx['agea'], 5)

In [None]:
# Show the last rows of the feature frame to check the columns added above.
dfx.tail()

In [None]:
# add n PCA features
n = 3
# Components are computed from every feature except the raw and derived age columns.
pca = dfx.drop(['agea', 'age_z', 'age_ord'], axis=1).values
# Standardise first so no single column dominates the components.
pca = StandardScaler().fit_transform(pca)
pca = PCA(n_components=n).fit_transform(pca)
# Carry over dfx's row labels: the arrays above are positional, and without
# `index=` the frame would get a fresh 0..N-1 index that cannot be aligned
# with dfx (which keeps df's labels after dropna()).
dfp = pd.DataFrame(
    data=pca,
    columns=['pca' + str(i) for i in range(1, n + 1)],
    index=dfx.index,
)

In [None]:
# Show the last rows of the PCA component frame.
dfp.tail()

In [None]:
# One-hot encode the country code column ('cntry') and append the indicator
# columns to the right of the engineered features.
X = pd.concat((dfx, pd.get_dummies(df.cntry)), axis='columns')
X.tail()


In [None]:
# (rows, number of distinct 'cntry' values) of the one-hot country matrix.
pd.get_dummies(df['cntry']).shape

In [None]:
# Outcome recoded to 0 and 1 ('partner' is stored as 1/2).
y = df['partner'].sub(1)

#### initial results
Training set accuracy:
- Percent Type I errors: 0.04650845608292417
- Percent Type II errors: 0.17607746863066012

Test set accuracy:
- Percent Type I errors: 0.06257668711656442
- Percent Type II errors: 0.18527607361963191

### DRILL: Improve this gradient boost model

While this model is already doing alright, we've seen from the Type I and Type II error rates that there is definitely room for improvement.  Your task is to see how low you can get the error rates to go in the test set, based on your model in the training set.  Strategies you might use include:

* Creating new features
* Applying more overfitting-prevention strategies like subsampling
* More iterations
* Trying a different loss function
* Changing the structure of the weak learner: Allowing more leaves in the tree, or other modifications

Have fun!

In [None]:
# Fixed seed so the train/test split, and therefore every error rate reported
# below, is reproducible across kernel restarts.
RANDOM_STATE = 42

# Hold out 10% of the rows for testing.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE)

# Baseline boosting configuration: 500 depth-2 trees.
# NOTE(review): the 'deviance' loss name is only accepted by older
# scikit-learn releases - confirm the version this notebook targets.
params = {'n_estimators': 500, 'max_depth': 2, 'loss': 'deviance'}
clf = ensemble.GradientBoostingClassifier(random_state=RANDOM_STATE, **params)

In [None]:
# Candidate hyper-parameter sets for the gradient boosting classifier.
# prm0: baseline
prm0 = dict(n_estimators=500, max_depth=2, loss='deviance')
# prm1: baseline with the exponential loss
prm1 = dict(n_estimators=500, max_depth=2, loss='exponential')
# prm2: more boosting iterations
prm2 = dict(n_estimators=750, max_depth=2, loss='deviance')
# prm3: fewer iterations, progress printed while fitting
prm3 = dict(n_estimators=100, max_depth=2, loss='deviance', verbose=1)
# prm4: depth-3 trees, progress printed while fitting
prm4 = dict(n_estimators=500, max_depth=3, loss='deviance', verbose=1)
# prm5: depth-4 trees
prm5 = dict(n_estimators=500, max_depth=4, loss='deviance')

In [None]:
# Switch the shared classifier to the depth-4 configuration, refit it on the
# training split, then score it on the held-out split.
clf.set_params(**prm5)
clf.fit(X_trn, y_trn)
clf.score(X_tst, y_tst)

In [None]:
# Pasted output from an earlier run - presumably the (type I, type II) error
# rates returned by accuracy_tab. The line had lost its opening parenthesis,
# which made the cell a SyntaxError; it is now a valid, named tuple.
recorded_test_errors = (0.06134969325153374, 0.18404907975460122)
recorded_test_errors

In [None]:
# Inspect the loss attribute of the fitted classifier.
# NOTE(review): `loss_` may not exist in recent scikit-learn releases -
# confirm against the version this notebook targets.
clf.loss_

In [None]:
# Class predictions for the held-out split. `prd_tst` is not defined by any
# other cell, so it is computed here before scoring.
# NOTE: accuracy_tab is defined in a later cell and must be run before this one.
prd_tst = clf.predict(X_tst)
accuracy_tab(y_tst, prd_tst)

In [None]:
# Position (0-based, into clf.feature_importances_) of the most important feature.
pd.Series(clf.feature_importances_).idxmax()

In [None]:
# Express every importance as a percentage of the largest one.
feature_importance = 100.0 * (
    clf.feature_importances_ / clf.feature_importances_.max()
)

# Order the bars from least to most important and centre them on half-steps.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(len(sorted_idx)) + 0.5

# Horizontal bar chart drawn in the right-hand slot of a 1x2 subplot grid.
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])
plt.title('Variable Importance')
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# returns (type I error %, type II error %)
def accuracy_tab(y, prd):
    """Return the type I and type II error rates of binary predictions.

    Parameters
    ----------
    y : array-like
        True labels, coded 0/1.
    prd : array-like
        Predicted labels, coded 0/1.

    Returns
    -------
    tuple
        ``(type_I, type_II)``: the share of all observations with truth 0 and
        prediction 1, and the share with truth 1 and prediction 0. Both are
        fractions of the total count, not percentages.
    """
    raw = pd.crosstab(y, prd)
    # Total over every observed cell, taken before reindexing so it matches the
    # original denominator even if unexpected labels are present.
    total = raw.to_numpy().sum()
    # Guarantee both classes exist on both axes; otherwise a class that never
    # appears (e.g. the model never predicts 1) makes the .loc lookup raise
    # KeyError instead of reporting a zero count.
    tab = raw.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    return (tab.loc[0, 1] / total, tab.loc[1, 0] / total)


In [None]:
# Confusion tables (with 'All' margin totals) for the fitted classifier on both
# splits. They are built here because no other cell defines table_train or
# table_test.
table_train = pd.crosstab(y_trn, clf.predict(X_trn), margins=True)
table_test = pd.crosstab(y_tst, clf.predict(X_tst), margins=True)

# Type I: truth 0 predicted 1. Type II: truth 1 predicted 0.
# Each is expressed as a share of all observations in the split.
train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

In [None]:
# Sum of every cell in table_train (column sums, then summed again).
# NOTE(review): if the table carries 'All' margin totals, as the
# .loc['All','All'] lookups elsewhere suggest, this counts each observation
# more than once.
table_train.sum().sum()

In [None]:
# Display the training-set table.
table_train

In [None]:
# Relabel the columns first and name the axis second: assigning a plain list to
# `.columns` builds a fresh Index, which would discard a name set beforehand.
table_train.columns = ['false', 'True', 'totals']
table_train.columns.name = 'target'

In [None]:
# Replace the row labels of table_train with readable names (three rows expected).
table_train.index = ['false', 'true', 'totals' ]

In [None]:
# Check the row labels after relabelling.
table_train.index

In [None]:
# Check the column labels after relabelling.
table_train.columns

In [None]:
# Pairwise relationships among the first five feature columns.
sns.set_style("whitegrid")
g = sns.PairGrid(X.iloc[:, 0:5], diag_sharey=False)
# Upper triangle: raw scatter plots.
g.map_upper(plt.scatter, alpha=0.5)
# Lower triangle: fitted regression line only (the points are fully transparent).
g.map_lower(sns.regplot, scatter_kws={'alpha': 0})
# Diagonal: kernel density estimate of each variable.
g.map_diag(sns.kdeplot, lw=3)
plt.show()

In [None]:
# Pairwise relationships among feature columns 5 through 9.
sns.set_style("whitegrid")
g = sns.PairGrid(X.iloc[:, 5:10], diag_sharey=False)
# Upper triangle: raw scatter plots.
g.map_upper(plt.scatter, alpha=0.5)
# Lower triangle: fitted regression line only (the points are fully transparent).
g.map_lower(sns.regplot, scatter_kws={'alpha': 0})
# Diagonal: kernel density estimate of each variable.
g.map_diag(sns.kdeplot, lw=3)
plt.show()

In [None]:
# Class-balanced bootstrap sample: 5000 rows drawn with replacement from each
# 'partner' group. The draws are seeded so the sample, and anything plotted
# from it, is reproducible across kernel restarts.
SAMPLE_SEED = 42
A = df[df.partner == 1].sample(n=5000, replace=True, random_state=SAMPLE_SEED)
B = df[df.partner == 2].sample(n=5000, replace=True, random_state=SAMPLE_SEED)
AB = pd.concat([A, B])

In [None]:
# Summary statistics for the raw age column of the feature matrix.
X.agea.describe()

In [None]:
# Row counts per 'partner' value, split by country, on the balanced sample AB.
ax = sns.countplot(x="partner", hue="cntry", data=AB)