# Improving performance

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Read the Titanic feature table from CSV into a DataFrame
titanic_csv = '../data/new_titanic_features.csv'
df = pd.read_csv(titanic_csv)

In [None]:
# Columns of df that are fed to the models as inputs
feature_cols = [
    'Male', 'Family',
    'Pclass2_one', 'Pclass2_two', 'Pclass2_three',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'Age2', 'Fare3_Fare11to50', 'Fare3_Fare51+', 'Fare3_Fare<=10',
]

# Feature matrix (X) and the target column (y)
X = df[feature_cols]
y = df['Survived']


In [None]:
# Summary statistics of the feature matrix, shown as the cell output
X.describe()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0, test_size=0.2)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the training split
model = LogisticRegression().fit(X_train, y_train)

# Class predictions for both splits, evaluated in the following cells
pred_train, pred_test = model.predict(X_train), model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Accuracy of the stored predictions on each split, printed to 3 significant digits
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)

print('Train Accuracy: {:0.3}'.format(acc_train))
print('Test Accuracy: {:0.3}'.format(acc_test))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split
confusion_matrix(y_test, pred_test)

In [None]:
# Text report of classification metrics for the test-split predictions
print(classification_report(y_test, pred_test))

## Feature importances (wrong! see exercise 1)

In [None]:
# Flatten the fitted coefficients and label each one with its feature column
coeffs = pd.Series(np.ravel(model.coef_), index=X.columns)
coeffs

In [None]:
# Horizontal bar chart with one bar per coefficient
coeffs.plot.barh()

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score, ShuffleSplit

In [None]:
# Five random train/test partitions, each holding out 40% of the rows
cv = ShuffleSplit(test_size=0.4, n_splits=5, random_state=0)

# One score per partition; the array is shown as the cell output
scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)
scores

In [None]:
# Mean and standard deviation of the cross-validation scores
cv_mean, cv_std = scores.mean(), scores.std()
'Crossval score: %0.3f +/- %0.3f ' % (cv_mean, cv_std)

## Learning curve

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
# Ten evenly spaced training-set sizes between 0.1 and 1 (reused by later cells)
tsz = np.linspace(0.1, 1.0, num=10)

# Train/test scores of the current model at each requested training-set size
train_sizes, train_scores, test_scores = learning_curve(
    estimator=model, X=X, y=y, train_sizes=tsz)

In [None]:
# Average the scores along axis 1 to get one value per training-set size
mean_train = train_scores.mean(axis=1)
mean_test = test_scores.mean(axis=1)

# Draw both curves on a single set of axes
fig, ax = plt.subplots()
ax.plot(train_sizes, mean_train, 'ro-', label="Train Scores")
ax.plot(train_sizes, mean_test, 'go-', label="Test Scores")
ax.set_title('Learning Curve: Logistic Regression')
ax.set_ylim((0.5, 1.0))
ax.legend()
plt.draw()
plt.show()

### Exercise 1

Try rescaling the Age feature with [`preprocessing.StandardScaler`](http://scikit-learn.org/stable/modules/preprocessing.html) so that it will have comparable size to the other features.

- Do the model predictions change?
- Does the performance of the model change?
- Do the feature importances change?
- How can you explain what you've observed?

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler used below to standardize the Age2 column
sc = StandardScaler()

In [None]:
# Fit the scaler on the training rows of Age2 only; the test split is
# transformed in the next cell with these same fitted parameters
sc.fit(X_train[['Age2']])

In [None]:
# Build copies of both splits in which only Age2 is replaced by its scaled
# version; X_train and X_test themselves are left untouched
age_col = ['Age2']
X_train_sc = X_train.assign(Age2=sc.transform(X_train[age_col]))
X_test_sc = X_test.assign(Age2=sc.transform(X_test[age_col]))

In [None]:
# Fresh, unfitted logistic regression; fitted on unscaled and then scaled data below
model = LogisticRegression()

In [None]:
# Baseline: fit on the unscaled features and report accuracy on both splits
model.fit(X_train, y_train)
acc_train = accuracy_score(y_train, model.predict(X_train))
acc_test = accuracy_score(y_test, model.predict(X_test))
print('Train Accuracy (not scaled): {:0.3}'.format(acc_train))
print('Test Accuracy (not scaled): {:0.3}'.format(acc_test))

# Keep the unscaled-fit coefficients, labelled by feature, for the comparison plot
coeffs = pd.Series(np.ravel(model.coef_), index=X.columns)

In [None]:
# Refit the same estimator on the splits whose Age2 column was standardized
model.fit(X_train_sc, y_train)
acc_train_sc = accuracy_score(y_train, model.predict(X_train_sc))
acc_test_sc = accuracy_score(y_test, model.predict(X_test_sc))
print('Train Accuracy (scaled): {:0.3}'.format(acc_train_sc))
print('Test Accuracy (scaled): {:0.3}'.format(acc_test_sc))

# Coefficients of the scaled fit, labelled by feature, for the comparison plot
coeffs_sc = pd.Series(np.ravel(model.coef_), index=X.columns)

In [None]:
# Two panels side by side: coefficients from the unscaled fit (left)
# and from the fit with standardized Age2 (right)
fig, (ax_unscaled, ax_scaled) = plt.subplots(1, 2, figsize=(15, 5))

coeffs.plot(kind='barh', title='Unscaled Age2', ax=ax_unscaled)
coeffs_sc.plot(kind='barh', title='Scaled Age2', ax=ax_scaled)

plt.tight_layout()

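The magnitude of a logistic-regression coefficient depends on the scale of its feature, so coefficients can only be compared as feature importances once the features have been rescaled to comparable sizes.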

### Exercise 2

Experiment with another classifier, for example `DecisionTreeClassifier`, `RandomForestClassifier`, `SVC`, `MLPClassifier`, `SGDClassifier` or any other classifier of your choice that you can find here: http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html.

- Train the model on both the scaled data and on the unscaled data
- Compare the score for the scaled and unscaled data
- How can you get the feature importances for tree-based models? Check [here](http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html) for some help.
- Which classifiers are impacted by the age rescale? Why?

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that every fit is repeatable. Without a fixed
# random_state each fit builds different trees, so a difference between the
# unscaled and scaled runs could be random noise rather than an effect of
# rescaling Age2. The seed value matches the one used elsewhere in this notebook.
model = RandomForestClassifier(random_state=0)

# Baseline: fit on the unscaled features and report accuracy on both splits
model.fit(X_train, y_train)
print('Train Accuracy (not scaled): {:0.3}'.format(accuracy_score(y_train, model.predict(X_train))))
print('Test Accuracy (not scaled): {:0.3}'.format(accuracy_score(y_test, model.predict(X_test))))

# A forest has no coef_; its per-feature weights are in feature_importances_.
# The variable keeps the name `coeffs` although it now holds importances.
coeffs = pd.Series(model.feature_importances_, index=X.columns)
coeffs.plot(kind='barh')

In [None]:
# Refit the same forest on the splits whose Age2 column was standardized
model.fit(X_train_sc, y_train)
acc_train_sc = accuracy_score(y_train, model.predict(X_train_sc))
acc_test_sc = accuracy_score(y_test, model.predict(X_test_sc))
print('Train Accuracy (scaled): {:0.3}'.format(acc_train_sc))
print('Test Accuracy (scaled): {:0.3}'.format(acc_test_sc))

# Feature importances of the refitted forest, one horizontal bar per feature
coeffs = pd.Series(model.feature_importances_, index=X.columns)
coeffs.plot.barh()

### Exercise 3

Pick your preferred classifier from Exercise 2 and search for the best hyperparameters. You can read about hyperparameter search [here](http://scikit-learn.org/stable/modules/grid_search.html)

- Decide the range of hyperparameters you intend to explore
- Try using [`GridSearchCV`](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV) to perform brute force search
- Try using [`RandomizedSearchCV`](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV) for a random search
- Once you've chosen the best classifier and the best hyperparameter set, redo the learning curve.
Do you need more data or a better model?

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

In [None]:
# Search space for the random forest. Lists are sampled from directly; the
# scipy randint distributions exclude their upper bound, so sp_randint(1, 11)
# draws integers from 1 to 10.
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Seed the forest so that each candidate is fitted reproducibly
clf = RandomForestClassifier(n_estimators=20, random_state=0)

# Seed the search as well so the 40 sampled parameter settings, and therefore
# the best parameters and scores reported below, are the same on every run
model = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=40,
                           n_jobs=-1, random_state=0)
model.fit(X_train, y_train)

In [None]:
# Best score recorded by the randomized search on the training data
model.best_score_

In [None]:
# Score of the fitted search object on the held-out test split
model.score(X_test, y_test)

In [None]:
# Keep a handle on the estimator that the search selected
best = model.best_estimator_

In [None]:
# Fit the selected estimator on the training split.
# NOTE(review): with RandomizedSearchCV's default refit=True the search has
# presumably already refitted this estimator on X_train, which would make
# this call redundant - confirm before removing.
best.fit(X_train, y_train)

In [None]:
# Score of the selected estimator on the held-out test split
best.score(X_test, y_test)

In [None]:
# Recompute the learning curve for the tuned estimator, reusing the
# training-set sizes (tsz) defined for the first learning curve
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best, X=X, y=y, train_sizes=tsz)

In [None]:
# Learning curve of the tuned random forest (`best`): scores averaged along
# axis 1 give one value per training-set size.
fig = plt.figure()
plt.plot(train_sizes, train_scores.mean(axis=1), 'ro-', label="Train Scores")
plt.plot(train_sizes, test_scores.mean(axis=1), 'go-', label="Test Scores")
# These scores come from the random forest selected by the search, not from
# the logistic regression plotted earlier, so the title must say so.
plt.title('Learning Curve: Random Forest')
# Axis labels let the figure stand on its own when the notebook is skimmed
plt.xlabel('Number of training samples')
plt.ylabel('Score')
plt.ylim((0.5, 1.0))
plt.legend()
plt.show()

*Copyright &copy; 2017 CATALIT LLC.  All rights reserved.*