In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import sklearn.impute
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare
import split_scale

# Classification Modeling Exercise

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification directory.

1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?
2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.
3. Try out other combinations of features and models.
4. Choose you best model and evaluate it on the test dataset. Is it overfit?
5. **Bonus**: How do different strategies for handling the missing values in the age column affect model performance?
6. **Bonus**: How do different strategies for encoding sex affect model performance?
7. **Bonus**: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

$C=.01,.1,1,10,100,1000$

- 8. **Bonus Bonus**: how does scaling the data interact with your choice of C?

In [None]:
df = acquire.get_titanic_data()

In [None]:
df = df.drop(columns="deck")

In [None]:
# fix embarktown and embarked

df.embark_town = df.embark_town.fillna('Southampton')
df.embarked = df.embarked.fillna('S')

In [None]:
df.head()

### Initial Model

In [None]:
train, test = train_test_split(df, random_state=123, train_size=.8)
train, validate = train_test_split(train, train_size=.8)

X = train[['pclass', 'fare']]
y = train[['survived']]

logit1 = LogisticRegression(random_state=123).fit(X, y)

In [None]:
logit1.predict(X)

In [None]:
logit1.predict_proba(X)

In [None]:
logit1.score(X, y)

### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [None]:
train.head()

In [None]:
imputer = sklearn.impute.SimpleImputer(strategy='mean')
imputer.fit(train[['age']])
train.age = imputer.transform(train[['age']])
validate.age = imputer.transform(validate[['age']])
test.age = imputer.transform(test[['age']])

In [None]:
train.isna().sum()

In [None]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

In [None]:
X = train[['pclass', 'fare', 'age']]
y = train[['survived']]

logit2 = LogisticRegression(random_state=123).fit(X, y)

In [None]:
logit2.predict(X)

In [None]:
logit2.predict_proba(X)

In [None]:
logit2.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age']]
y_validate = validate[['survived']]

In [None]:
logit2.score(X_validate, y_validate)

### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [None]:
train.head()

In [None]:
# encode sex for both train and test
le = LabelEncoder()
train['sex_encoder'] = le.fit_transform(train.sex)
validate['sex_encoder'] = le.fit_transform(validate.sex)
test['sex_encoder'] = le.transform(test.sex)

In [None]:
train.head()

In [None]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder']]
y = train[['survived']]

logit3 = LogisticRegression(random_state=123).fit(X, y)

In [None]:
logit3.predict(X)

In [None]:
logit3.predict_proba(X)

In [None]:
logit3.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder']]
y_validate = validate[['survived']]

logit3.score(X_validate, y_validate)

### 3. Try out other combinations of features and models.

In [None]:
train.head()

In [None]:
# encoding embarked

encoder = sklearn.preprocessing.OneHotEncoder()
encoder.fit(train[["embarked"]])

m1 = encoder.transform(train[["embarked"]]).todense()

train = pd.concat([train, pd.DataFrame(m1, columns=encoder.categories_[0], index=train.index)], axis=1)

m2 = encoder.transform(validate[["embarked"]]).todense()

validate = pd.concat([validate, pd.DataFrame(m2, columns=encoder.categories_[0], index=validate.index)], axis=1)

m3 = encoder.transform(test[["embarked"]]).todense()

test = pd.concat([test, pd.DataFrame(m3, columns=encoder.categories_[0], index=test.index)], axis=1)

In [None]:
train.head()

In [None]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit4 = LogisticRegression(random_state=123).fit(X, y)

In [None]:
logit4.predict(X)

In [None]:
logit4.predict_proba(X)

In [None]:
logit4.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit4.score(X_validate, y_validate)

### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [None]:
X_test = test[['pclass', 'fare']]
y_test = test[['survived']]

logit1.score(X_test, y_test)

In [None]:
X_test = test[['pclass', 'fare', 'age']]
y_test = test[['survived']]

logit2.score(X_test, y_test)

In [None]:
X_test = test[['pclass', 'fare', 'age', 'sex_encoder']]
y_test = test[['survived']]

logit3.score(X_test, y_test)

In [None]:
X_test = test[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_test = test[['survived']]

logit4.score(X_test, y_test)

#### Looks like model 4 is the best fit without overfitting for the train set.

### 5. **Bonus**: How do different strategies for handling the missing values in the age column affect model performance?

By choosing to drop all the NA's caused major issues within the models. By imputing values the models were able to run without issue.

### 6. **Bonus**: How do different strategies for encoding sex affect model performance?

Because the choice is binary, the difference between One-Hot and Label encoding will result in similar findings.

### 7. **Bonus**: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

> Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

> $C=.01,.1,1,10,100,1000$

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit5 = LogisticRegression(random_state=123, C=.01).fit(X, y)

In [None]:
logit5.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit5.score(X_validate, y_validate)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit6 = LogisticRegression(random_state=123, C=.1).fit(X, y)
logit6.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit6.score(X_validate, y_validate)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit7 = LogisticRegression(random_state=123, C=1.0).fit(X, y)
logit7.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit7.score(X_validate, y_validate)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit8 = LogisticRegression(random_state=123, C=10.0).fit(X, y)
logit8.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit8.score(X_validate, y_validate)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit9 = LogisticRegression(random_state=123, C=100.0).fit(X, y)
logit9.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit9.score(X_validate, y_validate)

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit10 = LogisticRegression(random_state=123, C=1000.0).fit(X, y)
logit10.score(X, y)

In [None]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit10.score(X_validate, y_validate)

### - 8. **Bonus Bonus**: how does scaling the data interact with your choice of C?

In [None]:
train_scaled = train[['age', 'fare']]
validate_scaled = validate[['age', 'fare']]
scaler, train_scaled, validate_scaled = split_scale.min_max_scaler(train_scaled, validate_scaled)
train_scaled.head()

In [None]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=.01).fit(X, y)
logit11.score(X, y)

In [None]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=.10).fit(X, y)
logit11.score(X, y)

In [None]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=1.0).fit(X, y)
logit11.score(X, y)

In [None]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=10.0).fit(X, y)
logit11.score(X, y)

In [None]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=100.0).fit(X, y)
logit11.score(X, y)

In [None]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=1000.0).fit(X, y)
logit11.score(X, y)

# Class Logic Regression Breakdown Review Section

# Classification Decision Tree Exercises

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)
1. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
1. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
1. Run through steps 2-4 using entropy as your measure of impurity.
1. Which performs better on your in-sample data?

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
train.head()

In [None]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

In [None]:
clf = DecisionTreeClassifier(max_depth=7, random_state=123)
clf

In [None]:
clf.fit(X, y)

In [None]:
y_pred = clf.predict(X)
y_pred_proba = clf.predict_proba(X)

In [None]:
y_pred[0:10]

In [None]:
y_pred_proba

In [None]:
#confusion_matrix = pd.DataFrame(confusion_matrix(y, y_pred))
#confusion_matrix.index.name = 'actual'
confusion_matrix

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(X, y)))

In [None]:
pd.DataFrame(classification_report(y, y_pred, output_dict=True))

In [None]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X_validate, y_validate)))

In [None]:
import graphviz
from graphviz import Graph

clf = DecisionTreeClassifier()
clf = clf.fit(X, y)

dot_data = export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

In [None]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

# Classification Random Forest Exercises

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.
2. Evaluate your results using the model score, confusion matrix, and classification report.
3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.
5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
titanic = acquire.get_titanic_data()

In [None]:
def encode_sex(df):
    '''
    Returns a new dataframe with the ``sex`` column encoded.
    '''
    return df.assign(
        sex=(df.sex == 'female').astype(int)
    )

In [None]:
titanic = encode_sex(titanic)
titanic.age = titanic.age.fillna(titanic.age.mean()).astype("int")
titanic = titanic.drop(columns=["passenger_id", "embarked", "class", "deck", "embark_town"])

In [None]:
train, test = train_test_split(titanic)

In [None]:
X_train = train.drop(columns="survived")
y_train = train["survived"]
X_test = test.drop(columns="survived")
y_test = test["survived"]

In [None]:
y_train.shape

In [None]:
rf = RandomForestClassifier(random_state= 123, min_samples_leaf = 1, max_depth = 20)

rf.fit(X_train, y_train)

In [None]:
rf.feature_importances_

## Evaluate the Model

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, rf.predict(X_train)))

In [None]:
print(classification_report(y_train, rf.predict(X_train)))

## Test the Model

In [None]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

### 2. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.¶

In [None]:
rf = RandomForestClassifier(random_state= 123, min_samples_leaf = 5, max_depth = 3)

rf.fit(X_train, y_train)

## Evaluate the Model

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, rf.predict(X_train)))

In [None]:
print(classification_report(y_train, rf.predict(X_train)))

## Test the Model

In [None]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

# Classification KNN Exercises

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)
1. Evaluate your results using the model score, confusion matrix, and classification report.
1. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
1. Run through steps 2-4 setting k to 10
1. Run through setps 2-4 setting k to 20
1. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?


In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
titanic = acquire.get_titanic_data()

In [None]:
def encode_sex(df):
    '''
    Returns a new dataframe with the ``sex`` column encoded.
    '''
    return df.assign(
        sex=(df.sex == 'female').astype(int)
    )

In [None]:
titanic = encode_sex(titanic)
titanic.age = titanic.age.fillna(titanic.age.mean()).astype("int")
titanic = titanic.drop(columns=["passenger_id", "embarked", "class", "deck", "embark_town"])

In [None]:
train, test = train_test_split(titanic)

In [None]:
X_train = train.drop(columns="survived")
y_train = train["survived"]
X_test = test.drop(columns="survived")
y_test = test["survived"]

In [None]:
X_train.head()

In [None]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
knn = knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

knn.fit(X_train_scaled, y_train)

## Evaluate the Model

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))

In [None]:
print(confusion_matrix(y_train, knn.predict(X_train)))

In [None]:
print(classification_report(y_train, knn.predict(X_train)))

## 4. Run through steps 2-4 setting `k` to 10

In [None]:
knn = knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')

knn.fit(X_train_scaled, y_train)

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))

In [None]:
print(confusion_matrix(y_train, knn.predict(X_train)))

In [None]:
print(classification_report(y_train, knn.predict(X_train)))

## 5. Run through steps 2-4 setting `k` to 20

In [None]:
knn = knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')

knn.fit(X_train_scaled, y_train)

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))

In [None]:
print(confusion_matrix(y_train, knn.predict(X_train)))

In [None]:
print(classification_report(y_train, knn.predict(X_train)))

## 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

It would appear that the first version of the KNN with a `k` set to 5 was the best performing model with an accuracy of 86%.  

# Test

- For both the iris and the titanic data,

1. Determine which model (with hyperparameters) performs the best (try reducing the number of features to the top 4 features in terms of information gained for each feature individually).
1. Create a new dataframe with top 4 features.
1. Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results with the training data. Compare your evaluation metrics with those from the original model (with all the features).
1. Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.

In [2]:
iris = acquire.get_iris_data()

In [3]:
iris = iris.drop(columns='species_id')

In [4]:
iris = iris.rename(columns={'species_name': 'species'})

In [5]:
iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [6]:
def get_iris_splits(iris):
    '''
    Returns X and y for train, validate and test datasets
    '''
    # don't blow away our original data
    iris = iris.copy()
    
#     # ignore warnings just for this block
#     with warnings.catch_warnings():
#         warnings.simplefilter('ignore')
#         scaler, encoder, train, test = prepare_walkthrough.prep_iris(iris)
    
    # Which features are we going to look at?
    cols = ['species', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    train = iris[cols]
    test = iris[cols]

    # validate data split
    train, validate = sklearn.model_selection.train_test_split(train, train_size=.80, random_state=123)

    # split into X and y
    X_train, y_train = train.drop(columns='species'), train.species
    X_validate, y_validate = validate.drop(columns='species'), validate.species
    X_test, y_test = test.drop(columns='species'), test.species
    
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [7]:
X_train, y_train, X_validate, y_validate, X_test, y_test = get_iris_splits(iris)

print('   train: %d rows' % X_train.shape[0])
print('validate: %d rows' % X_validate.shape[0])
print('    test: %d rows' % X_test.shape[0])

   train: 120 rows
validate: 30 rows
    test: 150 rows


In [8]:
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
130,7.4,2.8,6.1,1.9
119,6.0,2.2,5.0,1.5
29,4.7,3.2,1.6,0.2
0,5.1,3.5,1.4,0.2
62,6.0,2.2,4.0,1.0


In [9]:
# a dataframe to hold our models' predictions for future comparison
evaluation = pd.DataFrame({
    'actual': y_validate
})

In [20]:
# species ~ sepal_length + sepal_width + petal_length + petal_wdith
X_train, y_train, X_validate, y_validate, X_test, y_test = get_iris_splits(iris)

dtree_model = DecisionTreeClassifier(max_depth=7, random_state=123)
dtree_model.fit(X_train, y_train)

dtree_model.predict_proba(X_validate)[:, 1]
evaluation['dtree_accuracy'] = ('{:.2%}'.format(dtree_model.score(X_validate, y_validate)))

In [22]:
# species ~ sepal_length + sepal_width + petal_length + petal_wdith
X_train, y_train, X_validate, y_validate, X_test, y_test = get_iris_splits(iris)

rf_model = RandomForestClassifier(random_state= 123, min_samples_leaf = 1, max_depth = 20)
rf_model.fit(X_train, y_train)

rf_model.predict_proba(X_validate)[:, 1]
evaluation['rf_accuracy'] = ('{:.2%}'.format(rf_model.score(X_validate, y_validate)))

In [29]:
# species ~ sepal_length + sepal_width + petal_length + petal_wdith
X_train, y_train, X_validate, y_validate, X_test, y_test = get_iris_splits(iris)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)

knn_model = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn_model.fit(X_train, y_train)

knn_model.predict_proba(X_validate)[:, 1]
evaluation['knn_accuracy'] = ('{:.2%}'.format(knn_model.score(X_validate, y_validate)))

In [30]:
evaluation

Unnamed: 0,actual,dtree_accuracy,rf_accuracy,knn_accuracy
72,versicolor,96.67%,90.00%,100.00%
112,virginica,96.67%,90.00%,100.00%
132,virginica,96.67%,90.00%,100.00%
88,versicolor,96.67%,90.00%,100.00%
37,setosa,96.67%,90.00%,100.00%
138,virginica,96.67%,90.00%,100.00%
87,versicolor,96.67%,90.00%,100.00%
42,setosa,96.67%,90.00%,100.00%
8,setosa,96.67%,90.00%,100.00%
90,versicolor,96.67%,90.00%,100.00%
