In [1]:
# General DS Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Decision Tree and Model Evaluation Imports
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [3]:
# my acquire and prepare file
import acquire
import prepare

from prepare import train_validate_test_split

## Exercises
Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [None]:
### Acquire
titanic_df = acquire.get_titanic_data()
titanic_df.head()

In [None]:
#Prepare titanic data
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

In [None]:
#Drop Columns not needed for modeling
titanic_df.drop(['embark_town'], axis=1, inplace=True)

In [None]:
titanic_df.head()

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
#Train validate test split
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [None]:
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

In [None]:
#check shape

In [None]:
# create X & y version of train, where y is a series with just the target variable and X are all the features.
X_train = train.drop(columns=['survived'])
y_train = train.survived
train.survived

In [None]:
#Validate and Test 
X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
#Find Baseline
y_train.value_counts()
#Baseline is 0, did not survive

In [None]:
#Establish Baseline
y_train.info()

####  baseline prediction? 

In [None]:
#Baseline Accuracy
(y_train == 0).mean()

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
tree = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# model.fit(X, y)

tree.fit(X_train, y_train)

In [None]:
print(export_text(tree, feature_names=X_train.columns.tolist()))

In [None]:
# Visualize the tree NOT WORKING
plt.figure(figsize=(12, 7))
plot_tree(tree, feature_names=X_train.columns, class_names=y_train.unique())
plt.show()

In [None]:
#Use your model to make predictions on the in-sample data
tree.predict(X_train)

In [1]:
y_predictions = tree.predict(X_train)
actual = y_train

NameError: name 'tree' is not defined

In [None]:
#Baseline accuracy
(y_train == 0).mean()

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
confusion_matrix(actual, predictions)

In [None]:
pd.crosstab(actual, predictions)

In [None]:
print(classification_report(actual, predictions))

### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
#On validate data
predictions = tree.predict(X_validate)
actual = y_validate

print(classification_report(actual, predictions))

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree.score(X_train, y_train)))

In [None]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(tree.score(X_validate, y_validate)))

In [None]:
acc = tree.score(X_train, y_train)
acc

In [None]:
#Could calculate this way, need to define y predictions
#TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()
#TP, FP, FN, TP

### 5. Run through steps 2-4 using a different max_depth value.

#### Using Tree 2, 4 levels of depth

In [None]:
tree2 = DecisionTreeClassifier(max_depth=4, random_state=123)

In [None]:
# model.fit(X, y)

tree2.fit(X_train, y_train)

In [None]:
print(export_text(tree, feature_names=X_train.columns.tolist()))

In [None]:
#Use your model to make predictions on the in-sample data
tree2.predict(X_train)

In [None]:
predictions = tree2.predict(X_train)
actual = y_train

#### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
confusion_matrix(actual, predictions)

In [None]:
pd.crosstab(actual, predictions)

In [None]:
print(classification_report(actual, predictions))

In [None]:
#Calculating from validation data
predictions = tree2.predict(X_validate)
actual = y_validate

print(classification_report(actual, predictions))

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree2.score(X_train, y_train)))

In [None]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(tree2.score(X_validate, y_validate)))

### 6. Which model performs better on your in-sample data?

In [None]:
#Tree2 (model 2) performs better on my sample (training) data

### 7. Which model performs best on your out-of-sample data, the validate set?


In [None]:
#Tree 1 (model 1) performs better on the validate data

## CONCLUSIONS: By changing max depth to four levels, we increased accuracy slightly, but also made a more precise model (model 2 aka 'Tree 2' had a much lower false positive rate than model 1).

--------------------###------------------

## TELCO DATA SET

In [None]:
### Acquire
df = acquire.get_telco_data()
df.head()

In [None]:
#Prep telco data
def prep_telco_data(df):
    # Drop duplicate columns
    df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], inplace=True)
       
    # Drop null values stored as whitespace    
    df['total_charges'] = df['total_charges'].str.strip()
    df = df[df.total_charges != '']
    
    # Convert to correct datatype
    df['total_charges'] = df.total_charges.astype(float)
    
    # Convert binary categorical variables to numeric
    df['gender_encoded'] = df.gender.map({'Female': 1, 'Male': 0})
    df['partner_encoded'] = df.partner.map({'Yes': 1, 'No': 0})
    df['dependents_encoded'] = df.dependents.map({'Yes': 1, 'No': 0})
    df['phone_service_encoded'] = df.phone_service.map({'Yes': 1, 'No': 0})
    df['paperless_billing_encoded'] = df.paperless_billing.map({'Yes': 1, 'No': 0})
    df['churn_encoded'] = df.churn.map({'Yes': 1, 'No': 0})
    
    # Get dummies for non-binary categorical variables
    dummy_df = pd.get_dummies(df[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type']], dummy_na=False, \
                              drop_first=True)
    
    # Concatenate dummy dataframe to original 
    df = pd.concat([df, dummy_df], axis=1)
    
    return df

In [None]:
df = prep_telco_data(df)

In [None]:
#Drop Columns not needed for modeling
df.drop(['gender','partner','dependents','phone_service','multiple_lines',\
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type', \
                              'paperless_billing', \
                              'churn'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
#Renaming churn encoded
df.rename(columns = {'churn_encoded':'churn'}, inplace = True)

In [None]:
df.head().T

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [None]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df, target='churn', seed=123)

In [None]:
# create X & y version of train, where y is a series with just the target variable and X are all the features.
X_train = train.drop(columns=['churn'])
y_train = train.churn
train.churn

In [None]:
#check shape
train.shape
validate.shape
test.shape

In [None]:
#Create validate and test data
#Validate and Test 
X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

In [None]:
#Find Baseline
y_train.value_counts()
#Baseline is 0, customer did not churn

In [None]:
#Baseline Accuracy
(y_train == 0).mean()

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# model.fit(X, y)

tree1.fit(X_train, y_train)

In [None]:
print(export_text(tree1, feature_names=X_train.columns.tolist()))

In [None]:
#Use your model to make predictions on the in-sample data
tree1.predict(X_train)

--------------------##Random Forest Exercises##--------------------------------

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
### Acquire
titanic_df = acquire.get_titanic_data()
titanic_df.head()

In [None]:
#Prepare titanic data
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

In [None]:
#Drop Columns not needed for modeling
titanic_df.drop(['embarked'], axis=1, inplace=True)

In [None]:
titanic_df.head()

In [None]:
from prepare import train_validate_test_split

In [None]:
# split into train, validate, test
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
plt.bar(X_train.columns, rf.feature_importances_)
plt.show()

In [None]:
#Make predictions
y_pred = rf.predict(X_train)

In [None]:
rf.classes_

In [None]:
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
#  mode for target?
y_train.mode()

In [None]:
# Establish our baseline prediction 
baseline = y_train.mode()

# Next let establish a baseline accuracy
matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline prediction: {baseline[0]}")
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
#Evaluate the model's performance on train
y_predictions = rf.predict(X_train)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
# classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 10")
pd.DataFrame(report)

In [None]:
cm = confusion_matrix(y_train, y_pred)
print(cm)

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### Training Set

In [None]:
# sklearn confusion matrix
cm = confusion_matrix(y_train, y_predictions)
print(cm)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf.classes_)

disp.plot()
plt.show()

In [None]:
#Creating classification report on training data

TN, FP, FN, TP = confusion_matrix(y_train,y_predictions).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Looking at performance on validate data

In [None]:
rf.score(X_validate, y_validate)

In [None]:
#Accuracy of validate set
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

In [None]:
#predictions
y_pred = rf.predict(X_validate)

In [None]:
#Creating classification report on validate data

TN, FP, FN, TP = confusion_matrix(y_validate,y_pred).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

### Model 2: min_samples_leaf = 3 and max_depth = 5

In [None]:
### Acquire
titanic_df = acquire.get_titanic_data()
titanic_df.head()

In [None]:
#Prepare titanic data
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

In [None]:
#Drop Columns not needed for modeling
titanic_df.drop(['embarked'], axis=1, inplace=True)

In [None]:
# split into train, validate, test
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
#Increasing min samples leaf and decreasing max depth
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [None]:
#Fit
rf2.fit(X_train, y_train)

In [None]:
#Print feature importances
print(rf2.feature_importances_)

In [None]:
#Show feature importances
plt.bar(X_train.columns, rf2.feature_importances_)
plt.show()

#### Make Predictions

In [None]:
#predictions
y_pred = rf2.predict(X_train)

In [None]:
rf2.classes_

In [None]:
y_pred_proba = rf2.predict_proba(X_train)
y_pred_proba
#Tells the prediction of probability a being made

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
#  mode for target
y_train.mode()

In [None]:
# Establish our baseline prediction 
baseline = y_train.mode()

# Next let establish a baseline accuracy
matches_baseline_prediction = y_train == 0

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline prediction: {baseline[0]}")
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
#Evaluate the model's performance on train
y_predictions = rf2.predict(X_train)

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of random forest classifier model 2 on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

In [None]:
# classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 5")
pd.DataFrame(report)

#### Calculate accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# sklearn confusion matrix
cm = confusion_matrix(y_train, y_predictions)
print(cm)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf2.classes_)

disp.plot()
plt.show()

In [None]:
#Creating classification report on training data

TN, FP, FN, TP = confusion_matrix(y_train,y_predictions).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Looking at performance on validate data

In [None]:
rf2.score(X_validate, y_validate)

In [None]:
#Accuracy of validate set
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))

In [None]:
#predictions
y_pred = rf2.predict(X_validate)

In [None]:
#Creating classification report on validate data

TN, FP, FN, TP = confusion_matrix(y_validate,y_pred).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Model 3: min_leaf_samples = 3 and max_depth = 10

In [None]:
### Acquire
titanic_df = acquire.get_titanic_data()
titanic_df.head()

In [None]:
#Prepare titanic data
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

In [None]:
#Drop Columns not needed for modeling
titanic_df.drop(['embarked'], axis=1, inplace=True)

In [None]:
# split into train, validate, test
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
#Min samples leaf = 3 and max depth of 10
rf3 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [None]:
#Fit
rf3.fit(X_train, y_train)

In [None]:
rf3.classes_

In [None]:
y_pred_proba = rf3.predict_proba(X_train)
y_pred_proba

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
#  mode for target
y_train.mode()

In [None]:
#Evaluate the model's performance on train
y_predictions = rf3.predict(X_train)

In [None]:
print(classification_report(y_train, y_predictions))

In [None]:
print('Accuracy of random forest classifier model 3 on training set: {:.2f}'
     .format(rf3.score(X_train, y_train)))

In [None]:
# classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_predictions, output_dict=True)
print("Tree of depth 10")
pd.DataFrame(report)

In [None]:
# sklearn confusion matrix
cm = confusion_matrix(y_train, y_predictions)
print(cm)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf3.classes_)

disp.plot()
plt.show()

In [None]:
#Creating classification report on training data

TN, FP, FN, TP = confusion_matrix(y_train,y_predictions).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Looking at performance on validate data

In [None]:
rf3.score(X_validate, y_validate)

In [None]:
#Accuracy of validate set
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf3.score(X_validate, y_validate)))

In [None]:
#predictions
y_pred = rf3.predict(X_validate)

In [None]:
#Creating classification report on validate data

TN, FP, FN, TP = confusion_matrix(y_validate,y_pred).ravel()
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### CONCLUSION:
After making a few models, which one has the best performance (or closest metrics) on both train and validate?

-------## KNN Problems ##-------------

## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
### Acquire
titanic_df = acquire.get_titanic_data()
titanic_df.head()

In [None]:
#Prepare titanic data
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

In [None]:
#Drop Columns not needed for modeling
titanic_df.drop(['embark_town'], axis=1, inplace=True)

In [None]:
titanic_df.head()

In [None]:
# split into train, validate, test
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

In [None]:
#Impute age
train, validate, test = prepare.impute_mean_age(train, validate, test)

In [None]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
#X_train.shape
#X_validate.shape
X_test.shape

In [None]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
knn

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba

### 2. evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
pd.crosstab(y_train, y_pred)

In [None]:
plot_confusion_matrix(knn, X_train, y_train)

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()

In [None]:
ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Performance on Validate Data

In [None]:
print('Accuracy of KNN (k=5) classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

In [None]:
#Establish predictions from validate data
y_pred = knn.predict(X_validate)

In [None]:
plot_confusion_matrix(knn, X_validate, y_validate)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_validate, y_pred).ravel()

ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### 4. Run through steps 2-4 setting k to 10

In [None]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [None]:
#Fit
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)

#### evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
plot_confusion_matrix(knn, X_train, y_train)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()

ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Performance on Validate Data

In [None]:
print('Accuracy of KNN (k=10) classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

In [None]:
#Establish predictions from validate data
y_pred = knn.predict(X_validate)

In [None]:
plot_confusion_matrix(knn, X_validate, y_validate)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_validate, y_pred).ravel()

ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### 5. Run through steps 2-4 setting k to 20

In [None]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [None]:
#Fit
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
plot_confusion_matrix(knn, X_train, y_train)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()

ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### Performance on Validate Data

In [None]:
print('Accuracy of KNN (k=20) classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

In [None]:
#Establish predictions from validate data
y_pred = knn.predict(X_validate)

In [None]:
plot_confusion_matrix(knn, X_validate, y_validate)

In [None]:
TN, FP, FN, TP = confusion_matrix(y_validate, y_pred).ravel()

ALL = TP + TN + FP + FN

TP, TN, FP, FN

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

### For in-sample Data
#### Model 1 (k =5)

#### Model 2 (k=10)

#### Model 3 (k=20)


In [None]:
### The first model appears to perform better on in-sample data in some key measures (accuracy 76%), and leads
### the three models in Recall (65%). Model 2 has slightly better precision but lower accuracy and a lower recall (51%).
### The last model performs almost last in all metrics.

### 7. Which model performs best on our out-of-sample data from validate?

In [None]:
#The first model performs best on out of sample data (accuracy 76 percent). The third model performs
# slightly better on out of sample data (validate)

In [None]:
#Find Baseline
y_train.value_counts()
#Baseline is 0, did not survive

In [None]:
#Baseline Accuracy
(y_train == 0).mean()

### -------------------LOGISTIC REGRESSION----------------------

## Exercises

In [75]:
### Acquire
titanic_df = acquire.get_titanic_data()
### Prepare
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embark_town,alone,is_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
0,0,3,22.0,1,0,7.25,Southampton,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,Cherbourg,0,1,1,0,0
2,1,3,26.0,0,0,7.925,Southampton,1,1,0,0,1
3,1,1,35.0,1,0,53.1,Southampton,0,1,0,0,1
4,0,3,35.0,0,0,8.05,Southampton,1,0,0,0,1


In [76]:
#Dropping embark_town
titanic_df.drop(['embark_town'], axis=1, inplace=True)

In [77]:
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

In [78]:
#Impute age
train, validate, test = prepare.impute_mean_age(train, validate, test)

In [79]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

### 1. Create a model that includes age in addition to fare and pclass.

In [80]:
# Create the logistic regression
logit1 = LogisticRegression(random_state=123)

# specify features
features1 = ['age', 'fare', 'pclass']

# Fit a model using only these specified features
logit1.fit(X_train[features1], y_train)

y_pred1 = logit1.predict(X_train[features1])

print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features1], y_train)))

# classification report for Model 2 using train data
print(classification_report(y_train, y_pred1))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.70
              precision    recall  f1-score   support

           0       0.71      0.87      0.78       384
           1       0.66      0.42      0.51       239

    accuracy                           0.70       623
   macro avg       0.68      0.64      0.65       623
weighted avg       0.69      0.70      0.68       623



In [66]:
print('Coefficient: \n', logit1.coef_)
print('Intercept: \n', logit1.intercept_)

Coefficient: 
 [[-0.04012056  0.00639302 -0.9277399 ]]
Intercept: 
 [2.64369846]


In [67]:
y_pred_proba = logit1.predict_proba(X_train[features])

In [68]:
y_pred_proba

array([[0.6826964 , 0.3173036 ],
       [0.52545906, 0.47454094],
       [0.72297807, 0.27702193],
       ...,
       [0.6294886 , 0.3705114 ],
       [0.76367561, 0.23632439],
       [0.57143343, 0.42856657]])

In [69]:
print(confusion_matrix(y_train, y_pred))

[[333  51]
 [139 100]]


In [70]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       384
           1       0.66      0.42      0.51       239

    accuracy                           0.70       623
   macro avg       0.68      0.64      0.65       623
weighted avg       0.69      0.70      0.68       623



### Performance on Validate

In [71]:
y_pred1 = logit1.predict(X_validate[features])

In [72]:
print("Model 1 performance on validate")

# accuracy of model 1
print('Accuracy: {:.2f}'.format(logit1.score(X_validate[features], y_validate)))

# confusion matrix of model 1
print(confusion_matrix(y_validate, y_pred1))

# classification report of model 1
print(classification_report(y_validate, y_pred1))

Model 1 performance on validate
Accuracy: 0.69
[[67 15]
 [27 25]]
              precision    recall  f1-score   support

           0       0.71      0.82      0.76        82
           1       0.62      0.48      0.54        52

    accuracy                           0.69       134
   macro avg       0.67      0.65      0.65       134
weighted avg       0.68      0.69      0.68       134



### Baseline Accuracy

In [73]:
y_train.value_counts()
#Baseline is 0, did not survive
#Baseline Accuracy
(y_train == 0).mean()

0.6163723916532905

### 1. Does this model perform better than your baseline? 

In [None]:
#Yes it does (0.70)

### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [19]:
# Create the logistic regression
logit2 = LogisticRegression(random_state=123)

# specify features
features2 = ['age', 'fare', 'pclass', 'is_female']

# Fit a model using only these specified features
logit2.fit(X_train[features2], y_train)

y_pred = logit2.predict(X_train[features2])

print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train[features2], y_train)))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.79


In [20]:
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-0.03815414  0.00292526 -1.10149471  2.41405025]]
Intercept: 
 [2.13283132]


In [24]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[323  61]
 [ 68 171]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.83       384
           1       0.74      0.72      0.73       239

    accuracy                           0.79       623
   macro avg       0.78      0.78      0.78       623
weighted avg       0.79      0.79      0.79       623



### Performance on Validate

In [74]:
y_pred2 = logit2.predict(X_validate[features])

ValueError: X has 3 features, but LogisticRegression is expecting 4 features as input.

In [59]:
print("Model 2 performance on validate")

# accuracy of model 2
print('Accuracy: {:.2f}'.format(logit2.score(X_validate[features], y_validate)))

# confusion matrix of model 2
print(confusion_matrix(y_validate, y_pred))

# classification report of model 2
print(classification_report(y_validate, y_pred))

Model 2 performance on validate


ValueError: X has 3 features, but LogisticRegression is expecting 4 features as input.

### 3. Try out other combinations of features and models.

In [34]:
# Create the logistic regression
logit3 = LogisticRegression(random_state=123)

# specify features
features = ['age', 'fare', 'pclass']

# Fit a model using only these specified features
logit3.fit(X_train[features], y_train)

y_pred = logit3.predict(X_train[features])

print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train[features], y_train)))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.70


In [35]:
print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

Coefficient: 
 [[-0.04012056  0.00639302 -0.9277399 ]]
Intercept: 
 [2.64369846]


In [36]:
print(confusion_matrix(y_train, y_pred))

[[333  51]
 [139 100]]


In [37]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       384
           1       0.66      0.42      0.51       239

    accuracy                           0.70       623
   macro avg       0.68      0.64      0.65       623
weighted avg       0.69      0.70      0.68       623



### Performance on Validate

In [49]:
y_pred3 = logit3.predict_proba(X_validate[features])

In [50]:
y_pred3.shape

(134, 2)

In [52]:
print("Model 3 performance on validate")

# accuracy of model 3
print('Accuracy: {:.2f}'.format(logit3.score(X_validate[features], y_validate)))

# confusion matrix of model 3
print(confusion_matrix(y_validate, y_pred3))

# classification report of model 3
print(classification_report(y_validate, y_pred3))

Model 3 performance on validate
Accuracy: 0.69


ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

### Using Pclass, alone, gender

In [53]:
# Create the logistic regression
logit4 = LogisticRegression(random_state=123)

# specify features
features = ['pclass', 'alone', 'is_female']

# Fit a model using only these specified features
logit4.fit(X_train[features], y_train)

y_pred = logit4.predict(X_train[features])

print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train[features], y_train)))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.78


In [54]:
print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)

Coefficient: 
 [[-0.9164229  -0.34778639  2.36609031]]
Intercept: 
 [0.90697958]


In [55]:
y_pred4 = logit4.predict_proba(X_train[features])

In [56]:
y_pred4.shape

(623, 2)

In [58]:
print(confusion_matrix(y_train, y_pred))

ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

### Using Pclass, sex, embark Cherbourg

In [39]:
# Create the logistic regression
logit5 = LogisticRegression(random_state=123)

# specify features
features = ['pclass', 'is_female', 'embark_Cherbourg']

# Fit a model using only these specified features
logit5.fit(X_train[features], y_train)

y_pred = logit5.predict(X_train[features])

print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit5.score(X_train[features], y_train)))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.78


In [42]:
#Make predictions
y_pred_proba = logit5.predict_proba(X_train[features])

In [None]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))
print(confusion_matrix(y_train, y_pred))

### 4. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [None]:
# Using Pclass, sex, embark Cherbourg gave the best results

In [None]:
# performance on train data

y_pred = logit.predict(X_train)

print("Model 1: solver = lbfgs, c = 1")

# accuracy of model 1
print('Accuracy: {:.2f}'.format(logit.score(X_train, y_train)))

# confusion matrix of model 1
print(confusion_matrix(y_train, y_pred))

# classification report of model 1
print(classification_report(y_train, y_pred))

In [None]:
# performance on test

y_pred = logit.predict(X_test)
print("Model 1: solver = lbfgs, c = 1")
# accuracy of model 4
print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))
# confusion matrix of model 4
print(confusion_matrix(y_test, y_pred))
# classification report of model 4
print(classification_report(y_test, y_pred))