In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

from acquire import get_titanic_data
from prepare import prep_titanic
import warnings
warnings.filterwarnings("ignore")

In [2]:
train, validate, test = prep_titanic()
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')
train.head()

train -> (497, 14)
validate -> (214, 14)
test -> (178, 14)


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked_Q,embarked_S,sex_male
583,583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,1
337,337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,0
50,50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,1
218,218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,0
31,31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,0


### 1. Start by defining your baseline model.

In [3]:
train.dropna(inplace=True)

In [4]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [5]:
train['baseline_prediction'] = 'not survived'
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked_Q,embarked_S,sex_male,baseline_prediction
583,583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,1,not survived
337,337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,0,not survived
50,50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,1,not survived
218,218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,0,not survived
31,31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,0,not survived


In [6]:
pd.crosstab(train.baseline_prediction, train.survived)

survived,0,1
baseline_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
not survived,307,190


In [7]:
baseline_accuracy = 1- (train.survived).mean()
baseline_accuracy

0.6177062374245472

### 2. Create another model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [8]:
X_train = train[['pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['pclass','age','fare']]
y_test = test[['survived']]

In [9]:
# Do we need to Impute for age to use the most frequent value?
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

In [10]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_validate)
X_validate = imp_mean.transform(X_validate)

In [11]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [12]:
logit = LogisticRegression()

In [13]:
logit = logit.fit(X_train, y_train)

In [14]:
print(logit.coef_)

print(logit.intercept_)

[[-0.97983178 -0.03051881  0.00266519]]
[2.52970125]


In [15]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [16]:
y_pred_proba

array([[0.36397951, 0.63602049],
       [0.34139883, 0.65860117],
       [0.6265983 , 0.3734017 ],
       [0.31505319, 0.68494681],
       [0.26359851, 0.73640149],
       [0.56992912, 0.43007088],
       [0.66202017, 0.33797983],
       [0.56377681, 0.43622319],
       [0.78619933, 0.21380067],
       [0.75721249, 0.24278751],
       [0.83302159, 0.16697841],
       [0.5524311 , 0.4475689 ],
       [0.26852561, 0.73147439],
       [0.44097115, 0.55902885],
       [0.65470646, 0.34529354],
       [0.69246728, 0.30753272],
       [0.78606491, 0.21393509],
       [0.20972644, 0.79027356],
       [0.60657891, 0.39342109],
       [0.75981083, 0.24018917],
       [0.67296533, 0.32703467],
       [0.78618813, 0.21381187],
       [0.44717612, 0.55282388],
       [0.70630063, 0.29369937],
       [0.43849213, 0.56150787],
       [0.74851029, 0.25148971],
       [0.44449993, 0.55550007],
       [0.47490776, 0.52509224],
       [0.33049719, 0.66950281],
       [0.55461193, 0.44538807],
       [0.

In [17]:
# accuracy
logit.score(X_train, y_train)

0.716297786720322

In [18]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[265  42]
 [ 99  91]]


In [19]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



### I calculated a baseline accuracy 62% and the model showed a accuracy of 72%

### 3. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [20]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [21]:
logit2 = LogisticRegression()

In [22]:
logit2 = logit.fit(X_train, y_train)

In [23]:
print(logit2.coef_)

print(logit2.intercept_)

[[-2.45878213e+00 -1.11402368e+00 -2.66594879e-02  9.02716903e-04]]
[4.30664987]


In [24]:
y_pred = logit2.predict(X_train)
y_pred_proba = logit2.predict_proba(X_train)

In [25]:
# accuracy
logit2.score(X_train, y_train)

0.7987927565392354

In [26]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[263  44]
 [ 56 134]]


In [27]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



### Accuracy is at 80% now with sex included

### 4. Try out other combinations of features and models.

In [28]:
# Creating a model with alone embarked Q & S & fare

In [29]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked_Q,embarked_S,sex_male,baseline_prediction
583,583,0,1,male,36.0,0,0,40.125,First,Cherbourg,1,0,0,1,not survived
337,337,1,1,female,41.0,0,0,134.5,First,Cherbourg,1,0,0,0,not survived
50,50,0,3,male,7.0,4,1,39.6875,Third,Southampton,0,0,1,1,not survived
218,218,1,1,female,32.0,0,0,76.2917,First,Cherbourg,1,0,0,0,not survived
31,31,1,1,female,29.916875,1,0,146.5208,First,Cherbourg,0,0,0,0,not survived


In [30]:
X_train = train[['alone','embarked_Q','embarked_S','fare']]
y_train = train[['survived']]
X_validate = validate[['alone','embarked_Q','embarked_S','fare']]
y_validate = validate[['survived']]
X_test = test[['alone','embarked_Q','embarked_S','fare']]
y_test = test[['survived']]

In [31]:
logit3 = LogisticRegression()

In [32]:
logit3 = logit3.fit(X_train, y_train)

In [33]:
print(logit3.coef_)

print(logit3.intercept_)

[[-0.82913469  0.20969254 -0.26724887  0.00935761]]
[-0.12836476]


In [34]:
# accuracy
logit3.score(X_train, y_train)

0.6720321931589537

In [35]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[263  44]
 [ 56 134]]


In [36]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



### Going to do 3. the model set with sex and change some of the 

In [37]:
X_train = train[['sex_male','pclass','age','fare']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','age','fare']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','age','fare']]
y_test = test[['survived']]

In [38]:
logit4 = LogisticRegression(C=1000, class_weight={0:1, 1:3}, random_state=123, max_iter=500, solver='lbfgs')


In [39]:
logit4.fit(X_train, y_train)

LogisticRegression(C=1000, class_weight={0: 1, 1: 3}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=500, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
LogisticRegression(C=1000, random_state=123)


LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
print('Coefficient: \n', logit4.coef_)
print('Intercept: \n', logit4.intercept_)

Coefficient: 
 [[-2.48146317e+00 -1.04576529e+00 -2.88485000e-02  1.36953490e-03]]
Intercept: 
 [5.31245845]


In [42]:
y_pred = logit4.predict(X_train)

In [43]:
y_pred_proba = logit4.predict_proba(X_train)

In [44]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit4.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.73


With a low c ~.1 accuracy went down 1 hundredth .01 to .79. I increased C to 1500 and accuracy stayed at 80%. When weights were added class_weight={0: 1, 1: 99}, the accuracy went down tremendously to .38. Adjusted the class_weight={0: 1, 1: 3} and got 73%


In [45]:
X_train = train[['sex_male','pclass','sibsp','age']]
y_train = train[['survived']]
X_validate = validate[['sex_male','pclass','sibsp','age']]
y_validate = validate[['survived']]
X_test = test[['sex_male','pclass','sibsp','age']]
y_test = test[['survived']]

In [46]:
logit5 = LogisticRegression()

In [47]:
logit5 = logit5.fit(X_train, y_train)

In [48]:
print(logit5.coef_)

print(logit5.intercept_)

[[-2.57815189 -1.14702511 -0.29479208 -0.03177517]]
[4.79231522]


In [49]:
# accuracy
logit5.score(X_train, y_train)

0.7867203219315896

In [50]:
#Confusion Matrix
print(confusion_matrix(y_train, y_pred))

[[205 102]
 [ 31 159]]


In [51]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.67      0.76       307
           1       0.61      0.84      0.71       190

    accuracy                           0.73       497
   macro avg       0.74      0.75      0.73       497
weighted avg       0.77      0.73      0.74       497



### 5. Use you best 3 models to predict and evaluate on your validate sample. Logit 2, 4, & 5

In [59]:
y_pred2 = logit2.predict(X_validate)
y_pred4 = logit4.predict(X_validate)
y_pred5 = logit5.predict(X_validate)

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred2))

print(classification_report(y_validate, y_pred2))

print("Model 4: solver = lbfgs, c = 1000 ,class_weight={0:1, 1:3}, random_state=123, max_iter=500")

print('Accuracy: {:.2f}'.format(logit4.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred4))

print(classification_report(y_validate, y_pred4))

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

Model 2: solver = lbfgs, c = 1
Accuracy: 0.74
[[94 38]
 [18 64]]
              precision    recall  f1-score   support

           0       0.84      0.71      0.77       132
           1       0.63      0.78      0.70        82

    accuracy                           0.74       214
   macro avg       0.73      0.75      0.73       214
weighted avg       0.76      0.74      0.74       214

Model 4: solver = lbfgs, c = 1000 ,class_weight={0:1, 1:3}, random_state=123, max_iter=500
Accuracy: 0.68
[[76 56]
 [13 69]]
              precision    recall  f1-score   support

           0       0.85      0.58      0.69       132
           1       0.55      0.84      0.67        82

    accuracy                           0.68       214
   macro avg       0.70      0.71      0.68       214
weighted avg       0.74      0.68      0.68       214

Model 5: solver = lbfgs, c = 1
Accuracy: 0.76
[[109  23]
 [ 28  54]]
              precision    recall  f1-score   support

           0       0.80      0.8

### 6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [60]:
y_pred = logit5.predict(X_test)
y_pred_proba = logit5.predict_proba(X_test)

print("Model 5: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit5.score(X_test, y_test)))

print(confusion_matrix(y_validate, y_pred5))

print(classification_report(y_validate, y_pred5))

Model 5: solver = lbfgs, c = 1
Accuracy: 0.81
[[109  23]
 [ 28  54]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



In [None]:
My model got an accuracy of .79 on test, .76 on validate and .81 on test

In [66]:
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   model.ipynb[m
	[31mmodified:   model_evaluation.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mEvaluatingModelPerformance.pdf[m
	[31mModeling.pdf[m
	[31mPython_Seaborn_Cheat_Sheet.pdf[m
	[31mclassification .pdf[m
	[31mexplore_practice.ipynb[m
	[31miris_df.csv[m
	[31mtitanic_df.csv[m
	[31muntidy_data/[m

no changes added to commit (use "git add" and/or "git commit -a")


In [55]:
!git add acquire.py
!git add classification_exercises.ipynb
!git add model_evaluation.ipynb
!git add model_logistic_regression_lect.ipynb
!git add prepare.py

In [63]:
!git add model.ipynb

In [64]:
!git commit -m "Adding model work"

[master be3cbb6] Adding model work
 1 file changed, 2152 insertions(+)
 create mode 100644 model.ipynb


In [65]:
!git push

Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 11.31 KiB | 3.77 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/george887/classification_exercises.git
   e25efed..be3cbb6  master -> master
