In [1]:
#Je achete le boeuf!
#plotting imports
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


#modeling imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

#pipeline modules
import acquire
import explore
import prepare

In [2]:
# Load the prepared Titanic data and drop rows with missing values.
# dropna() is chained (not inplace) so the object returned by
# prepare.prep_titanic() is never mutated and this cell yields a fresh frame.
df = prepare.prep_titanic().dropna()

# Sanity check: every column should now report zero missing values.
df.isnull().sum()

passenger_id    0
survived        0
pclass          0
sex             0
age             0
sibsp           0
parch           0
fare            0
embarked        0
class           0
embark_town     0
alone           0
dtype: int64

In [3]:
# The data is acquired and prepared.
# Separate the predictors (X) from the target (y), then split each of them
# into a train part and a test part (30% of the rows go to test).
X = df[['pclass', 'age', 'fare', 'sibsp', 'parch']]
y = df[['survived']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=.30,
)

## LOGISTIC REGRESSION

In [4]:
# Logistic regression with class 1 (survived) given a weight of 2.
# NOTE(review): the features are passed to the 'saga' solver as-is; sklearn
# documents that saga's fast convergence is only guaranteed when features are
# on a similar scale, and any ConvergenceWarning is hidden by the global
# warnings.filterwarnings("ignore") in the import cell -- confirm convergence.
logit = LogisticRegression(
    C=1,
    class_weight={1: 2},
    random_state=123,
    solver='saga',
)

In [5]:
# Fit the logistic regression on the training split.
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Keep the fitted coefficients (logit.coef_) for the printed summary.
x = logit.coef_

In [None]:
# Report the fitted coefficients and the intercept of the logistic regression.
print(f"The coefficients for the predictors are {x}.")
print(f"The y-intercept for the model is {logit.intercept_}.")

In [6]:
# Predicted labels of the target variable for the training rows, produced by
# the logistic regression model that was fit on X_train and y_train.
# Some are right, some are wrong.
y_pred = logit.predict(X_train)

In [7]:
# Predicted class probabilities for the same training rows.
y_pred_proba = logit.predict_proba(X_train)

In [8]:
# Confusion matrix of the actual training labels (y_train) against the predictions (y_pred).
blob = confusion_matrix(y_train, y_pred)

In [9]:
# Print the 2x2 confusion matrix.
print(blob)

[[169 124]
 [ 52 154]]


### Looks like poor precision (about 0.55) but decent recall (about 0.75) for the positive (survived) class on the training set.

In [10]:
# Text classification report (per-class metrics) for the training predictions.
cr = classification_report(y_train, y_pred)

In [11]:
# Score the fitted logistic regression on the held-out test split.
score = logit.score(X_test,y_test)

In [16]:
# Display the score computed on X_test / y_test.
score

0.5627906976744186

### Why do all that work by hand when `sklearn.metrics.classification_report` will do it for us?

In [None]:
# Print the classification report text stored in `cr`.
print(cr)

In [13]:
# Unpack the 2x2 confusion matrix row by row: the first row holds the actual
# negatives, the second row the actual positives; the columns are the
# predicted negative / predicted positive counts.
(true_negative, false_positive), (false_negative, true_positive) = blob

In [14]:
# Accuracy: correct predictions of either class over all training observations.
accuracy = (true_positive + true_negative) / len(y_train)
# Recall: correctly predicted positives divided by all actual positives,
# i.e. TP / (TP + FN) -- the false negatives are the ones the model missed.
recall = true_positive / (false_negative + true_positive)
# Precision: correctly predicted positives divided by everything the model
# predicted as positive, i.e. TP / (TP + FP).
precision = true_positive / (false_positive + true_positive)
# False positive rate: works along the actual negatives, FP / (FP + TN).
false_positive_rate = false_positive / (true_negative + false_positive)

In [15]:
# Report the hand-computed training metrics (the false positive rate is computed but not printed).
print(f"The accuracy is {accuracy}")
print(f"The recall is {recall}")
print(f"The precision is {precision}")

The accuracy is 0.6472945891783567
The recall is 0.7475728155339806
The precision is 0.5539568345323741


In [None]:
def do_the_logRegression(X_train, y_train, my_solver):
    """Fit a logistic regression with the given solver and return its training accuracy.

    Parameters
    ----------
    X_train
        Feature matrix used both to fit and to score the model.
    y_train
        Target labels aligned with ``X_train``.
    my_solver : str
        Solver name forwarded to ``LogisticRegression(solver=...)``.

    Returns
    -------
    float
        Score of the fitted model on ``(X_train, y_train)``. This is an
        in-sample score, not a held-out estimate.
    """
    # Hyper-parameters are fixed so that the solver is the only thing that varies
    # between calls.
    model = LogisticRegression(C=1, class_weight={1: 2}, random_state=123, solver=my_solver)
    model.fit(X_train, y_train)
    # Only the score is needed; predictions and probabilities were previously
    # computed here and then discarded.
    return model.score(X_train, y_train)

In [None]:
# Try the helper with the 'lbfgs' solver; the returned training score is the cell output.
do_the_logRegression(X_train, y_train, 'lbfgs')

In [None]:
# Solver names to compare with do_the_logRegression.
solvers = ["lbfgs", "liblinear", "sag", "saga", "newton-cg"]

In [None]:
# Print the training-set score for each solver (the helper scores on the same data it fits on).
for i in solvers:
    print(f"The accuracy for {i} as the solver is {do_the_logRegression(X_train, y_train, i)}")

## DECISION TREES

In [None]:
# Fit the decision tree classifier to the training sample and make predictions on the training sample.
# The usual three steps: 1) create, 2) fit, 3) predict.
# Step 1: create the classifier (entropy criterion, depth limited to 3).
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)

In [None]:
# Step 2: fit the tree on the training split.
clf.fit(X_train, y_train)

In [None]:
# Step 3: predict labels and class probabilities for the training rows.
# Note: this rebinds y_pred / y_pred_proba, replacing the logistic regression values.
y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

In [None]:
# type(y_pred)
# y_pred[:6]
# y_train[:6]

In [None]:
# Confusion matrix of the actual training labels against the decision tree predictions.
conf_matx = confusion_matrix(y_train, y_pred)

In [None]:
# Print the decision tree confusion matrix.
print(conf_matx)

In [None]:
# Inspect the Python type of the confusion matrix object.
type(conf_matx)

# Disabled experiment: conf_matx.transpose()

In [None]:
# Per-class metrics for the decision tree predictions on the training set.
print(classification_report(y_train, y_pred))

In [None]:
def do_the_decisionTree(X=None, y=None, criterion='gini', max_depth=3):
    """Fit a decision tree classifier and return its score on the data it was fit on.

    Parameters
    ----------
    X
        Feature matrix. Defaults to the notebook-level ``X_train`` when omitted.
    y
        Target labels aligned with ``X``. Defaults to the notebook-level
        ``y_train`` when omitted.
    criterion : str
        Split criterion forwarded to ``DecisionTreeClassifier`` (default ``'gini'``).
    max_depth : int
        Maximum tree depth forwarded to ``DecisionTreeClassifier`` (default ``3``).

    Returns
    -------
    float
        Score of the fitted tree on ``(X, y)``. This is an in-sample score,
        not a held-out estimate.
    """
    # Fall back to the notebook's training split so the existing zero-argument
    # call keeps working exactly as before.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=123)
    tree.fit(X, y)
    # Only the score is needed; predictions and probabilities were previously
    # computed here and then discarded.
    return tree.score(X, y)

In [None]:
# Run the decision tree helper; the returned training score is the cell output.
do_the_decisionTree()

## RANDOM FORESTS

In [None]:
# First random forest: 100 trees that may grow deep (max_depth=20) with leaves
# as small as a single sample; random_state is fixed for reproducibility.
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [None]:
# Fit the forest on the training split, then predict labels and class
# probabilities for those same training rows.
# Note: this rebinds y_pred / y_pred_proba, replacing the decision tree values.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)

In [None]:
# Training-set score of the first forest, formatted to two decimals.
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
# Classification report as a dict (output_dict=True) so individual entries can be looked up by key.
class_report = classification_report(y_train, y_pred, output_dict=True)

In [None]:
# Report entry stored under the string key '0'.
class_report['0']

In [None]:
# Report entry stored under the string key '1'.
class_report['1']

In [None]:
# Test-set score of the first forest (rf), formatted to two decimals.
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

In [None]:
# Second random forest: same settings as rf except for shallower trees
# (max_depth=3) and larger leaves (min_samples_leaf=5).
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)
# Fit on the training split, then predict labels and class probabilities for
# the training rows under separate names so the rf results are not overwritten.
rf2.fit(X_train, y_train)
y_pred2 = rf2.predict(X_train)
y_pred_proba2 = rf2.predict_proba(X_train)

In [None]:
# Classification report for the second forest's training predictions, as a dict.
class_report2 = classification_report(y_train, y_pred2, output_dict=True)

In [None]:
# Report entry stored under the string key '1' for the second forest.
class_report2['1']

In [None]:
# Test-set score of the second forest (rf2); the printed label is identical to
# the one used for rf, so tell the two outputs apart by cell position.
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf2.score(X_test, y_test)))