In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
 

In [None]:
# Load the Titanic passenger data into a DataFrame.
# The data folder can be overridden with the TITANIC_DATA_DIR environment
# variable so the notebook can run on other machines; when the variable is
# not set, the original local folder is used.
filename = 'titanic.csv'
path = os.environ.get(
    'TITANIC_DATA_DIR',
    'C:/Users/daniy/Desktop/SEM3/INTRO TO AI (PYTHON)/Lab Assignment 4/Exercise#1_daniyal')
fullpath = os.path.join(path, filename)
titanic_daniyal = pd.read_csv(fullpath)

In [None]:
# Preview the first three rows of the loaded data.
print(titanic_daniyal.iloc[:3])

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
print((len(titanic_daniyal.index), len(titanic_daniyal.columns)))

In [None]:
# Show column names, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
titanic_daniyal.info()

In [None]:
# List the distinct values that occur in the Sex column.
print(pd.unique(titanic_daniyal['Sex']))

In [None]:
# List the distinct values that occur in the Pclass column.
print(pd.unique(titanic_daniyal['Pclass']))

In [None]:
# Bar chart of passenger counts per survival outcome, split by passenger class.
# The crosstab's rows are the 'Survived' values, so those form the x-axis, and
# each bar counts passengers of one class (the legend) for that outcome.
# The axis labels describe exactly that.
pd.crosstab(titanic_daniyal['Survived'], titanic_daniyal['Pclass']).plot(kind = 'bar')
plt.title('# Survived vs Passenger Class')
plt.xlabel('Survived')
plt.ylabel('# of Passengers');

In [None]:
# Bar chart of passenger counts per survival outcome, split by sex.
# The crosstab's rows are the 'Survived' values, so those form the x-axis, and
# each bar counts passengers of one sex (the legend) for that outcome.
# The axis labels describe exactly that.
pd.crosstab(titanic_daniyal['Survived'], titanic_daniyal['Sex']).plot(kind = 'bar')
plt.title('# Survived vs Gender')
plt.xlabel('Survived')
plt.ylabel('# of Passengers');

In [None]:
# Pairwise scatter-plot matrix of the candidate predictors.
# scatter_matrix only draws numeric columns, so the text-valued 'Sex' column
# is converted to integer category codes first; passed as text it would be
# silently left out of the grid.
scatter_cols = ['Sex', 'Pclass', 'Fare', 'SibSp', 'Parch']
scatter_data = titanic_daniyal[scatter_cols].assign(
    Sex = titanic_daniyal['Sex'].astype('category').cat.codes)
pd.plotting.scatter_matrix(scatter_data, alpha=0.4, figsize=(13,15));

In [None]:
# Discard the columns that are not part of the feature list used for modelling.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
titanic_daniyal = titanic_daniyal.drop(columns = unused_columns)

In [None]:
# One-hot encode each categorical column and append the resulting indicator
# columns (named "<column>_<value>") to the frame.
catvars = ['Sex', 'Embarked']
for cat_name in catvars:
    titanic_daniyal = titanic_daniyal.join(
        pd.get_dummies(titanic_daniyal[cat_name], prefix = cat_name))

In [None]:
# The original categorical columns are redundant once their indicator columns exist.
titanic_daniyal = titanic_daniyal.drop(columns = catvars)

In [None]:
# Replace every missing value with the mean of its own column.
column_means = titanic_daniyal.mean()
titanic_daniyal = titanic_daniyal.fillna(value = column_means)

In [None]:
# Convert every column to float in a single frame-wide cast.
titanic_daniyal = titanic_daniyal.astype(float)

In [None]:
def normalize(df):
    """Min-max scale every column of ``df`` to the range [0, 1].

    Each column is transformed as
    ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; instead of dividing
    zero by zero (which would turn the whole column into NaN), such a column
    is returned as all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``; the input
        frame is not modified.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # Constant columns have a range of 0; dividing by 1 maps them to 0, not NaN.
    col_range = col_range.replace(0, 1)
    normalized = (df - col_min) / col_range
    return normalized
# Rescale the whole frame (target column included) with normalize().
titanic_daniyal = titanic_daniyal.pipe(normalize)

In [None]:
# Preview the first two rows after preprocessing.
print(titanic_daniyal.iloc[:2])

In [None]:
# Draw one histogram per column of the preprocessed frame.
hist_options = {'figsize': (9, 10)}
titanic_daniyal.hist(**hist_options)

In [None]:
# Predictor columns for the model; 'Survived' is the target.
featurecols = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
    'Sex_female', 'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
x_daniyal = titanic_daniyal.loc[:, featurecols]
y_daniyal = titanic_daniyal.loc[:, 'Survived']

In [None]:
# Seed NumPy's global random number generator with 72.
np.random.seed(seed = 72)

In [None]:
# Hold out 30% of the rows for testing; the rest is the training split.
split_parts = train_test_split(x_daniyal, y_daniyal, test_size = 0.3)
x_train_daniyal, x_test_daniyal, y_train_daniyal, y_test_daniyal = split_parts

In [None]:
# Fit a logistic regression with default settings on the training split.
lr = LogisticRegression()
lr.fit(X = x_train_daniyal, y = y_train_daniyal)

In [None]:
# Table of fitted coefficients, one row per feature.
# lr.coef_ is two-dimensional (one row of coefficients per fitted decision
# function); with this two-valued target the first row holds all of them, so
# it is taken directly. This shows plain numbers under named columns instead
# of 1-element arrays under columns 0 and 1.
pd.DataFrame({'feature': x_train_daniyal.columns,
              'coefficient': lr.coef_[0]})

In [None]:
# 10-fold cross-validated accuracy, computed on the training split only.
cv_model = LogisticRegression(solver = 'lbfgs')
scores = cross_val_score(estimator = cv_model,
                         X = x_train_daniyal,
                         y = y_train_daniyal,
                         scoring = 'accuracy',
                         cv = 10)
print(scores)

In [None]:
# Average accuracy across the cross-validation folds.
print(np.mean(scores))

In [None]:
# Compare 10-fold cross-validated accuracy across several test-set proportions.
# The proportions are rounded to two decimals because np.arange with a float
# step yields values such as 0.15000000000000002, which print noisily and are
# not the exact proportion intended.
for i in np.round(np.arange(0.1, 0.5, 0.05), 2):
    # Loop-local names keep this experiment from overwriting the notebook-level
    # train/test variables. Only the training part is scored, so no separate
    # model is fitted here (cross_val_score fits its own estimator per fold).
    x_train_i, x_holdout_i, y_train_i, y_holdout_i = train_test_split(
        x_daniyal, y_daniyal, test_size = i)
    scores = cross_val_score(LogisticRegression(solver = 'lbfgs'), x_train_i, y_train_i,
                             scoring = 'accuracy', cv = 10)
    print("Testing data size: ", i)
    print("Min Score: ", min(scores))
    print("Max Score: ", max(scores))
    print("Mean Score: ", scores.mean())
    print()

In [None]:
# Draw a fresh 70/30 split and refit the model on its training part.
final_split = train_test_split(x_daniyal, y_daniyal, test_size = 0.3)
x_train_daniyal, x_test_daniyal, y_train_daniyal, y_test_daniyal = final_split
lr = LogisticRegression()
lr.fit(X = x_train_daniyal, y = y_train_daniyal)

In [None]:
# Per-class probabilities for every test row (one column per class).
y_pred_daniyal = lr.predict_proba(X = x_test_daniyal)

In [None]:
# Flag a row as positive when the second probability column exceeds 0.5.
y_pred_daniyal_flag = np.greater(y_pred_daniyal[:, 1], 0.5)

In [None]:
# Share of test rows whose thresholded prediction matches the true label.
print(accuracy_score(y_true = y_test_daniyal, y_pred = y_pred_daniyal_flag))

In [None]:
# Counts of true labels (rows) against thresholded predictions (columns).
print(confusion_matrix(y_true = y_test_daniyal, y_pred = y_pred_daniyal_flag))

In [None]:
# Text summary of per-class metrics for the thresholded predictions.
print(classification_report(y_true = y_test_daniyal, y_pred = y_pred_daniyal_flag))

In [None]:
# Repeat the evaluation with a stricter cut-off: a row is flagged positive
# only when the second probability column exceeds 0.75.
y_pred_daniyal_flag = np.greater(y_pred_daniyal[:, 1], 0.75)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test_daniyal, y_pred_daniyal_flag))