# Importing Libraries

In [42]:
import pandas as pd
import os
import warnings

warnings.filterwarnings('ignore')
import numpy as np

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

## Reading Data


In [2]:
# Capture the kernel's current working directory and echo it, so the
# notebook's starting location is visible in the output.
path = os.getcwd()

print('{}'.format(path))

/Users/emillypitman/Documents/kaggle_titanic/notebooks


In [3]:
# Change into the project's data directory so the CSV files can be opened by
# bare file name in the following cells.
#
# The target is derived from the current working directory instead of a
# user-specific absolute path, so the notebook is not tied to one machine.
# Layout assumed (matches the path printed by the previous cell):
#   <project>/notebooks  <- kernel starts here
#   <project>/data       <- CSV files live here
# Because notebooks/ and data/ share the same parent, re-running this cell
# after the chdir resolves to the same data/ directory again.
os.chdir(os.path.join(os.path.dirname(os.getcwd()), "data"))

In [51]:
# Load the pre-cleaned train and test tables from the current working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_cleaned.csv", "test_cleaned.csv")
)

In [52]:
# Preview the first five rows of the training table.
df_train.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [53]:
# Preview the first five rows of the test table (it carries PassengerId, not Survived).
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,1,0
1,893,3,47.0,1,0,7.0,0,0,1
2,894,2,62.0,0,0,9.6875,1,1,0
3,895,3,27.0,0,0,8.6625,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1


# Getting X and y

In [54]:
# Feature matrix: the eight predictor columns, in this fixed order.
X = df_train.loc[
    :,
    ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
]

# Target vector: the survival label for each training row.
y = df_train.loc[:, "Survived"]


In [55]:
# Sanity check: first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1


In [56]:
# Sanity check: first five target labels.
y.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Train Test Split


In [57]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep='\n')

(712, 8)
(712,)


In [59]:
# Shapes of the validation features and labels, one per line.
print(X_val.shape, y_val.shape, sep='\n')

(179, 8)
(179,)


# Logistic Regression without Cross-Validation

In [60]:
# Baseline: fit a default logistic regression once on the training split and
# score it once on the hold-out split.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

# Hard class labels feed accuracy and the classification report; the second
# column of predict_proba (the score for the second class) feeds AUC.
y_pred = lr.predict(X_val)
y_pred_proba = lr.predict_proba(X_val)[:, 1]

print('Accuracy: {}'.format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print('AUC: {}'.format(roc_auc_score(y_true=y_val, y_score=y_pred_proba)))
print('{}'.format(classification_report(y_true=y_val, y_pred=y_pred)))

Accuracy: 0.7988826815642458
AUC: 0.8803088803088803
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



# Cross-Validation

In [61]:
# Fresh, unfitted estimator for the manual cross-validation loop.
lr = LogisticRegression()

# Five consecutive, unshuffled folds.
# `random_state` is intentionally not passed: it only has an effect together
# with `shuffle=True`, and scikit-learn >= 0.24 raises a ValueError when a seed
# is given while shuffling is off. The rows of X_train were already shuffled
# (with a fixed seed) by train_test_split, so contiguous folds are fine and the
# fold assignment stays deterministic.
kf = KFold(n_splits=5)

In [62]:
# Per-fold scores, filled in by the loop below.
accs_list = []
aucs_list = []

# Manual K-fold over the training split only; X_val / y_val are not touched.
for fit_rows, eval_rows in kf.split(X_train):
    # kf.split yields positional row numbers, hence .iloc rather than .loc.
    X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_eval, y_eval = X_train.iloc[eval_rows], y_train.iloc[eval_rows]

    # Refit the shared estimator on this fold's fitting rows.
    lr.fit(X_fit, y_fit)

    # Score the held-out rows: accuracy from hard labels, AUC from the
    # second predict_proba column.
    accs_list.append(accuracy_score(y_eval, lr.predict(X_eval)))
    aucs_list.append(roc_auc_score(y_eval, lr.predict_proba(X_eval)[:, 1]))

# NOTE: after the loop, `lr` holds the fit from the final fold only.


# Evaluating results

In [63]:
# Summarise the manual CV run as mean +- (population) standard deviation across folds.
print('Accuracies: {} +- {}'.format(np.asarray(accs_list).mean(), np.asarray(accs_list).std()))
print('AUCS: {} +- {}'.format(np.asarray(aucs_list).mean(), np.asarray(aucs_list).std()))

Accuracies: 0.79068255687974 +- 0.029645568962170632
AUCS: 0.8444525867819067 +- 0.024192689003076594


# Cross-validation with a one-liner

In [64]:
# Same 5-fold accuracy estimate, this time via the library helper.
# NOTE(review): per the scikit-learn docs, an integer `cv` with a classifier
# uses stratified folds, so these folds differ from the plain KFold loop above.
cross_val_accs = cross_val_score(
    estimator=lr,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

In [65]:
# Display the array of per-fold accuracies returned by cross_val_score.
cross_val_accs

array([0.81818182, 0.7972028 , 0.78873239, 0.73943662, 0.82394366])

In [66]:
# Mean +- standard deviation of the cross_val_score accuracies.
print('Accuracies: {} +- {}'.format(cross_val_accs.mean(), cross_val_accs.std()))

Accuracies: 0.7934994582881907 +- 0.02998432443672972


# Predicting on test set

In [67]:
# Re-inspect the test table before building the prediction inputs.
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,1,0
1,893,3,47.0,1,0,7.0,0,0,1
2,894,2,62.0,0,0,9.6875,1,1,0
3,895,3,27.0,0,0,8.6625,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1


In [70]:
# Keep the passenger IDs aside: they label the submission rows but are not a
# model feature.
ids = df_test['PassengerId']

# Select exactly the columns the model is trained on, in training order, so
# that any extra column in the test file cannot shift or widen the inputs.
X_test = df_test[X.columns]

# Refit on every labelled row before scoring the test set. Without this, `lr`
# is whatever the previously executed cell left behind: running top to bottom,
# that is the fit from the last manual CV fold (four fifths of X_train), since
# cross_val_score works on clones and does not refit `lr`. The hold-out split
# has already served its purpose for model assessment, so the submission model
# uses all of X / y.
lr.fit(X, y)

y_pred_final = lr.predict(X_test)

In [77]:
# Place the passenger IDs and the predicted labels side by side.
# concat aligns on the row index: `ids` carries df_test's index while the
# prediction frame gets a default 0..n-1 index, so this presumably relies on
# df_test having the default index from read_csv.
df_final = pd.concat(
    [ids, pd.DataFrame(y_pred_final)],
    axis='columns',
)

In [79]:
# Name the two columns as the submission file expects; the prediction column
# produced by the concat above has no meaningful name until this assignment.
df_final.columns = ['PassengerId', 'Survived']

In [80]:
# Preview the first five rows of the submission table.
df_final.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


# Saving submission file

In [81]:
# Write the submission without the row index. The relative file name resolves
# against the current working directory (changed by the os.chdir cell earlier
# in the notebook).
df_final.to_csv(path_or_buf='submission_lr.csv', index=False)