### 0. Importing packages

In [227]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from warnings import simplefilter
simplefilter(action="ignore",category=FutureWarning)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### 1. Load in data

In [153]:
path_to_data = "../data/"
gender_submission = pd.read_csv(path_to_data + "gender_submission.csv")
test = pd.read_csv(path_to_data + "test.csv")
train = pd.read_csv(path_to_data + "train.csv")

In [112]:
# example of a submission. DF has PassengerID and survival as encoded in binary
# Survived = 1, Didn't survive = 0
gender_submission.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0


### 2. Data Quality Check

In [113]:
# create X and y
X = train.drop("Survived",axis=1)
y = train['Survived']

#### 2.1 Drop cols with more than 1% NaNs

In [114]:
df_NaNs_percent = pd.DataFrame(X.isna().mean()*100,columns=['Percentage_NaNs']).sort_values(by='Percentage_NaNs', ascending=False)
df_NaNs_percent.reset_index(level=0, inplace= True)
df_NaNs_percent.rename(columns = {'index':'feature'},inplace=True)
df_NaNs_percent

Unnamed: 0,feature,Percentage_NaNs
0,Cabin,77.104377
1,Age,19.86532
2,Embarked,0.224467
3,PassengerId,0.0
4,Pclass,0.0
5,Name,0.0
6,Sex,0.0
7,SibSp,0.0
8,Parch,0.0
9,Ticket,0.0


In [115]:
# Drop columns with more than 1% NaNs
cols_to_drop = list(df_NaNs_percent.loc[df_NaNs_percent["Percentage_NaNs"] > 1,'feature'])
X = X.drop(columns = cols_to_drop)
print("Number of cols dropped: {}".format(len(cols_to_drop)))
print("-------------------------------------")
print("")
for column in cols_to_drop:
    print(column)

Number of cols dropped: 2
-------------------------------------

Cabin
Age


#### 2.2 Dropping rows in cols with NaN

In [182]:
# drop the rows from y too
y = y[~X['Embarked'].isna()]
# Drop 2 rows from dataset with NaN
X = X.dropna()

In [183]:
X = X.drop(['Name'],axis = 1)

In [184]:
X_enc = pd.get_dummies(X, columns=['Sex','Embarked'])

In [186]:
le = LabelEncoder()
le.fit(X_enc['Ticket'])
LabelEncoder()
list(le.classes_)
X_enc['Ticket'] = le.transform(X_enc['Ticket']) 

### 3A. Logistic Regression (Produces 48%)

In [187]:
sc = StandardScaler()

X_scaled = sc.fit_transform(X_enc)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y , test_size = 0.3, random_state=42)

# create classifier
logreg = LogisticRegression()

logreg.fit(X_train,y_train)

y_pred = logreg.predict(X_test)

scoring = ['accuracy', 'precision', 'recall_macro']
cv_score = cross_validate(logreg, X_test, y_test, scoring=scoring, cv=5)
print("Accuracy {}, Precision: {}, Recall: {}".format(np.mean(cv_score['test_accuracy']),np.mean(cv_score['test_precision']),np.mean(cv_score['test_recall_macro'])))

Accuracy 0.7978336827393431, Precision: 0.7554761904761905, Recall: 0.7763725490196078


### 3B. Random Forest Classifier (Produces 74%)

In [None]:
param_grid = {'criterion': ['gini',"entropy"]}

rf = RandomForestClassifier(n_estimators=1000)

X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state = 42)

rf_cv = GridSearchCV(rf,param_grid,cv=5)

rf_cv.fit(X_train,y_train)

print("Tuned logistic Regression Parameter: {}".format(rf_cv.best_params_))
print("Tuned logistic Regression Accuracy: {}".format(rf_cv.best_score_))

### 4. Make prediction on the test data

In [154]:
# Need to preprocess the test data in the same way.
# We can drop age and cabin as they were also dropped when we trained our data. However we cannot drop Fare, 
# as this was used when we trained the data. So we will impute with the average
test.drop(['Age', 'Cabin'],axis=1,inplace=True)
test.drop(['Name'],axis=1,inplace=True)
test_enc = pd.get_dummies(test, columns=['Sex','Embarked'])


In [163]:
le = LabelEncoder()
le.fit(test_enc['Ticket'])
LabelEncoder()
list(le.classes_)
test_enc['Ticket'] = le.transform(test_enc['Ticket']) 

In [176]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(np.array(test_enc['Fare']).reshape(-1, 1))
test_enc['Fare'] = imp_mean.transform(np.array(test_enc['Fare']).reshape(-1, 1))

In [190]:
final_pred = logreg.predict(test_enc)

### 5. Output Prediction

In [226]:
output = pd.DataFrame({'PassengerID': test_enc["PassengerId"], 'Survived': final_pred})
output_loc = "../predictions/"
output.to_csv(output_loc+str(datetime.now()),index=False)