### Import the essential libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

SEED = np.random.seed(9)

In [2]:
path = '../../data/processed/'

train = pd.read_csv(f'{path}train_processed.csv')
test = pd.read_csv(f'{path}test_processed.csv')

### Applying the models to the training dataset

In [3]:
X = train.drop(['PassengerId','Survived'], axis=1)
y = train['Survived']

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=SEED)

#### Logistic Regression

In [5]:
rl_clf = LogisticRegression(random_state=SEED)
rl_clf = rl_clf.fit(X_train,y_train)

y_pred_rl = rl_clf.predict(X_val)

In [6]:
y_pred_rl = rl_clf.predict(X_val)

##### Acurracy

In [7]:
lr_ac = accuracy_score(y_val, y_pred_rl)
print(f"Accuracy score (Logistic Regression): {lr_ac:.3f}")

Accuracy score (Logistic Regression): 0.800


In [8]:
rl_cm = confusion_matrix(y_val, y_pred_rl)
print(f"Confusion matrix: \n{rl_cm}")

Confusion matrix: 
[[155  22]
 [ 37  81]]


#### Random Forest Classifier

In [9]:
rf_clf = RandomForestClassifier(random_state=SEED)
rf_clf = rf_clf.fit(X_train,y_train)

In [10]:
y_pred_rf = rf_clf.predict(X_val)

##### Acurracy

In [11]:
rf_ac = accuracy_score(y_val, y_pred_rf)
print(f"Accuracy score (Random Forest): {rf_ac:.3f}")

Accuracy score (Random Forest): 0.810


In [12]:
rf_cm = confusion_matrix(y_val, y_pred_rf)
print(f"Confusion matrix: \n{rl_cm}")

Confusion matrix: 
[[155  22]
 [ 37  81]]


#### MLP Classifier

In [13]:
mlp_clf = MLPClassifier(random_state=SEED,max_iter=10000, learning_rate='adaptive')
mlp_clf = mlp_clf.fit(X_train,y_train)

In [14]:
y_pred_mlp = mlp_clf.predict(X_val)

##### Acurracy

In [15]:
mlp_ac = accuracy_score(y_val, y_pred_mlp)
print(f"Accuracy score (MLP): {mlp_ac:.3f}")

Accuracy score (MLP): 0.793


In [16]:
mlp_cm = confusion_matrix(y_val, y_pred_mlp)
print(f"Confusion matrix: \n{mlp_cm}")

Confusion matrix: 
[[158  19]
 [ 42  76]]


### Ranking the accuracy of machine learning models

In [17]:
rank = pd.DataFrame({'Models': ['Logistic','RandomForest','MLP'],
                     'Accuracy': [lr_ac,rf_ac,mlp_ac]})

rank

Unnamed: 0,Models,Accuracy
0,Logistic,0.8
1,RandomForest,0.810169
2,MLP,0.79322


### Test Dataset

In [18]:
test.head(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,IsMale,Relatives,Embarked
0,892,3,0.588235,0,0,-0.28067,1,0,2
1,893,3,1.568627,1,0,-0.3158,0,1,0
2,894,2,2.745098,0,0,-0.201943,1,0,2


In [19]:
X_test = test.drop('PassengerId', axis=1)

#### Making predictions using our best model

In [20]:
y_pred = rf_clf.predict(X_test)

- Creating a new column with predictions in the test dataset

In [21]:
test['Survived'] = y_pred
test.sample(2)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,IsMale,Relatives,Embarked,Survived
382,1274,3,0.0,0,0,0.00194,0,0,0,1
90,982,3,-0.392157,1,0,-0.023479,0,1,0,1


In [22]:
submission = test[['PassengerId', 'Survived']]

In [23]:
submission.to_csv('../../data/predict/submission1.csv', index=False)

Plubic Score: 0.744