In [82]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier

In [13]:
# Share of the labelled rows used for fitting; the remainder is held out
# for validation.
TRAIN_FRACTION = 0.8

data_X = np.loadtxt('preprocessed_data/train_X.csv', delimiter=',')
data_y = np.loadtxt('preprocessed_data/train_y.csv', delimiter=',')

# Features and labels are read from two separate files. Check that they line
# up row-for-row here, otherwise a mismatch only surfaces later as a shape
# error in a distant cell (or pairs rows with the wrong labels).
assert len(data_X) == len(data_y), (
    f"feature/label row count mismatch: {len(data_X)} vs {len(data_y)}"
)

# Positional (unshuffled) split: the first n rows train, the rest validate.
n = int(len(data_X) * TRAIN_FRACTION)

train_X, train_y = data_X[:n], data_y[:n]
val_X, val_y = data_X[n:], data_y[n:]

In [14]:
# Sanity check: shapes of the four arrays produced by the train/validation split.
tuple(part.shape for part in (train_X, train_y, val_X, val_y))

((624, 25), (624,), (156, 25), (156,))

In [66]:
# Baseline model: the "prior" strategy ignores the features and always
# predicts the most frequent training class, giving a floor for accuracy.
dummy_clf = DummyClassifier(strategy="prior")
dummy_clf.fit(X=train_X, y=train_y)

In [68]:
# Accuracy of the baseline on the held-out validation rows.
y_pred = dummy_clf.predict(X=val_X)
accuracy_score(y_true=val_y, y_pred=y_pred)

0.5769230769230769

In [76]:
# Fixed seed for the forest's bootstrap sampling and per-split feature
# sub-sampling, so the selected hyper-parameters and the reported scores are
# repeatable across kernel restarts instead of drifting from run to run.
SEED = 42

rf = RandomForestClassifier(random_state=SEED)

# Hyper-parameter grid searched exhaustively (4 * 5 * 3 * 3 = 180 combinations).
# NOTE(review): every max_depth value here is far deeper than trees grown on
# a few hundred rows are likely to reach, so these five settings probably
# behave the same as an unbounded tree and multiply the search cost by five
# for no gain — confirm, then consider a smaller range.
parameters = {
    'n_estimators': [200, 230, 260, 300],
    'max_depth': [170, 200, 230, 260, 300],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search scored by accuracy, using all available cores.
clf = GridSearchCV(rf, parameters, scoring='accuracy', cv=5, n_jobs=-1)
clf.fit(train_X, train_y)

In [70]:
# Show the estimator the grid search kept for its best-scoring parameter combination.
clf.best_estimator_

In [78]:
# Mean cross-validated score (accuracy, as configured on `clf`) of the best parameter combination.
clf.best_score_

0.7980387096774194

In [77]:
# Accuracy of the tuned random forest on the held-out validation rows,
# for comparison against the baseline and the cross-validated score.
y_pred = clf.predict(X=val_X)
accuracy_score(y_true=val_y, y_pred=y_pred)

0.8269230769230769

In [84]:
# Load the preprocessed test features and the matching passenger identifiers.
test_data = np.loadtxt('preprocessed_data/test_X.csv', delimiter=',')
passenger_ids = np.loadtxt('preprocessed_data/passenger_ids.csv', delimiter=',')

# Predict with the tuned model and pair each id with its prediction,
# both cast to integers, as a two-column array.
test_preds = clf.predict(test_data)
final_preds = np.column_stack(
    (passenger_ids.astype(int), test_preds.astype(int))
)

# Write the submission file without the DataFrame index column.
df = pd.DataFrame(final_preds, columns=('PassengerId', 'Survived'))
df.to_csv(path_or_buf='submission_ml.csv', index=False)