# Loading Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2

# Load Dataset

In [2]:
def load_dataset(filename, train_or_test):
    df = pd.read_csv(filename)
    df = df.set_index('PassengerId')
    if(train_or_test):
        df['Transported'] = df['Transported'].astype(int)
        X = df.drop(['Transported'],axis=1)
        y = df['Transported']
        return X, y
    else:
        return df

In [8]:
X_train, y_train = load_dataset('../data/data_train.csv', 1)

In [7]:
X_test = load_dataset('../data/data_test.csv', 0)

# Modelling

In [21]:
def run(X_train, y_train, X_valid, y_valid):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    print("Accuracy:",accuracy_score(y_valid, y_pred))

In [22]:
# feature selection
def select_features(X_train, y_train, X_test, k):
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

# X_train_fs, X_valid_fs, fs = select_features(X_train, y_train, X_valid, k)

## `k=33` best accuracy

In [23]:
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test, 33)

In [24]:
scaler = StandardScaler()
X_train_fs = scaler.fit_transform(X_train_fs)
X_test_fs = scaler.transform(X_test_fs)

model = LogisticRegression()
model.fit(X_train_fs, y_train)

y_pred = model.predict(X_test_fs)

In [25]:
submission = pd.DataFrame({'PassengerId':X_test.index ,'Transported': y_pred.astype(bool)},
        columns=['PassengerId', 'Transported'])

submission.to_csv("../submissions/submission_with_SelectKBest_lg.csv",index=False)

# SVC()

In [28]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# defining parameter range
param_grid = {'C': [1, 1.5, 2, 3, 5, 10, 12, 20],
              'gamma': [0.002, 0.003, 0.004, 0.001],
              'kernel': ['rbf']
}

grid = RandomizedSearchCV(SVC(random_state=32), param_grid, verbose = 100, 
    scoring='accuracy', cv=3, n_iter=25)
 
# fitting the model for grid search
grid.fit(X_train_fs, y_train)
best_model = grid.best_estimator_

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV 1/3; 1/25] START C=10, gamma=0.004, kernel=rbf..............................
[CV 1/3; 1/25] END ............C=10, gamma=0.004, kernel=rbf; total time=   2.3s
[CV 2/3; 1/25] START C=10, gamma=0.004, kernel=rbf..............................
[CV 2/3; 1/25] END ............C=10, gamma=0.004, kernel=rbf; total time=   2.3s
[CV 3/3; 1/25] START C=10, gamma=0.004, kernel=rbf..............................
[CV 3/3; 1/25] END ............C=10, gamma=0.004, kernel=rbf; total time=   2.1s
[CV 1/3; 2/25] START C=2, gamma=0.004, kernel=rbf...............................
[CV 1/3; 2/25] END .............C=2, gamma=0.004, kernel=rbf; total time=   1.8s
[CV 2/3; 2/25] START C=2, gamma=0.004, kernel=rbf...............................
[CV 2/3; 2/25] END .............C=2, gamma=0.004, kernel=rbf; total time=   2.4s
[CV 3/3; 2/25] START C=2, gamma=0.004, kernel=rbf...............................
[CV 3/3; 2/25] END .............C=2, gamma=0.004

[CV 3/3; 17/25] END ............C=3, gamma=0.004, kernel=rbf; total time=   2.6s
[CV 1/3; 18/25] START C=1, gamma=0.003, kernel=rbf..............................
[CV 1/3; 18/25] END ............C=1, gamma=0.003, kernel=rbf; total time=   1.9s
[CV 2/3; 18/25] START C=1, gamma=0.003, kernel=rbf..............................
[CV 2/3; 18/25] END ............C=1, gamma=0.003, kernel=rbf; total time=   1.9s
[CV 3/3; 18/25] START C=1, gamma=0.003, kernel=rbf..............................
[CV 3/3; 18/25] END ............C=1, gamma=0.003, kernel=rbf; total time=   2.0s
[CV 1/3; 19/25] START C=1.5, gamma=0.001, kernel=rbf............................
[CV 1/3; 19/25] END ..........C=1.5, gamma=0.001, kernel=rbf; total time=   1.9s
[CV 2/3; 19/25] START C=1.5, gamma=0.001, kernel=rbf............................
[CV 2/3; 19/25] END ..........C=1.5, gamma=0.001, kernel=rbf; total time=   1.9s
[CV 3/3; 19/25] START C=1.5, gamma=0.001, kernel=rbf............................
[CV 3/3; 19/25] END ........

In [30]:
print(best_model)

SVC(C=10, gamma=0.004, random_state=32)


In [31]:
y_pred = best_model.predict(X_test_fs)

In [32]:
submission = pd.DataFrame({'PassengerId':X_test.index ,'Transported': y_pred.astype(bool)},
        columns=['PassengerId', 'Transported'])

submission.to_csv("../submissions/submission_with_SelectKBest_SVC.csv",index=False)