# Loading Basic Libraries

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2

# Load Dataset

In [59]:
def load_dataset(filename, train_or_test):
    df = pd.read_csv(filename)
    df = df.set_index('PassengerId')
    if(train_or_test):
        df['Transported'] = df['Transported'].astype(int)
        X = df.drop(['Transported'],axis=1)
        y = df['Transported']
        return X, y
    else:
        return df

In [61]:
X, y = load_dataset('../data/data_train.csv', 1)

# Split data

In [63]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,stratify=y,test_size=0.2,random_state=0)

# Modelling

In [64]:
def run(X_train, y_train, X_valid, y_valid):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    print("Accuracy:",accuracy_score(y_valid, y_pred))

In [65]:
# feature selection
def select_features(X_train, y_train, X_test, k):
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

# X_train_fs, X_valid_fs, fs = select_features(X_train, y_train, X_valid, k)

In [34]:
for k in range(2, X_df.shape[1]+1): 
    print("k =",k)
    X_train_fs, X_valid_fs, fs = select_features(X_train, y_train, X_valid, k)
    run(X_train_fs, y_train, X_valid_fs, y_valid)

k = 2
Accuracy: 0.5968947671075331
k = 3
Accuracy: 0.6302472685451409
k = 4
Accuracy: 0.6906267970097757
k = 5
Accuracy: 0.7619321449108684
k = 6
Accuracy: 0.7636572742955722
k = 7
Accuracy: 0.7642323174238068
k = 8
Accuracy: 0.777458309373203
k = 9
Accuracy: 0.78205865439908
k = 10
Accuracy: 0.7797584818861415
k = 11
Accuracy: 0.7837837837837838
k = 12
Accuracy: 0.7832087406555491
k = 13
Accuracy: 0.7855089131684876
k = 14
Accuracy: 0.7866589994249569
k = 15
Accuracy: 0.7763082231167338
k = 16
Accuracy: 0.7722829212190915
k = 17
Accuracy: 0.7878090856814262
k = 18
Accuracy: 0.7912593444508338
k = 19
Accuracy: 0.7889591719378953
k = 20
Accuracy: 0.7924094307073031
k = 21
Accuracy: 0.7918343875790684
k = 22
Accuracy: 0.7918343875790684
k = 23
Accuracy: 0.7912593444508338
k = 24
Accuracy: 0.7912593444508338
k = 25
Accuracy: 0.7912593444508338
k = 26
Accuracy: 0.7889591719378953
k = 27
Accuracy: 0.7889591719378953
k = 28
Accuracy: 0.7883841288096607
k = 29
Accuracy: 0.7883841288096607
k =

In [36]:
run(X_train, y_train, X_valid, y_valid)

Accuracy: 0.7906843013225991


## `k=33` best accuracy

In [66]:
X_train_fs, X_valid_fs, fs = select_features(X_train, y_train, X_valid, 33)
run(X_train_fs, y_train, X_valid_fs, y_valid)

Accuracy: 0.7947096032202415


In [69]:
X_test = load_dataset('../data/data_test.csv', 0)

In [72]:
X_train_fs, X_test, fs = select_features(X, y, X_test, 33)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_fs)
    X_valid = scaler.transform(X_)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    print("Accuracy:",accuracy_score(y_valid, y_pred))

# Recursive Feature Elimination (`RFE`)

In [40]:
def my_rfe(X_train, y_train, X_test, model, n_features):
    rfe = RFE(model, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)
    return X_train_rfe, X_test_rfe, rfe

In [42]:
X_train_rfe, X_valid_rfe, rfe = my_rfe(X_train, y_train, X_valid, LogisticRegression(max_iter=10000), 3)

In [52]:
for k in range(2, X_df.shape[1], 5): 
    print("n_features =",k)
    X_train_rfe, X_valid_rfe, rfe = my_rfe(X_train, y_train, X_valid, LogisticRegression(max_iter=10000), 3)   
    run(X_train_rfe, y_train, X_valid_rfe, y_valid)

n_features = 2
Accuracy: 0.7130534790109259
n_features = 7
Accuracy: 0.7130534790109259
n_features = 12
Accuracy: 0.7130534790109259
n_features = 17
Accuracy: 0.7130534790109259
n_features = 22
Accuracy: 0.7130534790109259
n_features = 27
Accuracy: 0.7130534790109259
n_features = 32
Accuracy: 0.7130534790109259
