# Loading Basic Libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load Dataset

In [8]:
def load_dataset(filename):
    """Read the CSV at ``filename`` and split it into features and target.

    The ``Transported`` column is cast to ``int`` and ``PassengerId`` becomes
    the index before the split.

    Args:
        filename: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the frame without ``Transported`` and
        ``y`` is the ``Transported`` Series; both are indexed by ``PassengerId``.
    """
    frame = (
        pd.read_csv(filename)
        .assign(Transported=lambda d: d['Transported'].astype(int))
        .set_index('PassengerId')
    )
    target = frame['Transported']
    features = frame.drop(columns=['Transported'])
    return features, target

In [9]:
# Bind the loaded data under both name pairs: later cells read X_df / y_df
# (the k-sweep bounds and the RFE section) as well as X / y, so both must
# exist on a fresh kernel.
X_df, y_df = load_dataset('../data/data_train.csv')
X, y = X_df, y_df

# Split data

In [31]:
# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Modelling

In [32]:
def run(X_train, y_train, X_valid, y_valid):
    """Fit a scaled logistic regression and print its validation accuracy.

    The scaler is fitted on the training features only and then applied to
    both sets. Nothing is returned; the accuracy is written to stdout.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels.
    """
    standardizer = StandardScaler().fit(X_train)
    train_scaled = standardizer.transform(X_train)
    valid_scaled = standardizer.transform(X_valid)

    classifier = LogisticRegression().fit(train_scaled, y_train)
    predictions = classifier.predict(valid_scaled)

    print("Accuracy:", accuracy_score(y_valid, predictions))

In [33]:
# feature selection
def select_features(X_train, y_train, X_test, k):
    """Keep the ``k`` best features according to the chi-squared score.

    The selector is fitted on the training data only and then applied to both
    feature sets.
    NOTE(review): chi2 scoring expects non-negative feature values - confirm
    the inputs satisfy that.

    Args:
        X_train: Training features used to fit the selector.
        y_train: Training labels.
        X_test: Features to reduce with the fitted selector.
        k: Number of features to keep.

    Returns:
        tuple: ``(X_train_reduced, X_test_reduced, fitted_selector)``.
    """
    selector = SelectKBest(score_func=chi2, k=k)
    selector.fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test), selector

# X_train_fs, X_valid_fs, fs = select_features(X_train, y_train, X_valid, k)

In [34]:
# Sweep k from 2 up to the full feature count. The bound is taken from
# X_train because that is the frame the selector is fitted on, so k can never
# exceed the number of available columns.
n_features = X_train.shape[1]
for k in range(2, n_features + 1):
    print("k =", k)
    X_train_fs, X_valid_fs, fs = select_features(X_train, y_train, X_valid, k)
    run(X_train_fs, y_train, X_valid_fs, y_valid)

k = 2
Accuracy: 0.5968947671075331
k = 3
Accuracy: 0.6302472685451409
k = 4
Accuracy: 0.6906267970097757
k = 5
Accuracy: 0.7619321449108684
k = 6
Accuracy: 0.7636572742955722
k = 7
Accuracy: 0.7642323174238068
k = 8
Accuracy: 0.777458309373203
k = 9
Accuracy: 0.78205865439908
k = 10
Accuracy: 0.7797584818861415
k = 11
Accuracy: 0.7837837837837838
k = 12
Accuracy: 0.7832087406555491
k = 13
Accuracy: 0.7855089131684876
k = 14
Accuracy: 0.7866589994249569
k = 15
Accuracy: 0.7763082231167338
k = 16
Accuracy: 0.7722829212190915
k = 17
Accuracy: 0.7878090856814262
k = 18
Accuracy: 0.7912593444508338
k = 19
Accuracy: 0.7889591719378953
k = 20
Accuracy: 0.7924094307073031
k = 21
Accuracy: 0.7918343875790684
k = 22
Accuracy: 0.7918343875790684
k = 23
Accuracy: 0.7912593444508338
k = 24
Accuracy: 0.7912593444508338
k = 25
Accuracy: 0.7912593444508338
k = 26
Accuracy: 0.7889591719378953
k = 27
Accuracy: 0.7889591719378953
k = 28
Accuracy: 0.7883841288096607
k = 29
Accuracy: 0.7883841288096607
k =

In [36]:
# Baseline for comparison: the same model on the split as-is, with no
# feature selection applied.
run(X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid)

Accuracy: 0.7906843013225991


# Recursive Feature Elimination (`RFE`)

In [41]:
# Reset to the full, unsplit data before the RFE experiments. Loading it here
# keeps the cell independent of whatever earlier cells left in the kernel and
# makes it safe to re-run.
X_df, y_df = load_dataset('../data/data_train.csv')
X, y = X_df, y_df

In [45]:
# Stratified 70/30 hold-out split for this section; the fixed seed keeps it
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [48]:
from sklearn.feature_selection import RFE

# Recursively eliminate features until 4 remain. n_features_to_select is
# passed by keyword because recent scikit-learn releases reject it as a
# positional argument.
model = LogisticRegression(max_iter=100000)
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(X_train, y_train)  # fit() returns the fitted selector itself
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)



Num Features: 4
Selected Features: [ True False False False False False False False False  True False False
 False False False False False False False False False False  True  True
 False False False False False False False False False False False False]


In [None]:
# Sweep k with SelectKBest's default score function on the existing 70/30
# split. The selector is fitted on the training rows only, so the validation
# rows never influence which features are kept. Loop-local names are used so
# the notebook-level X / y / X_train / X_valid / fit are left untouched for
# the cells that follow.
for k in range(2, X_train.shape[1] + 1):
    print("k =", k)
    kbest = SelectKBest(k=k).fit(X_train, y_train)
    X_train_k = kbest.transform(X_train)
    X_valid_k = kbest.transform(X_valid)
    run(X_train_k, y_train, X_valid_k, y_valid)

In [49]:
# Score the fitted RFE selector on the hold-out set. `rfe` is the unambiguous
# handle to that selector; `fit` is a generic name that other cells may rebind
# to objects without a predict() method.
y_pred = rfe.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_pred))

Accuracy: 0.7285276073619632
