In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import chi2_contingency
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE, SelectKBest, chi2
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve, auc

In [2]:
# Load the training set, the test set and the sample submission
# from the shared data directory in a single pass.
train_data, test_data, submission_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

In [3]:
# Turn boolean flags into the string labels 'True' / 'False' so these columns
# are handled as categorical text downstream.
bool_to_str = {True: 'True', False: 'False'}

# The target column is only present in the training data.
train_data['Transported'] = train_data['Transported'].replace(bool_to_str)

# The remaining flag columns exist in both the training and the test data.
for frame in (train_data, test_data):
    for flag_col in ('VIP', 'CryoSleep'):
        frame[flag_col] = frame[flag_col].replace(bool_to_str)

In [4]:
# Columns that are not used as model inputs in either data set.
unused_cols = ['Name', 'PassengerId', 'Cabin']

# Keep the label aside before removing it from the feature frame.
train_target = train_data['Transported']
train_data = train_data.drop(columns=unused_cols + ['Transported'])
test_data = test_data.drop(columns=unused_cols)

In [5]:
# Split the feature frame by dtype: object (string) columns feed the categorical
# branch of the preprocessor, numeric columns feed the numeric branch.
# The 'object' dtype string is used instead of the `np.object` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24 (AttributeError there).
cat_feat = train_data.select_dtypes(include='object')
num_feat = train_data.select_dtypes(include=np.number)

In [6]:
# Hold out 30% of the labelled data for validation; the fixed seed keeps the
# split identical between runs.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    train_data,
    train_target,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' makes the encoder emit an all-zero
# row for a category that was not present at fit time (e.g. in a CV fold or in
# the test set) instead of raising an error during transform.
cat_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

# Numeric branch: fill missing values with the column mean, then normalise.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling each feature (column) - confirm this is intended under the step name
# 'scaler'. Any replacement must keep values non-negative if the chi2 selector
# is applied afterwards.
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', Normalizer())
    ])

# Route each column group to its branch; columns listed in neither group are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('encoder', cat_pipe, cat_feat.columns),
        ('scaler', num_pipe, num_feat.columns)
    ])

In [8]:
# Sweep the number of features kept by the chi2 selector (k = 1..16) and record
# the hold-out accuracy of a decision tree for each k.
# NOTE(review): the upper bound 16 presumably equals the number of columns
# produced by the preprocessor - confirm if the feature set changes.
# NOTE(review): k is chosen on the same hold-out split that is later reported
# as the "test score", so that score is optimistically biased.
test_scores = []

for k in range(1, 17):
    clf = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('selector', SelectKBest(score_func=chi2, k=k)),
            # Seeded so the sweep gives the same scores on every run; an
            # unseeded tree breaks ties between equally good splits randomly.
            ('classifier', DecisionTreeClassifier(random_state=42))
        ])

    clf.fit(X_train_val, y_train_val)
    test_scores.append(clf.score(X_test_val, y_test_val))

# Index of the first maximum; k is that index plus one because the sweep
# starts at k = 1.
best_idx = int(np.argmax(test_scores))
print('best k values =', best_idx + 1)
print('classification score =', test_scores[best_idx])

best k values = 16
classification score = 0.7396472392638037


In [35]:
# Final pipeline: preprocessing, chi2 selection of the 16 best-scoring
# features, then a decision tree.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('selector', SelectKBest(score_func=chi2, k=16)),
        # Seeded so that fitting this pipeline (and any search built on it)
        # is reproducible; an unseeded tree can score differently on each run.
        ('classifier', DecisionTreeClassifier(random_state=42))
    ])

In [36]:
# Fit the whole pipeline (preprocessing, feature selection, classifier) on the
# training split only.
clf.fit(X_train_val, y_train_val)

In [37]:
# Predict labels for the hold-out split; the labels are the strings
# 'True' / 'False'.
clf.predict(X_test_val)

array(['False', 'False', 'True', ..., 'False', 'True', 'True'],
      dtype=object)

In [38]:
# Report the fitted pipeline's score on the training split and on the hold-out
# split; a large gap between the two indicates overfitting.
for label, features, labels in (
    ('train score =', X_train_val, y_train_val),
    ('test score =', X_test_val, y_test_val),
):
    print(label, clf.score(features, labels))

train score = 0.9232539030402629
test score = 0.7404141104294478


In [39]:
# Candidate decision-tree settings; the 'classifier__' prefix routes each
# parameter to the pipeline step named 'classifier'.
grid_param = {
    'classifier__max_depth': [4, 5, 6, 10, None],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__min_samples_split': [2, 3],
    'classifier__min_samples_leaf': [1, 2],
}

# Exhaustive search over every combination with 5-fold cross-validation on the
# training split.
grid = GridSearchCV(estimator=clf, param_grid=grid_param, cv=5)

grid.fit(X_train_val, y_train_val)

In [40]:
# Parameter combination selected by the grid search.
grid.best_params_

{'classifier__max_depth': 10,
 'classifier__max_features': None,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 2}

In [41]:
# Score of the grid search's selected model on the training split.
grid.score(X_train_val, y_train_val)

0.838783894823336

In [42]:
# Score of the grid search's selected model on the hold-out split.
grid.score(X_test_val, y_test_val)

0.772239263803681