In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

## <a id="read_data"></a> Read Data

In [None]:
# Directory that holds the competition CSV and parquet files, relative to the working directory.
# NOTE(review): the folder name looks misspelled ("Spacechip", "Competion") — confirm it matches the directory on disk.
path = Path('..') / 'Kaggle-Titanic-Spacechip-Competion'

In [None]:
# Raw competition tables read from CSV; in the cells visible here they are only inspected.
train_data, test_data = (
    pd.read_csv(path / csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# List the column names of the raw training table.
train_data.columns

In [None]:
# Feature matrices for the train / test / validation splits, read from parquet.
X_train, X_test, X_val = (
    pd.read_parquet(path / file_name)
    for file_name in ('X_train.pq', 'X_test.pq', 'X_val.pq')
)

# Matching targets: pull the `Transported` column out of each target file.
y_train, y_test, y_val = (
    pd.read_parquet(path / file_name)['Transported']
    for file_name in ('y_train.pq', 'y_test.pq', 'y_val.pq')
)

In [None]:
# (rows, columns) of the train and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Display the training feature matrix as loaded, before any feature selection is applied.
X_train

# Feature Selection

## Basic Methods

### Remove constant features

In [None]:
from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropConstantFeatures

In [None]:
# Variance filter with threshold 0; set_output(transform="pandas") makes transform return a DataFrame.
var_threshold = VarianceThreshold(threshold=0).set_output(transform="pandas")

In [None]:
# Fit the filter on the training features and keep the filtered frame for inspection.
var_threshold_df = var_threshold.fit_transform(X_train)

In [None]:
# Count the columns the filter keeps (get_support() yields one boolean per input column).
sum(var_threshold.get_support())

In [None]:
# feature_engine counterpart of the constant-feature filter.
# NOTE(review): per the feature_engine docs, tol=1 should target only fully constant columns — confirm.
# The selector is only constructed here; the fit/transform calls below are commented out.
sel = DropConstantFeatures(tol=1)
# sel.fit(X_train)

# X_train_t = sel.transform(X_train)
# X_train_t.shape

### Remove quasi-constant features

In [None]:
# Exploratory pass: low-variance filter (threshold 0.1) fitted on the training features.
# Only the resulting shape is inspected; X_train itself is left untouched.
sel = VarianceThreshold(threshold=0.1).set_output(transform="pandas")
X_train_t = sel.fit_transform(X_train)
X_train_t.shape

### Remove duplicate features

In [None]:
from feature_engine.selection import DropDuplicateFeatures

In [None]:
# Exploratory pass: duplicate-column filter fitted on the training features.
# Only the resulting shape is inspected; X_train itself is left untouched.
sel = DropDuplicateFeatures()
X_train_t = sel.fit_transform(X_train)
X_train_t.shape

In [None]:
# sel.duplicated_feature_sets_

### Applying duplicate and low-variance filters to X_train, X_test and X_val

In [None]:
drop_d = DropDuplicateFeatures()
var_t = VarianceThreshold(threshold=0.1).set_output(transform="pandas")

# Both selectors are fitted on the training split only: duplicates are removed first,
# then the low-variance filter is fitted on what remains.
X_train_ = drop_d.fit_transform(X_train)
X_train = var_t.fit_transform(X_train_)

# The already-fitted selectors are re-used, in the same order, on the other splits.
# NOTE: this cell rebinds X_train / X_test / X_val, so re-running it operates on the reduced frames.
X_test = var_t.transform(drop_d.transform(X_test))
X_val = var_t.transform(drop_d.transform(X_val))

In [None]:
# Number of feature columns left in the test split after the filters above.
X_test.shape[1]

## Correlation

### Correlated Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from feature_engine.selection import SmartCorrelatedSelection

In [None]:
# Pairwise Pearson correlation matrix of the current training features.
corrmat = X_train.corr(method="pearson")

In [None]:
# Draw the annotated correlation heatmap only when there are fewer than 20 features;
# with more columns nothing is plotted.
if X_train.shape[1] < 20:
    fig, ax = plt.subplots(figsize=(10, 8))
    cmap = sns.diverging_palette(220, 20, as_cmap=True)
    sns.heatmap(corrmat, cmap=cmap, annot=True, ax=ax)

In [None]:
from feature_engine.selection import DropCorrelatedFeatures

# Exploratory pass with DropCorrelatedFeatures (Pearson, threshold 0.9).
# Only the resulting shape is inspected; X_train itself is left untouched.
sel = DropCorrelatedFeatures(method="pearson", threshold=0.9)
X_train_t = sel.fit_transform(X_train)
X_train_t.shape

In [None]:
# Exploratory SmartCorrelatedSelection: Pearson correlation with a 0.9 threshold, using
# selection_method="model_performance" with a small seeded random forest scored by ROC AUC over 3 CV folds.
# The selector is only constructed here; the fit/transform calls in the next cells are commented out.
# The same configuration is built again as `smart_corr` further down, where it is actually fitted.
sel = SmartCorrelatedSelection(
    method="pearson",
    threshold=0.9,
    selection_method="model_performance",
    estimator=RandomForestClassifier(n_estimators=5, random_state=10),
    scoring="roc_auc",
    cv=3,
)

In [None]:
# sel.fit(X_train, y_train)

In [None]:
# X_train_t = sel.transform(X_train)
# X_train_t.shape

### Applying correlation-based selection to X_train, X_test and X_val

In [None]:
# Correlation-based selector: Pearson correlation with a 0.9 threshold, using
# selection_method="model_performance" with a small seeded random forest (ROC AUC, 3 CV folds).
smart_corr = SmartCorrelatedSelection(
    estimator=RandomForestClassifier(n_estimators=5, random_state=10),
    selection_method="model_performance",
    scoring="roc_auc",
    cv=3,
    method="pearson",
    threshold=0.9,
)

# Fitted on the training split (needs the target), then re-used unchanged on the other splits.
# NOTE: this cell rebinds X_train / X_test / X_val, so re-running it operates on the reduced frames.
X_train = smart_corr.fit_transform(X_train, y_train)
X_test, X_val = (smart_corr.transform(split) for split in (X_test, X_val))

In [None]:
# Persist the feature-selected splits.
# NOTE(review): these are the same file names the notebook reads its inputs from, so the
# original feature files are overwritten with the reduced frames and a later run starts from
# already-filtered data. Consider writing to distinct file names if the originals must be kept.
X_train.to_parquet(path / 'X_train.pq')
X_test.to_parquet(path / 'X_test.pq')
X_val.to_parquet(path / 'X_val.pq')

## Statistical Methods

In [None]:
# Display the training features after the selection steps above.
X_train

## Models

In [None]:
import joblib
from sklearn.metrics import accuracy_score

# Load previously saved estimators from pickle files.
# The file names are bare, so they resolve against the current working directory, not `path`.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code — only load
# files from a trusted source.
gbc_model = joblib.load('best_gbc_model.pkl')
lr_model = joblib.load('best_lr_model.pkl')
svc_model = joblib.load('best_svc_model.pkl')
xgb_model = joblib.load('best_xgb_model.pkl')
stacking_clf = joblib.load('stacking_clf.pkl')

In [None]:
# Display the training features that the loaded models are about to be fitted on.
X_train

In [None]:
# Fit each loaded estimator on the feature-selected training split.
# (presumably these are a gradient-boosting, SVC, logistic-regression, XGBoost and stacking
# classifier, going by the variable names — verify against the notebook that saved them)
gbc_model.fit(X_train, y_train)

In [None]:
# Fit the model loaded from best_svc_model.pkl.
svc_model.fit(X_train, y_train)

In [None]:
# Fit the model loaded from best_lr_model.pkl.
lr_model.fit(X_train, y_train)

In [None]:
# Fit the model loaded from best_xgb_model.pkl.
xgb_model.fit(X_train, y_train)

In [None]:
# Fit the model loaded from stacking_clf.pkl.
stacking_clf.fit(X_train, y_train)

In [None]:
def accuracy_check(model, X_val, y_val):
    """Score a fitted model on one labelled split.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_val : features passed to ``model.predict``.
    y_val : true labels the predictions are compared against.

    Returns
    -------
    The value returned by ``accuracy_score(y_val, predictions)``.
    """
    return accuracy_score(y_val, model.predict(X_val))

In [None]:
# Accuracy of the stacking classifier on the train, test and validation splits (one per line, in that order).
print(
    *(
        accuracy_check(stacking_clf, features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
    ),
    sep="\n",
)

In [None]:
# Accuracy of gbc_model on the train, test and validation splits (one per line, in that order).
print(
    *(
        accuracy_check(gbc_model, features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
    ),
    sep="\n",
)

In [None]:
# Accuracy of xgb_model on the train, test and validation splits (one per line, in that order).
print(
    *(
        accuracy_check(xgb_model, features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
    ),
    sep="\n",
)

In [None]:
# Validation-split accuracy of lr_model (shown as the cell's output).
accuracy_check(lr_model, X_val, y_val)

In [None]:
# Validation-split accuracy of svc_model (shown as the cell's output).
accuracy_check(svc_model, X_val, y_val)