# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.cluster import KMeans
import sklearn
import mlflow
from mlflow.models import infer_signature

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

mlflow.set_tracking_uri("http://127.0.0.1:5000")

# import keras_tuner as kt
from sklearn.ensemble import StackingClassifier
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
import sklearn.preprocessing as prepoc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier
from scipy.stats import randint, uniform, reciprocal, expon

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# for building and training neural networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from hyperopt import STATUS_OK, STATUS_FAIL, Trials, fmin, hp, tpe

# Reading the data

In [None]:
from pathlib import Path

In [None]:
# Directory (one level above the notebook) that holds the competition files.
path = Path('..', 'Kaggle-Titanic-Spacechip-Competion')

In [None]:
# Load the raw train and test splits from the data directory.
train_data, test_data = (
    pd.read_csv(path / file_name) for file_name in ('train.csv', 'test.csv')
)

# EDA

### Outliers

In [None]:
# One scatter panel per spending feature, each plotted against the target,
# arranged on a 3x2 grid so extreme values stand out.
# Each entry is (column to plot, label used in the panel title).
panels = [
    ('VRDeck', 'VRDeck'),
    ('ShoppingMall', 'ShoppingMall'),
    ('Spa', 'Spa'),
    ('RoomService', 'RoomService'),
    ('total_spent', 'Total Spent'),
    ('FoodCourt', 'FoodCourt'),
]

fig, axs = plt.subplots(3, 2, figsize=(10, 6))

# ravel() walks the grid row by row, matching the order of `panels`.
for ax, (feature, label) in zip(axs.ravel(), panels):
    ax.scatter(treated_train_data['Transported'], treated_train_data[feature])
    ax.set_xlabel('Transported')
    ax.set_ylabel(feature)
    ax.set_title(f'Scatter Plot: Transported vs {label}')

plt.tight_layout()  # Adjust the padding between and around the subplots
plt.show()

In [None]:
# Rows with extreme FoodCourt or ShoppingMall spending are removed as outliers.
drop_ix_food = treated_train_data.index[treated_train_data['FoodCourt'] > 20000]
drop_ix_shopping = treated_train_data.index[treated_train_data['ShoppingMall'] > 10000]

# Drop the union in a single call: a row exceeding both thresholds appears in
# both index sets, and dropping it a second time would raise a KeyError.
treated_train_data = treated_train_data.drop(drop_ix_food.union(drop_ix_shopping))

### Checking relationships between features

In [None]:
# Pairwise scatter plots of the spending features, coloured by the target.
numerical_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

panel_width, panel_height = 1.5, 1.3
figsize = (panel_width, panel_height)
g = sns.PairGrid(
    train_data,
    hue='Transported',
    vars=numerical_columns,
    height=panel_height,
    aspect=panel_width / panel_height,
)
g.map_lower(sns.scatterplot)
legend = g.add_legend(loc='upper right', bbox_to_anchor=(0.65, 0.7))

# Only the lower triangle is drawn; hide the diagonal and everything above it.
hidden_rows, hidden_cols = np.triu_indices_from(g.axes, 0)
for row, col in zip(hidden_rows, hidden_cols):
    g.axes[row, col].set_visible(False)

plt.show()

In [None]:
# Separate the target (as 0/1 integers), then remove it and the identifier
# columns from the feature frame.
y_train = treated_train_data['Transported'].astype(int)
for dropped_column in ('Transported', 'PassengerId', 'passenger_group'):
    treated_train_data.drop(dropped_column, axis=1, inplace=True)

### Feature Distribution

In [None]:
# One histogram per spending feature, side by side on a single row.
fig, axes = plt.subplots(nrows=1, ncols=len(numerical_columns), figsize=(12, 4))

for ax, var in zip(axes, numerical_columns):
    ax.hist(treated_train_data[var], bins=20, alpha=0.7)
    ax.set_title(var)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson transform for float columns, one-hot encoding for non-numeric
# columns; every remaining column is passed through unchanged.
# sparse_threshold=0 forces a dense result so it can be wrapped in a DataFrame
# regardless of how sparse the one-hot block turns out to be.
normalizer_pipeline = Pipeline([
    ('feature_processing', ColumnTransformer([
        ('normalizer', PowerTransformer(method='yeo-johnson'), make_column_selector(dtype_include=float)),
        ('encoder', OneHotEncoder(), make_column_selector(dtype_exclude=np.number))
    ], remainder='passthrough', sparse_threshold=0)),
])

train_values = normalizer_pipeline.fit_transform(treated_train_data)

# Output names carry a '<transformer>__' prefix; strip it to recover the plain
# column names. Computed once and reused for both splits.
feature_names = [
    col.replace('normalizer__', '').replace('remainder__', '').replace('encoder__', '')
    for col in normalizer_pipeline.named_steps['feature_processing'].get_feature_names_out()
]

# Keep the original row index: rows were removed earlier, so a fresh 0..n-1
# index would no longer line up with y_train.
treated_train_data = pd.DataFrame(train_values,
                                  columns=feature_names,
                                  index=treated_train_data.index)

treated_test_data = pd.DataFrame(normalizer_pipeline.transform(treated_test_data),
                                 columns=feature_names,
                                 index=treated_test_data.index)

In [None]:
# Redraw the per-feature histograms to inspect the distributions again.
fig, axes = plt.subplots(nrows=1, ncols=len(numerical_columns), figsize=(12, 4))

for position, var in enumerate(numerical_columns):
    panel = axes[position]
    panel.hist(treated_train_data[var], bins=20, alpha=0.7)
    panel.set_title(var)

plt.tight_layout()
plt.show()

# Models

In [None]:
# Load the feature matrices, their PCA counterparts and the targets for the
# train / test / validation splits.
splits = ('train', 'test', 'val')

X_train, X_test, X_val = (pd.read_parquet(path / f'X_{split}.pq') for split in splits)

pca_train, pca_test, pca_val = (pd.read_parquet(path / f'pca_{split}.pq') for split in splits)

# Targets are stored as single-column frames; keep just the label series.
y_train, y_test, y_val = (
    pd.read_parquet(path / f'y_{split}.pq')['Transported'] for split in splits
)

In [None]:
# Engineered passenger-group features removed from every split.
columns = [
    "passenger_group_count_frequency_yj",
    "passenger_group_ordinal_enc_equal_freq",
    "passenger_group_count_frequency_equal_freq_ef_ordinal_yj"
]

X_train, X_test, X_val = (
    frame.drop(columns=columns) for frame in (X_train, X_test, X_val)
)


In [None]:
# Quick sanity check on the dimensions of the loaded data.
for loaded in (X_train, y_test, y_val):
    print(loaded.shape)

In [None]:
# From here on the models are trained on the PCA features.
X_train, X_test, X_val = pca_train, pca_test, pca_val

# Re-select the targets for the rows present in the PCA frames.
# NOTE(review): .iloc is positional while .index holds labels; this is only
# correct if the PCA frames carry positional (0..n-1 style) indices — confirm.
y_train = y_train.iloc[pca_train.index]
y_test = y_test.iloc[pca_test.index]
y_val = y_val.iloc[pca_val.index]

In [None]:
def objective(params):
    """Objective handed to hyperopt: evaluate one set of hyperparameters.

    Delegates to whichever ``train_model`` is currently defined in the
    notebook, passing the module-level train/validation data, and returns its
    result unchanged. MLflow tracking happens inside ``train_model``.
    """
    return train_model(
        params,
        epochs=3,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
    )

### Gradient Boosting Classifier

In [None]:
X_train

In [None]:
# All runs started below are recorded under this MLflow experiment.
mlflow.set_experiment("/mlops_gbc_4")

# Model signature inferred from the training features and labels; attached to
# every model logged in this section.
signature = infer_signature(X_train, y_train)

# Hyperopt search space for GradientBoostingClassifier. Entries built with
# hp.* are sampled per trial; plain numbers are held fixed.
space = {
    'n_estimators': hp.quniform('n_estimators', 680, 830, 1),
    # Bounds are given in log space, so sampling is log-uniform in [0.003, 0.0075].
    'learning_rate': hp.loguniform('learning_rate', np.log(0.003), np.log(0.0075)),
    'max_depth': 4,
    'subsample': hp.uniform('subsample', 0.92, 1),
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'max_features': hp.uniform('max_features', 0.31, 0.37)
}

def train_model(params, epochs, X_train, y_train, X_val, y_val):
    """Train and evaluate one GradientBoostingClassifier candidate.

    The fitted model, its hyperparameters and its validation metrics are
    logged to a nested MLflow run.

    Args:
        params: Hyperparameter values sampled from the search space.
        epochs: Not used by this estimator; accepted because ``objective``
            passes it.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features used for scoring.
        y_val: Validation labels used for scoring.

    Returns:
        dict: ``loss`` (RMSE between validation labels and predictions, the
        value hyperopt minimises), ``eval_accuracy``, ``status`` and the
        fitted ``model``.
    """
    # Work on a copy so the dict handed in by the caller is left untouched.
    params = dict(params)

    # Integer-valued settings may arrive as floats (e.g. from hp.quniform).
    for name in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'):
        params[name] = int(params[name])

    model = GradientBoostingClassifier(**params)

    with mlflow.start_run(nested=True):

        model.fit(X_train, y_train)

        # Predict on the validation set
        preds = model.predict(X_val)
        mse = mean_squared_error(y_val, preds)
        eval_rmse = np.sqrt(mse)
        eval_accuracy = accuracy_score(y_val, preds)

        # Log hyperparameters and the evaluation metrics
        mlflow.log_params(params)
        mlflow.log_metric("eval_rmse", eval_rmse)
        mlflow.log_metric("eval_accuracy", eval_accuracy)

        mlflow.sklearn.log_model(
            model, "model",
            signature=signature
        )

    # eval_accuracy is part of the result so the accuracy of the best trial can
    # be read back from the hyperopt Trials object.
    return {
        "loss": eval_rmse,
        "eval_accuracy": eval_accuracy,
        "status": STATUS_OK,
        "model": model,
    }

# Parent MLflow run for the whole search; each trial logs a nested run.
with mlflow.start_run():
    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

    # Trial with the lowest loss (first one wins on ties).
    best_run = min(trials.results, key=lambda result: result["loss"])

    mlflow.log_params(best)
    mlflow.log_metric("eval_rmse", best_run['loss'])
    # Reads the accuracy from the trial result dict produced by train_model.
    mlflow.log_metric("eval_accuracy", best_run['eval_accuracy'])

    mlflow.sklearn.log_model(best_run["model"], "model", signature=signature)

    print(f"Best Parameters: {best}")
    print(f"Best eval_accuracy: {best_run['eval_accuracy']}")

# Reload a previously logged model by its (hard-coded) MLflow run id and
# score it on the validation set.
run_id = 'f90df2cba435421598e4bd2872fb0bfd'
logged_model = f'runs:/{run_id}/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
y_pred = loaded_model.predict(X_val)

eval_accuracy = accuracy_score(y_val, y_pred)
eval_accuracy

### XGBoost

In [None]:
# mlflow.set_experiment("/mlops_xgb_2")
# NOTE(review): with the line above commented out, runs are presumably
# recorded under whichever experiment was selected last — confirm that is intended.

# Model signature inferred from the training features and labels; attached to
# every model logged in this section.
signature = infer_signature(X_train, y_train)

# Hyperopt search space for XGBClassifier. Entries built with hp.* are sampled
# per trial; plain numbers are held fixed.
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 1000, 1),
    # Bounds are given in log space, so sampling is log-uniform in [0.015, 0.032].
    'learning_rate': hp.loguniform('learning_rate', np.log(0.015), np.log(0.032)),
    'max_depth': 4,
    'subsample': hp.uniform('subsample', 0.7, 1),
    'min_child_weight': hp.quniform('min_child_weight', 3, 6, 1),
}

def train_model(params, epochs, X_train, y_train, X_val, y_val):
    """Train and evaluate one XGBClassifier candidate.

    Fits the model on the training split, scores it on the validation split
    and logs hyperparameters, metrics and the model to a nested MLflow run.
    ``epochs`` is accepted for the shared call signature but not used.

    Returns a dict with ``loss`` (validation RMSE of the predicted labels),
    ``status`` and the fitted ``model``.
    """
    # Cast the integer-valued settings in place; they may arrive as floats.
    params.update(
        {key: int(params[key]) for key in ('n_estimators', 'max_depth', 'min_child_weight')}
    )

    model = XGBClassifier(**params, enable_categorical=True)

    with mlflow.start_run(nested=True):
        model.fit(X_train, y_train)

        # Score the candidate on the validation split.
        preds = model.predict(X_val)
        eval_rmse = np.sqrt(mean_squared_error(y_val, preds))
        eval_accuracy = accuracy_score(y_val, preds)

        # Record what was tried and how well it did.
        mlflow.log_params(params)
        mlflow.log_metric("eval_rmse", eval_rmse)
        mlflow.log_metric("eval_accuracy", eval_accuracy)

        mlflow.sklearn.log_model(model, "model", signature=signature)

    return {"loss": eval_rmse, "status": STATUS_OK, "model": model}

# Parent MLflow run for the whole search; each trial logs a nested run.
with mlflow.start_run():
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials
    )

    # Trial with the lowest loss (first one wins on ties).
    best_run = min(trials.results, key=lambda result: result["loss"])

    # Score the winning model directly on the validation set instead of
    # looking up an accuracy key in the trial result.
    best_accuracy = accuracy_score(y_val, best_run["model"].predict(X_val))

    mlflow.log_params(best)
    mlflow.log_metric("eval_rmse", best_run['loss'])
    mlflow.log_metric("eval_accuracy", best_accuracy)

    mlflow.sklearn.log_model(best_run["model"], "model", signature=signature)

    print(f"Best Parameters: {best}")
    print(f"Best eval_accuracy: {best_accuracy}")

# Reload a previously logged model by its (hard-coded) MLflow run id and
# score it on the validation set.
run_id = 'b65a47d54de047dc87554b67a89186a0'
logged_model = f'runs:/{run_id}/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
y_pred = loaded_model.predict(X_val)

eval_accuracy = accuracy_score(y_val, y_pred)
eval_accuracy

### Logistic Regression

In [None]:
# mlflow.set_experiment("/mlops_xgb_2")
# NOTE(review): the commented-out experiment name refers to XGBoost although
# this section tunes logistic regression — confirm which experiment should be used.

# Model signature inferred from the training features and labels; attached to
# every model logged in this section.
signature = infer_signature(X_train, y_train)

# Hyperopt search space for LogisticRegression.
space = {
    # Bounds are given in log space, so C is sampled log-uniformly in [1e-4, 1e2].
    'C': hp.loguniform('C', np.log(1e-4), np.log(1e2)),
    'solver': hp.choice('solver', ['liblinear', 'saga']),
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),
    'class_weight': hp.choice('class_weight', [None, 'balanced'])
}

def train_model(params, epochs, X_train, y_train, X_val, y_val):
    """Train and evaluate one LogisticRegression candidate.

    The fitted model, its hyperparameters and its validation metrics are
    logged to a nested MLflow run.

    Args:
        params: Hyperparameter values sampled from the search space.
        epochs: Not used by this estimator; accepted because ``objective``
            passes it.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features used for scoring.
        y_val: Validation labels used for scoring.

    Returns:
        dict: ``loss`` (RMSE between validation labels and predictions, the
        value hyperopt minimises), ``status`` and the fitted ``model``.
    """
    # Both solvers in the search space ('liblinear' and 'saga') support the
    # 'l1' and the 'l2' penalty, so every sampled combination is trained
    # rather than being reported as a failed trial. The primal formulation
    # (dual=False) is valid for all of them. A copy is used so the caller's
    # dict is not modified.
    params = {**params, 'dual': False}

    model = LogisticRegression(**params, max_iter=1000)

    with mlflow.start_run(nested=True):

        model.fit(X_train, y_train)

        # Predict on the validation set
        preds = model.predict(X_val)
        mse = mean_squared_error(y_val, preds)
        eval_rmse = np.sqrt(mse)
        eval_accuracy = accuracy_score(y_val, preds)

        # Log hyperparameters and the evaluation metrics
        mlflow.log_params(params)
        mlflow.log_metric("eval_rmse", eval_rmse)
        mlflow.log_metric("eval_accuracy", eval_accuracy)

        mlflow.sklearn.log_model(
            model, "model",
            signature=signature
        )

    return {"loss": eval_rmse, "status": STATUS_OK, "model": model}

from hyperopt import space_eval

# Parent MLflow run for the whole search; each trial logs a nested run.
with mlflow.start_run():
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials
    )

    # Only successful trials carry a fitted model, so pick the winner among them.
    successful_runs = [result for result in trials.results if result["status"] == STATUS_OK]
    best_run = min(successful_runs, key=lambda result: result["loss"])

    # fmin reports hp.choice parameters as positions in their option lists;
    # space_eval maps them back to the actual values before they are logged.
    best_params = space_eval(space, best)

    mlflow.log_params(best_params)
    mlflow.log_metric("eval_rmse", best_run['loss'])

    mlflow.sklearn.log_model(best_run["model"], "model", signature=signature)

# Reload a previously logged model by its (hard-coded) MLflow run id and
# score it on the validation set.
# NOTE(review): this run id is identical to the one loaded in the gradient
# boosting section — confirm it really points at the logistic regression run.
run_id = 'f90df2cba435421598e4bd2872fb0bfd'
logged_model = f'runs:/{run_id}/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
y_pred = loaded_model.predict(X_val)

eval_accuracy = accuracy_score(y_val, y_pred)
eval_accuracy

### Support Vector Classifier

In [None]:
# mlflow.set_experiment("/mlops_svc_1")
# NOTE(review): with the line above commented out, runs are presumably
# recorded under whichever experiment was selected last — confirm that is intended.

# Model signature inferred from the training features and labels; attached to
# every model logged in this section.
signature = infer_signature(X_train, y_train)

# Hyperopt search space for SVC. The dict keys are the SVC argument names; the
# 'svc__*' strings are the hyperopt labels under which fmin reports the best values.
space_svc = {
    'C': hp.uniform('svc__C', 0.1, 10.0),
    'kernel': hp.choice('svc__kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    'gamma': hp.choice('svc__gamma', ['scale', 'auto']),
    'degree': hp.randint('svc__degree', 2, 5),
}

def train_model(params, epochs, X_train, y_train, X_val, y_val):
    """Train and evaluate one SVC candidate.

    Fits the model on the training split, scores it on the validation split
    and logs hyperparameters, metrics and the model to a nested MLflow run.
    ``epochs`` is accepted for the shared call signature but not used.

    Returns a dict with ``loss`` (validation RMSE of the predicted labels),
    ``status`` and the fitted ``model``.
    """
    model = SVC(**params)

    with mlflow.start_run(nested=True):
        model.fit(X_train, y_train)

        # Score the candidate on the validation split.
        preds = model.predict(X_val)
        eval_rmse = np.sqrt(mean_squared_error(y_val, preds))
        eval_accuracy = accuracy_score(y_val, preds)

        # Record what was tried and how well it did.
        mlflow.log_params(params)
        mlflow.log_metric("eval_rmse", eval_rmse)
        mlflow.log_metric("eval_accuracy", eval_accuracy)

        mlflow.sklearn.log_model(model, "model", signature=signature)

    return {"loss": eval_rmse, "status": STATUS_OK, "model": model}

from hyperopt import space_eval

# Parent MLflow run for the whole search; each trial logs a nested run.
with mlflow.start_run():
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space_svc,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials
    )

    # Trial with the lowest loss (first one wins on ties).
    best_run = min(trials.results, key=lambda result: result["loss"])

    # fmin reports hp.choice parameters (kernel, gamma) as positions in their
    # option lists; space_eval maps them back to the actual SVC arguments.
    best_params = space_eval(space_svc, best)

    # The loss is an RMSE, not an accuracy, so compute the accuracy of the
    # winning model explicitly before reporting it.
    best_accuracy = accuracy_score(y_val, best_run["model"].predict(X_val))

    mlflow.log_params(best_params)
    mlflow.log_metric("eval_rmse", best_run['loss'])
    mlflow.log_metric("eval_accuracy", best_accuracy)

    mlflow.sklearn.log_model(best_run["model"], "model", signature=signature)

    print(f"Best Parameters: {best_params}")
    print(f"Best eval_rmse: {best_run['loss']}")
    print(f"Best eval_accuracy: {best_accuracy}")

### Stacking

In [None]:
from mlflow.tracking import MlflowClient
import joblib

# Fetch every model in the MLflow registry, keep it in memory under its
# registered name and write a local pickle copy of each.
client = MlflowClient()
registered_models = client.search_registered_models()
models = {}

for model in registered_models:
    # Highest version number among the versions the registry reports.
    version_numbers = [int(v.version) for v in model.latest_versions]
    latest_version = max(version_numbers)

    model_uri = f"models:/{model.name}/{latest_version}"
    loaded_model = mlflow.sklearn.load_model(model_uri)

    joblib.dump(loaded_model, f'{model.name}.pkl')
    models[model.name] = loaded_model

In [None]:
# Stack the four tuned models; each pair is (estimator alias, registry name).
base_estimators = [
    (alias, models[registry_name])
    for alias, registry_name in (
        ('xgb', 'best_xgb_model'),
        ('lr', 'best_lr_model'),
        ('svc', 'best_svc_model'),
        ('gbc', 'best_gbc_model'),
    )
]

stacking_clf = StackingClassifier(estimators=base_estimators, cv=5)

stacking_clf.fit(X_train, y_train)

In [None]:
# 10-fold cross-validated accuracy of the stacked model on the validation split.
stacking_accuracy = cross_val_score(
    estimator=stacking_clf,
    X=X_val,
    y=y_val,
    scoring='accuracy',
    cv=10,
)

In [None]:
pd.Series(stacking_accuracy).describe()

In [None]:
# Per-class probability estimates of the stacked model for the test set
# (shown as the cell output).
stacking_clf.predict_proba(treated_test_data)

In [None]:
# Write the stacked model's boolean predictions, indexed by passenger id, as a CSV.
stacking_predictions = stacking_clf.predict(treated_test_data).astype(bool)
pd.DataFrame(
    stacking_predictions,
    index=test_passenger_id,
    columns=['Transported'],
).to_csv(path / 'red_1.csv')

## Neural Network

In [None]:
# Keras is fed plain NumPy arrays for training.
X_train_array = X_train.to_numpy()
y_train_array = y_train.to_numpy()

# Small fully connected binary classifier: 10 -> 30 -> 12 ReLU units followed
# by a single sigmoid output.
model = tf.keras.Sequential()
for units in (10, 30, 12):
    model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Adam with an explicit learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train for 100 epochs, evaluating on the validation split after each one.
history = model.fit(
    X_train_array,
    y_train_array,
    epochs=100,
    validation_data=(X_val, y_val),
)

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])

plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Functional-API network in which the raw inputs are concatenated with the
# output of two hidden layers before the final sigmoid unit.

# Layers
hidden_layer1 = tf.keras.layers.Dense(30, activation='relu')
hidden_layer2 = tf.keras.layers.Dense(30, activation='relu')
concat_layer = tf.keras.layers.Concatenate()
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

# Input sized to the number of feature columns
input_ = tf.keras.layers.Input(shape=X_train.shape[1:])

# Wiring: input -> hidden1 -> hidden2, then [input, hidden2] -> output
hidden1 = hidden_layer1(input_)
hidden2 = hidden_layer2(hidden1)
concat = concat_layer([input_, hidden2])
output = output_layer(concat)

model = tf.keras.Model(inputs=[input_], outputs=[output])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

history = model.fit(
    X_train_array,
    y_train_array,
    epochs=100,
    validation_data=(X_val, y_val),
)

In [None]:
# Training vs. validation accuracy per epoch for the most recent fit.
train_curve = history.history['accuracy']
validation_curve = history.history['val_accuracy']

plt.plot(train_curve)
plt.plot(validation_curve)
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
def build_model(hp):
    """Build and compile a dense binary classifier for a tuner trial.

    Args:
        hp: Hyperparameter container supplied by the tuner. It is queried for
            the number of hidden layers, the units per layer, the learning
            rate and the optimizer name.

    Returns:
        A compiled ``tf.keras.Sequential`` model with ``n_hidden`` ReLU layers
        of ``n_neurons`` units each and a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy.
    """
    n_hidden = hp.Int('n_hidden', min_value=6, max_value=11, default=9)
    n_neurons = hp.Int('n_neurons', min_value=60, max_value=80)
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=2e-3,
                            sampling = 'log')
    optimizer_name = hp.Choice('optimizer', values=['sgd', 'adam', 'adagrad', 'adadelta', 'rmsprop', 'adamax', 'nadam'])

    # Every selectable optimizer is constructed explicitly so that the sampled
    # learning rate is applied to all of them; passing the bare name to
    # compile() would silently fall back to that optimizer's default rate.
    optimizer_classes = {
        'sgd': tf.keras.optimizers.SGD,
        'adam': tf.keras.optimizers.Adam,
        'adagrad': tf.keras.optimizers.Adagrad,
        'adadelta': tf.keras.optimizers.Adadelta,
        'rmsprop': tf.keras.optimizers.RMSprop,
        'adamax': tf.keras.optimizers.Adamax,
        'nadam': tf.keras.optimizers.Nadam,
    }
    optimizer = optimizer_classes[optimizer_name](learning_rate=learning_rate)

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Flatten())

    for _ in range(n_hidden):
        model.add(tf.keras.layers.Dense(n_neurons, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                 metrics=['accuracy'])
    return model

In [None]:
import keras_tuner as kt

In [None]:
# Random search over build_model's hyperparameters, ranked by validation
# accuracy; results from any previous search in the same folder are overwritten.
random_search_tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=40,
    overwrite=True,
    directory='my_kaggle_comp',
    project_name='my_rnd_search',
)

random_search_tuner.search(
    X_train,
    y_train,
    epochs=40,
    validation_data=(X_val, y_val),
)

In [None]:
# Retrieve the three best models, matching the variable name and the
# num_trials=3 used when listing the best hyperparameters and trials.
top3_models = random_search_tuner.get_best_models(num_models=3)

In [None]:
# The tuner returns models best-first, so the first entry is the winner.
best_model = top3_models[0]

In [None]:
# Show the hyperparameter values of the three best trials.
top3_params = random_search_tuner.get_best_hyperparameters(num_trials=3)
for hyperparameters in top3_params:
    print(hyperparameters.values)

In [None]:
# Show the summary of the three best trials.
top3_trials = random_search_tuner.oracle.get_best_trials(num_trials=3)
for summ in top3_trials:
    # summary() prints its report itself and returns None, so wrapping it in
    # print() would emit a stray "None" after every trial.
    summ.summary()

In [None]:
# The network outputs sigmoid probabilities; casting them straight to bool
# would turn every non-zero value into True, so threshold at 0.5 instead.
best_model.predict(treated_test_data) > 0.5

In [None]:
# Raw model outputs for the test set; peek at the first ten.
predictions = best_model.predict(treated_test_data)
first_ten = predictions[:10]
print(first_ten)

In [None]:
# Turn the model outputs into 0/1 labels using a 0.5 cut-off.
above_threshold = predictions > 0.5
binary_predictions = above_threshold.astype(int)

In [None]:
binary_predictions

In [None]:
# Write the network's boolean predictions, indexed by passenger id, as a CSV.
network_submission = pd.DataFrame(
    binary_predictions.astype(bool),
    index=test_passenger_id,
    columns=['Transported'],
)
network_submission.to_csv(path / 'pred_2.csv')

In [None]:
X_train.describe()

# Testing out some new features

In [None]:
# `columns` is an attribute (an Index), not a method; calling it raises TypeError.
treated_train_data.columns

In [None]:
# Cluster the training rows into 60 groups and store each row's cluster id
# as an extra feature column.
kmeans = KMeans(n_clusters=60, random_state=42, n_init=10)
cluster_ids = kmeans.fit_predict(X_train)
X_train['Cluster'] = cluster_ids

In [None]:
# Share of the total spend that went to each amenity.
# NOTE(review): rows whose total_spent is 0 would yield NaN or inf here —
# confirm whether such rows exist and how they should be handled.
for amenity in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    X_train[f'{amenity}/total_spent'] = X_train[amenity] / X_train['total_spent']

In [None]:
# Re-run the XGBoost random search on the extended feature set and report
# the best configuration and its score.
rnd_search_xgb.fit(X_train, y_train)

print(rnd_search_xgb.best_params_)

print(rnd_search_xgb.best_score_)

final_xgb = rnd_search_xgb.best_estimator_

# Importances come from the 'xgb' step of the best pipeline.
feature_importances = final_xgb['xgb'].feature_importances_

# (importance, column) pairs, most important first.
important_features = sorted(
    zip(feature_importances, X_train.columns),
    reverse=True,
)

# Names of the 20 highest-ranked features.
features = [column_name for _, column_name in important_features[:20]]

In [None]:
important_features

In [None]:
# Features and target side by side in one frame (joined on the row index).
df_concat = pd.concat([X_train, y_train], axis='columns')
df_concat

In [None]:
# Pairwise scatter plots of the spend-share features, coloured by the target.
numerical_columns = [
    'RoomService/total_spent',
    'FoodCourt/total_spent',
    'ShoppingMall/total_spent',
    'Spa/total_spent',
    'VRDeck/total_spent',
]

panel_width, panel_height = 1.5, 1.3
figsize = (panel_width, panel_height)
g = sns.PairGrid(
    df_concat,
    hue='Transported',
    vars=numerical_columns,
    height=panel_height,
    aspect=panel_width / panel_height,
)
g.map_lower(sns.scatterplot)
legend = g.add_legend(loc='upper right', bbox_to_anchor=(0.65, 0.7))

# Only the lower triangle is drawn; hide the diagonal and everything above it.
hidden_rows, hidden_cols = np.triu_indices_from(g.axes, 0)
for row, col in zip(hidden_rows, hidden_cols):
    g.axes[row, col].set_visible(False)

plt.show()

In [None]:
# Split the frame by column position: the last ten columns go to `new_feat`,
# everything before them to `original_feat`.
all_columns = X_train.columns
original_feat = X_train[all_columns[:-10]]
new_feat = X_train[all_columns[-10:]]

new_feat