In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

## <a id="read_data"></a> Read Data

In [None]:
# Dataset directory, located one level above the working directory.
path = Path('..', 'Kaggle-Titanic-Spacechip-Competion')

In [None]:
# Load the training and test CSV files from the dataset directory.
train_data, test_data = (
    pd.read_csv(path / csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Target: the `Transported` column cast to int.
y_train = train_data['Transported'].astype(int)

# Features: every remaining column except the target, the passenger id and the name.
train_data = train_data.drop(['Transported', 'PassengerId', 'Name'], axis=1)

In [None]:
# Display the feature frame left after dropping the target, PassengerId and Name.
train_data

#### Split data

In [None]:
# Hold out 20% of the rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    train_data,
    y_train,
    random_state=42,
    test_size=0.2,
)

# Split the remainder again into training and validation sets:
# 0.25 * 0.8 = 0.2 of all rows end up in validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
    test_size=0.25,
)

#### Initial treatment

In [None]:
# # The passengers' names are not used anywhere in this notebook
# train_data_1 = train_data.drop('Name', axis=1)
# test_data_1 = test_data.drop('Name',axis=1)
# # The passenger IDs will be used at the end to pair each model prediction with the correct ID
# passenger_id = train_data_1['PassengerId']
# test_passenger_id = test_data_1['PassengerId']

## Feature Engineering

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

import feature_engine.imputation as mdi
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.discretisation import DecisionTreeDiscretiser
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.encoding import OrdinalEncoder

from feature_engine.transformation import PowerTransformer  as fe_PowerTransformer
from sklearn.preprocessing import PowerTransformer as sk_PowerTransformer
from sklearn.preprocessing import MinMaxScaler

class BooleanTransformer(BaseEstimator, TransformerMixin):
    """Recode the ``VIP`` and ``CryoSleep`` columns from True/False to 1/0.

    Values that are neither ``True`` nor ``False`` (for example missing
    entries) are not in the mapping, so ``Series.map`` leaves them as NaN.
    """

    def fit(self, X, y=None):
        """Remember the input column names; nothing else is learned.

        Parameters
        ----------
        X : DataFrame
            Input data. Its column labels are stored in ``feature_names_in_``
            when available.
        y : ignored

        Returns
        -------
        self
        """
        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.asarray(columns, dtype=object)
        return self

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, which equal the input names.

        Parameters
        ----------
        input_features : iterable of str, optional
            Names of the input columns. When omitted, the names seen by
            ``fit`` are used.

        Returns
        -------
        list
            The column names, unchanged.

        Raises
        ------
        ValueError
            If ``input_features`` is omitted and ``fit`` did not record names.
        """
        if input_features is None:
            # The argument defaults to None; previously that call crashed
            # with a TypeError from list(None).
            input_features = getattr(self, "feature_names_in_", None)
            if input_features is None:
                raise ValueError(
                    "input_features must be provided when fit() has not seen named columns."
                )
        return list(input_features)

    def transform(self, X):
        """Return a copy of ``X`` with ``VIP`` and ``CryoSleep`` mapped to 1/0."""
        X = X.copy()
        for column in ('VIP', 'CryoSleep'):
            X[column] = X[column].map({True: 1, False: 0})
        return X

class AddPassengerGroup(BaseEstimator, TransformerMixin):
    """Replace ``PassengerId`` with a ``passenger_group`` column.

    The group is the first four characters of the id string.
    """

    def fit(self, X, y=None):
        """Stateless: nothing is learned. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with ``passenger_group`` added and ``PassengerId`` removed."""
        group_prefix = X['PassengerId'].str[:4]
        with_group = X.assign(passenger_group=group_prefix)
        return with_group.drop(columns=['PassengerId'])

class FillBinaryNumericTransformer(BaseEstimator, TransformerMixin):
    """Fill missing flags with ``False`` and missing spending amounts with 0.

    The input must contain the columns ``CryoSleep``, ``VIP``, ``FoodCourt``,
    ``RoomService``, ``Spa``, ``VRDeck`` and ``ShoppingMall``. All other
    columns are returned untouched.
    """

    def fit(self, X, y=None):
        """Stateless: nothing is learned. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a filled copy of ``X``; the caller's frame is not modified."""
        # Work on a copy so the input is never mutated, as the other
        # transformers in this notebook already do.
        X = X.copy()

        binary_columns = ['CryoSleep','VIP']
        numeric_columns = ['FoodCourt','RoomService','Spa','VRDeck','ShoppingMall']

        # Assumption: a passenger with no CryoSleep / VIP record is in neither.
        X.loc[:, binary_columns] = X[binary_columns].fillna(False)
        # Same reasoning: no record of spending is treated as no spending.
        X.loc[:, numeric_columns] = X[numeric_columns].fillna(0)
        return X

class FillCabinDestHomeAgeTransformer(BaseEstimator, TransformerMixin):
    """Fill missing Cabin / Destination / HomePlanet / Age from the passenger's group.

    Rows are grouped by ``passenger_group``. Within each group, missing
    ``Cabin``, ``Destination`` and ``HomePlanet`` entries take the first known
    value of that group, and missing ``Age`` entries take the group's mean
    age. A group with no known value for a column is left unchanged.
    """

    def fit(self, X, y=None):
        """Stateless: nothing is learned. Returns ``self``."""
        return self

    @staticmethod
    def _fill_from_first_known(values):
        """Fill the gaps in one group's column with its first non-missing value."""
        known = values.dropna()
        if known.empty:
            # Nobody in the group has the information; nothing to copy.
            return values
        return values.fillna(known.iloc[0])

    def transform(self, X):
        """Return a filled copy of ``X``; the caller's frame is not modified."""
        X = X.copy()

        # Borrow the value from another member of the same passenger group,
        # e.g. a daughter with a missing cabin takes her mother's cabin.
        # The first *known* value is used: taking the group's first row would
        # fill nothing whenever that row is itself missing.
        for column in ('Cabin', 'Destination', 'HomePlanet'):
            X[column] = X.groupby('passenger_group')[column].transform(self._fill_from_first_known)

        X['Age'] = X.groupby('passenger_group')['Age'].transform(lambda value: value.fillna(value.mean()))
        return X

class FillRestTransformer(BaseEstimator, TransformerMixin):
    """Fill whatever is still missing in Age, Cabin, HomePlanet and Destination.

    ``Age`` is filled first with the mean of the row's ``passenger_group``
    and then with the mean of the whole ``Age`` column. The three categorical
    columns get the literal string ``'missing'``. The means are computed on
    the frame passed to ``transform``; nothing is learned in ``fit``.
    """

    def fit(self, X, y=None):
        """Stateless: nothing is learned. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a filled copy of ``X``; the caller's frame is not modified."""
        X = X.copy()

        # Group-level mean first ...
        X['Age'] = X.groupby('passenger_group')['Age'].transform(lambda value: value.fillna(value.mean()))
        # ... then the overall mean for groups with no known age at all.
        X['Age'] = X['Age'].fillna(X['Age'].mean())

        categorical_columns = ['Cabin','HomePlanet','Destination']
        X[categorical_columns] = X[categorical_columns].fillna('missing')
        return X

class DualCategoricalImputer(BaseEstimator, TransformerMixin):
    """Append imputed copies of every categorical column.

    ``transform`` returns the input columns followed by, depending on
    ``imputer_choice``, a ``<col>_missing`` block (constant fill) and/or a
    ``<col>_frequent`` block (strategy fill). The original columns are kept.
    """

    def __init__(self,
                 fill_value="missing",
                 imputation_method="most_frequent",
                 imputer_choice="both"):
        """
        Parameters:
        -----------
        fill_value: any, default="missing"
            The constant value to use when imputing missing values.
        imputation_method: str, default="most_frequent"
            The strategy for imputing missing values (e.g., "most_frequent").
        imputer_choice: str, default="both"
            Determines which imputer to apply.
            Options:
                "both"      - apply both the constant fill imputer and the chosen strategy imputer (default)
                "constant"  - apply only the constant fill imputer
                "frequent"  - apply only the strategy imputer (e.g., most frequent)
        """
        self.fill_value = fill_value
        self.imputation_method = imputation_method
        self.imputer_choice = imputer_choice

        self._build_imputers()

    def _build_imputers(self):
        """(Re)create both SimpleImputers from the current hyper-parameters."""
        self.imputer_1 = SimpleImputer(strategy="constant", fill_value=self.fill_value)
        self.imputer_2 = SimpleImputer(strategy=self.imputation_method)

    def _imputed_frame(self, imputer, X, suffix):
        """Run ``imputer`` on the categorical columns and label the result ``<col>_<suffix>``."""
        # Build the frame first and relabel afterwards, so this also works
        # if the imputer returns a DataFrame instead of an array.
        frame = pd.DataFrame(imputer.transform(X[self.categorical_cols]), index=X.index)
        frame.columns = [f"{col}_{suffix}" for col in self.categorical_cols]
        return frame

    def fit(self, X, y=None):
        """Fit the selected imputer(s) on the object/category columns of ``X``.

        Returns
        -------
        self
        """
        # Rebuild here so values changed through set_params() after
        # construction actually reach the wrapped imputers.
        self._build_imputers()

        # Select only the categorical columns.
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        self.categorical_cols = categorical_cols

        if self.imputer_choice in ["both", "constant"]:
            self.imputer_1.fit(X[categorical_cols])
        if self.imputer_choice in ["both", "frequent"]:
            self.imputer_2.fit(X[categorical_cols])
        return self

    def transform(self, X):
        """Return ``X`` plus one imputed block per selected imputer.

        Any other ``imputer_choice`` value adds no block, so the result is
        just a copy of ``X``.
        """
        X = X.copy()
        df_list = [X]  # the original columns always come first

        if self.imputer_choice in ["both", "constant"]:
            df_list.append(self._imputed_frame(self.imputer_1, X, "missing"))

        if self.imputer_choice in ["both", "frequent"]:
            df_list.append(self._imputed_frame(self.imputer_2, X, "frequent"))

        return pd.concat(df_list, axis=1)

class MultipleNumericalImputer(BaseEstimator, TransformerMixin):
    """Append one imputed copy of every numeric column per selected imputer.

    ``transform`` returns the input columns followed by one block of
    ``<col>_<suffix>`` columns for each selected imputer. Afterwards every
    column of the result that still contains a missing value is replaced
    entirely by zeros (see ``_zero_incomplete_columns``).
    """

    # (choice keyword, attribute holding the imputer, output-column suffix),
    # listed in the order in which ``transform`` appends the imputed blocks.
    _IMPUTER_TABLE = (
        ("mean_median", "imputer_mean_median", "mean_median"),
        ("fill_binary", "imputer_fill_binary", "fill_binary"),
        ("bayes", "imputer_bayes", "bayes"),
        ("knn", "imputer_knn", "knn"),
        ("nonlin", "imputer_nonLin", "nonLin"),
        ("missforest", "imputer_missForest", "missForest"),
    )

    def __init__(self,
                 max_iter=10,
                 imputation_method="mean",
                 imputer_choice="all"):
        """
        Parameters:
        -----------
        max_iter : int, default=10
            Maximum number of iterations for the iterative imputers.
        imputation_method : str, default="mean"
            The imputation method for the mean/median imputer.
        imputer_choice : str or list, default="all"
            Specifies which imputation method(s) to apply (case-insensitive).
            Options (if string):
                "all"         - apply all available imputers.
                "mean_median" - use the mean/median imputer.
                "fill_binary" - use the binary fill imputer.
                "bayes"       - use the Bayesian Ridge-based imputer.
                "knn"         - use the KNeighbors-based imputer.
                "nonlin"      - use the Decision Tree-based imputer.
                "missforest"  - use the Extra Trees-based imputer.
            You can also pass a list of these strings to combine methods.
        """
        self.max_iter = max_iter
        self.imputation_method = imputation_method
        self.imputer_choice = imputer_choice

        self._build_imputers()

    def _build_imputers(self):
        """(Re)create every wrapped imputer from the current hyper-parameters."""
        self.imputer_mean_median = mdi.MeanMedianImputer(imputation_method=self.imputation_method)
        self.imputer_fill_binary = FillBinaryNumericTransformer()
        self.imputer_bayes = IterativeImputer(estimator=BayesianRidge(), max_iter=self.max_iter)
        self.imputer_knn = IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=5), max_iter=self.max_iter)
        self.imputer_nonLin = IterativeImputer(estimator=DecisionTreeRegressor(max_features='sqrt', random_state=0), max_iter=self.max_iter)
        self.imputer_missForest = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0), max_iter=self.max_iter)

    @staticmethod
    def _zero_incomplete_columns(df):
        """Replace every column that contains a missing value by an all-zero column.

        NOTE(review): the check runs on whatever frame is being transformed,
        so a column can be zeroed for one data split and kept for another.
        Confirm this is intended.
        """
        for col in df.columns:
            if df[col].isnull().any():
                df.loc[df[col].notnull(), col] = np.nan
                df[col] = df[col].fillna(0)
        return df

    def fit(self, X, y=None):
        """Fit the selected imputers on the float/int columns of ``X``.

        Returns
        -------
        self
        """
        # Rebuild here so values changed through set_params() after
        # construction actually reach the wrapped imputers.
        self._build_imputers()

        numeric_columns = X.select_dtypes(include=['float', 'int']).columns
        self.numeric_columns = numeric_columns

        # Normalise imputer_choice into a list of lower-case keywords.
        if isinstance(self.imputer_choice, str):
            choices = [self.imputer_choice.lower()]
        else:
            choices = [choice.lower() for choice in self.imputer_choice]
        if "all" in choices:
            choices = [keyword for keyword, _, _ in self._IMPUTER_TABLE]
        self.chosen_imputers = choices

        # Fit only the selected imputers, on the numeric columns only.
        for keyword, attribute, _ in self._IMPUTER_TABLE:
            if keyword in self.chosen_imputers:
                getattr(self, attribute).fit(X[numeric_columns])

        return self

    def transform(self, X):
        """Return ``X`` plus one ``<col>_<suffix>`` block per selected imputer."""
        X = X.copy()
        df_list = [X]  # the original columns always come first

        for keyword, attribute, suffix in self._IMPUTER_TABLE:
            if keyword not in self.chosen_imputers:
                continue
            imputed = getattr(self, attribute).transform(X[self.numeric_columns])
            # Build the frame first and relabel afterwards. Passing
            # ``columns=`` together with a DataFrame *selects* those labels
            # instead of renaming, which turned the mean/median and
            # fill-binary outputs into all-NaN columns.
            frame = pd.DataFrame(imputed, index=X.index)
            frame.columns = [f"{col}_{suffix}" for col in self.numeric_columns]
            df_list.append(frame)

        df = pd.concat(df_list, axis=1)

        # Post-processing: a column that still has missing values is zeroed out.
        return self._zero_incomplete_columns(df)

class DualCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Append a frequency encoding and an ordered ordinal encoding of each categorical column.

    ``transform`` returns the input columns followed by the
    ``<col>_count_frequency`` and ``<col>_ordinal_enc`` blocks. The original
    categorical columns are kept.
    """

    def __init__(self):
        self.imputer_1 = CountFrequencyEncoder(encoding_method='frequency', missing_values='ignore')
        self.imputer_2 = OrdinalEncoder(encoding_method='ordered', missing_values='ignore')

    def fit(self, X, y=None):
        """Fit both encoders on the object/category columns of ``X``.

        Parameters
        ----------
        X : DataFrame
        y : Series
            Target, passed to the ordered ordinal encoder.

        Returns
        -------
        self
        """
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        X_cols = list(X.columns)

        self.categorical_cols = categorical_cols
        self.X_cols = X_cols

        self.imputer_1.fit(X[categorical_cols])
        self.imputer_2.fit(X[categorical_cols], y)
        return self

    def _encoded_frame(self, encoder, X, suffix):
        """Encode the categorical columns and label the result ``<col>_<suffix>``.

        Missing codes are filled with 0 cell by cell. Previously a single
        missing code wiped the whole encoded column to 0, so an encoded
        feature could be informative on the fitting data and constant on
        another split.
        NOTE(review): missing codes presumably come from categories not seen
        during fit (both encoders are configured with missing_values='ignore');
        confirm that 0 is an acceptable code for them.
        """
        encoded = pd.DataFrame(encoder.transform(X[self.categorical_cols]), index=X.index)
        encoded.columns = [f"{col}_{suffix}" for col in self.categorical_cols]
        return encoded.fillna(0)

    def transform(self, X):
        """Return the input columns plus both encoded blocks."""
        X_1 = self._encoded_frame(self.imputer_1, X, "count_frequency")
        X_2 = self._encoded_frame(self.imputer_2, X, "ordinal_enc")

        # Incoming columns keep the original rule: a column that still holds
        # a missing value is replaced entirely by zeros.
        passthrough = X.copy()
        for col in passthrough.columns:
            if passthrough[col].isnull().any():
                passthrough.loc[passthrough[col].notnull(), col] = np.nan
                passthrough[col] = passthrough[col].fillna(0)

        return pd.concat([passthrough, X_1, X_2], axis=1)

class TripleCategoricalDiscretiser(BaseEstimator, TransformerMixin):
    """Append discretised versions of every numeric column.

    Depending on ``discretiser_choice`` the output holds the input columns
    followed by ``<col>_equal_freq``, ``<col>_decision_tree`` and/or
    ``<col>_equal_freq_ef_ordinal`` blocks. Afterwards, columns containing a
    missing value are replaced entirely by zeros, and columns that
    ``pd.to_numeric`` cannot convert are dropped.
    """

    def __init__(self,
                 quantiles=10,
                 scoring='accuracy',
                 param_grid={'max_depth': [1, 2, 3, 4]},
                 discretiser_choice="all"):
        """
        Parameters:
        -----------
        quantiles: int, default=10
            Number of quantiles to use for the equal frequency discretiser.
        scoring: str, default='accuracy'
            The scoring metric for the decision tree discretiser.
        param_grid: dict, default={'max_depth': [1, 2, 3, 4]}
            Parameter grid for tuning the decision tree discretiser.
            The default dict is shared between instances; this class only
            passes it on and never modifies it.
        discretiser_choice: str, default="all"
            Determines which discretisation method(s) to apply.
            Options:
                "all"            - apply all three methods (equal frequency, decision tree, and ordinal)
                "equal_freq"     - apply only the equal frequency discretisation
                "decision_tree"  - apply only the decision tree discretisation
                "ordinal"        - apply only the ordinal encoding (based on equal frequency output)
        """
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params() / clone() / repr() look the value up as ``self.quantiles``
        # and failed with AttributeError while only ``self.q`` existed.
        self.quantiles = quantiles
        self.scoring = scoring
        self.param_grid = param_grid
        self.discretiser_choice = discretiser_choice

        self._build_steps()

    def _build_steps(self):
        """(Re)create the wrapped discretisers/encoder from the current hyper-parameters."""
        self.q = self.quantiles  # legacy alias, kept for existing callers
        self.imputer_1 = EqualFrequencyDiscretiser(q=self.quantiles, return_object=True)
        self.imputer_2 = DecisionTreeDiscretiser(cv=10,
                                                 scoring=self.scoring,
                                                 regression=False,
                                                 param_grid=self.param_grid)
        self.imputer_3 = OrdinalEncoder(encoding_method='ordered', missing_values='ignore')

    def _equal_freq_frame(self, X):
        """Equal-frequency bins of the numeric columns, labelled ``<col>_equal_freq``."""
        frame = pd.DataFrame(self.imputer_1.transform(X[self.numerical_cols]), index=X.index)
        frame.columns = [f"{col}_equal_freq" for col in self.numerical_cols]
        return frame

    def fit(self, X, y=None):
        """Fit the selected discretisers on the float/int columns of ``X``.

        Parameters
        ----------
        X : DataFrame
        y : Series
            Target, passed to the decision-tree discretiser and the ordinal encoder.

        Returns
        -------
        self
        """
        # Rebuild here so values changed through set_params() after
        # construction actually reach the wrapped estimators.
        self._build_steps()

        # Select only numerical columns
        numerical_cols = X.select_dtypes(include=['float', 'int']).columns
        self.numerical_cols = numerical_cols

        if self.discretiser_choice in ["all", "equal_freq", "ordinal"]:
            self.imputer_1.fit(X[numerical_cols])
        if self.discretiser_choice in ["all", "decision_tree"]:
            self.imputer_2.fit(X[numerical_cols], y)
        if self.discretiser_choice in ["all", "ordinal"]:
            # The ordinal encoder is fitted on the equal-frequency output.
            self.imputer_3.fit(self._equal_freq_frame(X), y)

        return self

    def transform(self, X):
        """Return ``X`` plus the selected discretised blocks, cleaned as described on the class."""
        X = X.copy()
        df_list = [X]  # the original columns always come first
        choice = self.discretiser_choice

        # Both the "equal_freq" and the "ordinal" outputs start from the same
        # equal-frequency bins, so compute them once.
        equal_freq = None
        if choice in ["all", "equal_freq", "ordinal"]:
            equal_freq = self._equal_freq_frame(X)

        if choice in ["all", "equal_freq"]:
            df_list.append(equal_freq)

        if choice in ["all", "decision_tree"]:
            X_decision_tree = pd.DataFrame(self.imputer_2.transform(X[self.numerical_cols]), index=X.index)
            X_decision_tree.columns = [f"{col}_decision_tree" for col in self.numerical_cols]
            df_list.append(X_decision_tree)

        if choice in ["all", "ordinal"]:
            # Hand the encoder its own copy so the bins appended above cannot
            # be altered by it.
            X_ordinal = pd.DataFrame(self.imputer_3.transform(equal_freq.copy()), index=X.index)
            X_ordinal.columns = [f"{col}_ef_ordinal" for col in equal_freq.columns]
            df_list.append(X_ordinal)

        df = pd.concat(df_list, axis=1)

        # A column that contains any missing value is replaced entirely by zeros.
        for col in df.columns:
            if df[col].isnull().any():
                df.loc[df[col].notnull(), col] = np.nan
                df[col] = df[col].fillna(0)

        # Drop columns that cannot be converted to numeric values.
        cols_to_drop = []
        for col in df.columns:
            try:
                pd.to_numeric(df[col])
            except ValueError:
                cols_to_drop.append(col)
        df = df.drop(columns=cols_to_drop)

        return df

class DualTransformer(BaseEstimator, TransformerMixin):
    """Append power-transformed copies of the input columns.

    The output is the input frame followed by a ``<col>_cbrt`` block
    (feature_engine PowerTransformer with exp=0.333) and/or a ``<col>_yj``
    block (scikit-learn Yeo-Johnson PowerTransformer with standardisation).
    """

    def __init__(self, transformer_choice="both"):
        """
        Parameters:
        -----------
        transformer_choice: str, default="both"
            Determines which transformation(s) to apply.
            Options:
                "both"  - apply both transformations (default)
                "cbrt"  - apply only the cube-root based transformer
                "yj"    - apply only the Yeo-Johnson transformer
        """
        self.transformer_choice = transformer_choice
        self.transformer_1 = fe_PowerTransformer(exp=0.333)
        self.transformer_2 = sk_PowerTransformer(method='yeo-johnson', standardize=True)

    def _selected(self):
        """Yield ``(transformer, suffix)`` for each transformation enabled by ``transformer_choice``."""
        if self.transformer_choice in ("both", "cbrt"):
            yield self.transformer_1, "cbrt"
        if self.transformer_choice in ("both", "yj"):
            yield self.transformer_2, "yj"

    def fit(self, X, y=None):
        """Fit every enabled transformer on ``X`` and return ``self``."""
        for transformer, _ in self._selected():
            transformer.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` followed by one suffixed block per enabled transformer."""
        frames = [X]  # the untransformed input always comes first

        for transformer, suffix in self._selected():
            block = pd.DataFrame(transformer.transform(X), index=X.index)
            block.columns = [f"{col}_{suffix}" for col in transformer.get_feature_names_out()]
            frames.append(block)

        return pd.concat(frames, axis=1)

In [None]:
# Persist the three label splits as parquet files in the dataset directory.
for labels, filename in ((y_train, 'y_train.pq'), (y_test, 'y_test.pq'), (y_val, 'y_val.pq')):
    pd.DataFrame(labels).to_parquet(path / filename)

#### Imputation methods analysis

In [None]:
# Recode VIP / CryoSleep as 1/0; every other column passes through unchanged.
# (Further numeric or categorical steps could be added to the transformer list.)
preprocessor = ColumnTransformer(
    [('bool', BooleanTransformer(), ['VIP', 'CryoSleep'])],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
preprocessor.set_output(transform='pandas')

# Imputation-only pipeline, used to compare the imputation methods.
pipeline = Pipeline([
    # ('add_passenger_group', AddPassengerGroup()) is currently disabled.
    ('preprocessor', preprocessor),
    ("categorical_imputer", DualCategoricalImputer(imputation_method="most_frequent",
                                                   fill_value="missing")),
    ("numerical_imputer", MultipleNumericalImputer(imputation_method='mean',
                                                   max_iter=10)),
    # Possibly other steps (scaler, model, etc.)
])

In [None]:
# Fit the imputation-only pipeline on the training split and keep the
# transformed frame for the comparison cells that follow.
imputed_data = pipeline.fit_transform(X_train)

In [None]:
# Column names, dtypes and non-null counts of the imputed frame.
imputed_data.info()

In [None]:
# Column whose imputed versions are compared.
feature = "RoomService"

# Every series is put on a log1p scale before comparison.
original_values = train_data[feature].astype(float).pipe(np.log1p)
bayes = imputed_data[f"{feature}_bayes"].astype(float).pipe(np.log1p)
knn = imputed_data[f"{feature}_knn"].astype(float).pipe(np.log1p)
nonLin = imputed_data[f"{feature}_nonLin"].astype(float).pipe(np.log1p)
missForest = imputed_data[f"{feature}_missForest"].astype(float).pipe(np.log1p)

# One violin per imputation method, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(16, 4))
violin_parts = ax.violinplot(
    [bayes, knn, nonLin, missForest],
    widths=0.9,
    showmeans=True,
)

ax.set_xticks([1, 2, 3, 4])
ax.set_xticklabels(["Bayesian Ridge", "KNN", "Non-Linear", "MissForest"])

ax.set_title(f"Comparison of Imputed Values for {feature}")
ax.set_ylabel(f"Log({feature})")

In [None]:
# Put the four imputed series side by side, one column per imputation method.
comparison_df = pd.concat(
    [bayes, knn, nonLin, missForest],
    axis=1,
    keys=["Bayesian Ridge", "KNN", "Non-Linear", "MissForest"],
)

# Summary statistics for each method.
stats_summary = comparison_df.describe()
stats_summary

### Apply the pipeline

In [None]:
# Recode VIP / CryoSleep as 1/0; every other column passes through unchanged.
convert_booleans = ColumnTransformer(
    [('bool', BooleanTransformer(), ['VIP', 'CryoSleep'])],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
convert_booleans.set_output(transform='pandas')

pipeline = Pipeline([
    # --- Basic ---
    # ('add_passenger_group', AddPassengerGroup()) is currently disabled.
    ('convert_booleans', convert_booleans),

    # --- Imputers ---
    ("categorical_imputer", DualCategoricalImputer(
        fill_value="missing",
        imputation_method="most_frequent",
        imputer_choice="frequent",
    )),
    ("numerical_imputer", MultipleNumericalImputer(
        imputer_choice=["bayes", "knn"],
        imputation_method='mean',
        max_iter=20,
    )),

    # --- Encoders ---
    ("categorical_encoder", DualCategoricalEncoder()),

    # --- Discretisers ---
    ("categorical_discretiser", TripleCategoricalDiscretiser(
        quantiles=10,
        discretiser_choice="decision_tree",
    )),

    # --- Numerical transformations ---
    ("numerical_transformer", DualTransformer(transformer_choice="cbrt")),

    # --- Math computations --- (none yet)

    # --- Scaling ---
    ("scaler", MinMaxScaler()),
    ('quantile', QuantileTransformer(output_distribution='normal')),
])

In [None]:
# Fit every pipeline step on the training split only, then reuse the fitted
# pipeline for the test and validation splits.
# NOTE: this cell overwrites X_train / X_test / X_val with their transformed
# versions, so it must not be run a second time without re-running the split.
imputed_train_array = pipeline.fit_transform(X_train, y_train)

# Column labels come from the 'scaler' step (the step at position 6),
# addressed by name so that reordering the pipeline cannot silently pick
# another step.
feature_names = pipeline.named_steps['scaler'].get_feature_names_out()

# Keep each split's original row index: y_train / y_test / y_val still carry
# it, and a fresh 0..n-1 index would no longer line up with them by label.
X_train = pd.DataFrame(imputed_train_array, columns=feature_names, index=X_train.index)

imputed_train_array = pipeline.transform(X_test)
X_test = pd.DataFrame(imputed_train_array, columns=feature_names, index=X_test.index)

imputed_train_array = pipeline.transform(X_val)
X_val = pd.DataFrame(imputed_train_array, columns=feature_names, index=X_val.index)

### Numerical transformations

In [None]:
# Inspect the transformed training features.
X_train

In [None]:
# Write the transformed feature splits and their labels to the dataset directory.
for frame, filename in (
    (X_train, 'X_train.pq'),
    (X_test, 'X_test.pq'),
    (X_val, 'X_val.pq'),
    (pd.DataFrame(y_train), 'y_train.pq'),
    (pd.DataFrame(y_test), 'y_test.pq'),
    (pd.DataFrame(y_val), 'y_val.pq'),
):
    frame.to_parquet(path / filename)

### PCA

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Replace any remaining missing values with the column means learned on the
# training split (other strategies: "median", "most_frequent", etc.).
imputer = SimpleImputer(strategy="mean")
X_train_imputed, X_test_imputed, X_val_imputed = (
    imputer.fit_transform(X_train),
    imputer.transform(X_test),
    imputer.transform(X_val),
)

# Fit PCA on the training split and project the other two splits with it.
pca = PCA(n_components=0.85)
pca_train, pca_test, pca_val = (
    pca.fit_transform(X_train_imputed),
    pca.transform(X_test_imputed),
    pca.transform(X_val_imputed),
)

In [None]:
# (rows, retained components) of the projected training split.
pca_train.shape

In [None]:
# Explained-variance ratio of each retained component, in component order.
fig, ax = plt.subplots()
ax.plot(pca.explained_variance_ratio_)
ax.set_title('Percentage of Variance Explained')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Percentage of Variance Explained')

In [None]:
def diagnostic_plots(df, variable):
    """Show a histogram and a normal probability plot of one column, side by side.

    Parameters
    ----------
    df : DataFrame
        Frame holding the data.
    variable : hashable
        Label of the column to plot.
    """
    values = df[variable]
    _, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(10, 2))

    # Left panel: distribution of the values in 30 bins.
    values.hist(bins=30, ax=hist_ax)
    # Right panel: probability plot against the normal distribution.
    stats.probplot(values, dist="norm", plot=qq_ax)

    plt.show()

In [None]:
# Inspect the projected training split as a frame (one column per component).
pd.DataFrame(pca_train)

In [None]:
# Histogram and probability plot for the column labelled 3 (the fourth component).
diagnostic_plots(pd.DataFrame(pca_train), 3)

In [None]:
# Save each projected split, keeping at most its first 11 component columns.
for components, filename in (
    (pca_train, 'pca_train.pq'),
    (pca_test, 'pca_test.pq'),
    (pca_val, 'pca_val.pq'),
):
    pd.DataFrame(components).iloc[:, :11].to_parquet(path / filename)