### Split and Filter data

In [None]:
# Hold out 30 % of the rows as a test set; the fixed seed makes the split
# reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Number of distinct values per training column, largest first.
unique_values_counts = X_train.nunique().sort_values(ascending=False)

# Fraction of missing values per training column, and the labels of the
# columns where more than 30 % of the values are missing.
# NOTE: the columns are only identified here; nothing is dropped from X_train.
missing = X_train.isna().sum().div(len(X_train))
features_incomplete = missing.loc[missing.gt(0.3)].index
features_incomplete

# Histograms of the raw (untransformed) training features.
# The plots are drawn from a deep copy so that later EDA steps work on
# eda_train_set and leave X_train itself untouched.
import matplotlib.pyplot as plt
eda_train_set = X_train.copy()  # DataFrame.copy() is deep by default
eda_train_set.hist(figsize=(15, 8), bins=30)
plt.gcf().suptitle('Numerical data histogram', fontsize=16)
plt.show()

# Split the EDA copy by dtype: object columns are treated as categorical,
# NumPy-numeric columns as numerical.
import numpy as np
cat_data = eda_train_set.select_dtypes(include=['object'])
num_data = eda_train_set.select_dtypes(include=[np.number])

# Replace missing numerical values with the column mean.
# By default SimpleImputer returns a plain NumPy array, which has no .corr()
# method and no column labels. set_output(transform="pandas") makes it return
# a DataFrame that keeps the column names, so the correlation step below works
# and the heatmap axes are labelled.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean").set_output(transform="pandas")
num_data = imputer.fit_transform(num_data)

# Pairwise correlation of the imputed numerical features, shown as an
# annotated heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
correlation_matrix = num_data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
plt.show()

### Transform data

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Standard-scaling example: the scaler is fitted on the training split only,
# and the fitted scaler is then applied to both splits.
# NOTE(review): this scales every column of X_train, not a single column as
# the original comment suggested, and presumably requires all columns to be
# numeric - confirm the intended input.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Column-wise preprocessing example. Each entry is (name, transformer, columns):
#   - log1p on "Wind_Speed"
#   - standard scaling on "Temperature", "X-Windv" and "Y-Windv"
#   - cosine of "Dir"
#     NOTE(review): np.cos expects radians - confirm the unit of "Dir".
# Columns not listed are discarded (remainder="drop").
preprocessing_pipeline = ColumnTransformer(
    [
        ("log_transform", FunctionTransformer(np.log1p), ["Wind_Speed"]),
        (
            "standard_transform",
            StandardScaler(),
            ["Temperature", "X-Windv", "Y-Windv"],
        ),
        ("dir_transform", FunctionTransformer(np.cos), ["Dir"]),
    ],
    remainder="drop",
)

# custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTransform(BaseEstimator, TransformerMixin):
    """Skeleton for a custom scikit-learn transformer.

    As written it is a stateless identity transformer: ``fit`` learns
    nothing and ``transform`` hands the input back unchanged. Replace the
    body of ``transform`` with the real feature logic.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so calls can be chained.

        ``y`` is accepted and ignored so the signature matches what
        scikit-learn passes to ``fit``.
        """
        return self

    def transform(self, X):
        """Return ``X`` unchanged.

        A transformer has to return the transformed data; returning
        ``None`` would make ``fit_transform`` yield ``None`` and break any
        pipeline step that consumes this transformer's output.
        """
        return X
    
# One-hot encoding of the categorical features.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  # used below; was never imported in this cell
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Two steps, run in order:
#   1. fill missing categories with the most frequent value of each column,
#   2. expand every category into its own 0/1 column
#      (sparse_output=False returns a dense array rather than a sparse matrix).
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False))
])

# Fit both steps on the categorical training columns and encode them.
cat_data_encoded = categorical_pipeline.fit_transform(cat_data)