# Imports and configurations

In [None]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy import stats



In [None]:
# Notebook-wide plotting and display configuration.

# Use Plotly as the pandas plotting backend.
pd.set_option("plotting.backend", "plotly")

# Print numbers with two decimals, without scientific notation in NumPy output.
np.set_printoptions(suppress=True, precision=2)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", '{:.2f}'.format)
# Optional: adjust pd.options.display.max_columns to control how many columns are shown.


# Load data

In [None]:
# Load seaborn's "diamonds" example dataset. The cells below use its price, carat, depth,
# table, x, y, z, cut, color and clarity columns.
df = sns.load_dataset('diamonds')

# Advanced sklearn


## Adding FeatureSelection

In [None]:
# Define categorical and numerical features
categorical_features = ['cut', 'color', 'clarity']
numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Define transformations for numerical features.
# x/y/z use log1p (stays finite when a value is 0), carat uses a plain log;
# every numeric column is standardised afterwards.
num_transformers = ColumnTransformer(
    transformers=[
        ('xyz_transform', Pipeline([
            ('log', FunctionTransformer(np.log1p, validate=True)),
            ('scaler', StandardScaler())
        ]), ['x', 'y', 'z']),
        ('carat_scaler', Pipeline([
            ('log', FunctionTransformer(np.log, validate=True)),
            ('scaler', StandardScaler())
        ]), ['carat']),
        ('depth_transform', StandardScaler(), ['depth']),
        ('table_scaler', StandardScaler(), ['table']),
    ]
)

# Feature selection: keep the 5 columns with the highest univariate F-score
feature_selector = SelectKBest(score_func=f_regression, k=5)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformers, numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ]
)

# Define model pipeline: preprocess -> select features -> linear regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('regressor', LinearRegression())
])


def _price_scale_metrics(model, features, log_target):
    """Evaluate ``model`` on ``features`` with errors measured on the original price scale.

    The model is trained on log(price), so both the predictions and the true
    targets are passed through ``np.exp`` before any metric is computed.

    Parameters
    ----------
    model : fitted estimator whose ``predict`` returns log-price.
    features : feature frame handed to ``model.predict``.
    log_target : true target values on the log scale.

    Returns
    -------
    tuple
        ``(predicted_price, mse, mape, r2)``.
    """
    actual_price = np.exp(log_target)                   # undo the log once, reuse for every metric
    predicted_price = np.exp(model.predict(features))   # reverse log transformation
    return (
        predicted_price,
        mean_squared_error(actual_price, predicted_price),
        metrics.mean_absolute_percentage_error(actual_price, predicted_price),
        r2_score(actual_price, predicted_price),
    )


# Split dataset
X = df.drop(columns=['price'])
y = np.log(df['price'])  # Apply log transformation to target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split and on the training split (to spot over/under-fitting)
y_pred, mse, mape, r2 = _price_scale_metrics(pipeline, X_test, y_test)
y_pred_train, mse_train, mape_train, r2_train = _price_scale_metrics(pipeline, X_train, y_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

# Side-by-side comparison; row order matches the lists above
pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

## Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 polynomial expansion; later cells reference generated columns such as
# 'x^2', 'x y' and 'carat table'.
poly_features = PolynomialFeatures(degree=2)

In [None]:
# Fit the degree-2 expansion on the six numeric columns.
poly_features.fit(
    X[["carat", "x", "y", "z", "table", "depth"]]
)

In [None]:
# Names of the generated columns; they are used as DataFrame headers in the next cell.
poly_features.get_feature_names_out()

In [None]:
# Wrap the transformed array in a DataFrame labelled with the generated feature names.
# index=X.index keeps the original row labels, so label-aligned assignments from X / y
# (e.g. attaching the target as a column) match rows explicitly instead of relying on X
# having a default 0..n-1 index.
poly = pd.DataFrame(
    poly_features.transform(X[['carat', 'x', 'y', 'z', 'table', 'depth']]),
    columns=poly_features.get_feature_names_out(),
    index=X.index,
)

In [None]:
# Inspect the polynomial feature matrix before the target column is attached.
poly

In [None]:
# Attach the target as a column for plotting; the assignment aligns `y` to `poly` by index label.
# NOTE(review): `y` was defined as np.log(df['price']), so this 'price' column holds
# log-price, not the raw price — confirm that is intended for the scatter plots below.
poly['price'] = y

In [None]:
# Inspect the feature matrix again, now including the 'price' column.
poly

In [None]:
# Linear term: 'x' against the 'price' column of poly.
px.scatter(data_frame=poly, x="x", y="price")

In [None]:
# Squared term: 'x^2' against the 'price' column of poly.
px.scatter(data_frame=poly, x="x^2", y="price")

In [None]:
# Interaction term: 'x y' against the 'price' column of poly.
px.scatter(data_frame=poly, x="x y", y="price")

In [None]:
# Interaction term: 'carat table' against the 'price' column of poly.
px.scatter(data_frame=poly, x="carat table", y="price")

### ❓**Exercise: add polynomial features to the diamonds dataset and retrain the model to see whether the results improve**

- try degree 2 and 3
- try toggling `interaction_only` (True/False)
- use pipelines and feature selection (e.g. `SelectKBest`) to see whether you can improve the results