# Imports and configurations

In [1]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy import stats



In [2]:
# Route DataFrame/Series `.plot()` calls through Plotly
pd.set_option("plotting.backend", "plotly")

# Two-decimal number formatting for NumPy and pandas output
np.set_printoptions(suppress=True, precision=2)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", '{:.2f}'.format)
# pd.options.display.max_columns  (intentionally left unset)


# Helper Functions

In [3]:
def plot_feature_target_scatter(df, features, target_variable):
    """
    Displays a figure with one scatter plot per feature, each showing that
    feature (x-axis) against the target variable (y-axis).

    Grid layout: up to 2 features share a single row, 3-4 features use a
    2x2 grid, and 5 or more features use 3 columns with as many rows as are
    needed to hold every plot.

    Args:
        df (pd.DataFrame): DataFrame containing features and target variable.
        features (list): Column names to be considered as features. Any sized
            sequence of column names works (e.g. a pandas Index).
        target_variable (str): Name of the target variable column.

    Returns:
        None. The figure is rendered with ``fig.show()``. If ``features`` is
        empty, a message is printed and nothing is plotted.
    """

    # Use len() rather than truthiness: `features` may be a pandas Index,
    # whose boolean value is ambiguous.
    num_features = len(features)
    if num_features == 0:
        print("No features provided to plot.")
        return

    # Determine subplot grid layout
    if num_features <= 2:
        rows, cols = 1, num_features
    elif num_features <= 4:
        rows, cols = 2, 2
    else:
        cols = 3
        # Ceiling division so the grid always has at least `num_features`
        # cells (e.g. 7 features -> 3 rows, not 2).
        rows = (num_features + cols - 1) // cols

    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=[f'Feature vs. Target: {feature}' for feature in features])

    for i, feature in enumerate(features):
        # Fill the grid row by row; plotly subplot indices are 1-based.
        row_index = (i // cols) + 1
        col_index = (i % cols) + 1

        scatter_trace = go.Scatter(
            x=df[feature],
            y=df[target_variable],
            mode='markers',
            marker=dict(color='blue', size=5),
            name=feature
        )
        fig.add_trace(scatter_trace, row=row_index, col=col_index)

        fig.update_xaxes(title_text=feature, row=row_index, col=col_index)
        fig.update_yaxes(title_text=target_variable, row=row_index, col=col_index)

    fig.update_layout(title_text="Feature vs Target Variable Scatter Plots", showlegend=False)
    fig.show()

# Load data

In [4]:
df = sns.load_dataset('diamonds')

# EDA

## Target variable

In [5]:
px.histogram(df, x='price', nbins=250)

✅ **transforming target variable with log transformation might help**

## Features vs target variable

In [None]:
df.columns

In [6]:
plot_feature_target_scatter(df, df.select_dtypes('number').drop(columns='price').columns, 'price')

Output hidden; open in https://colab.research.google.com to view.

✅ **carat, x, y, z might require transformations**

✅ **there are some very extreme outliers that might need to be removed**

## Plot transformations

In [7]:
px.histogram(np.log(df['price']), nbins=250)

In [8]:
transformed, lambda_value = stats.boxcox(df['price'])
px.histogram(transformed, nbins=250)

In [9]:
px.histogram(df['price'] ** 0.5, nbins=250)

In [10]:
px.scatter(x=df['carat'], y=np.log(df['price']))

In [11]:
px.scatter(x=np.log(df['carat']), y=np.log(df['price']))

In [None]:
# log(x+1)

### log1p

In [15]:
px.scatter(x=np.log1p(df['z']), y=np.log(df['price']))

## Correlations

In [16]:
corr = pd.get_dummies(df, drop_first=True).select_dtypes(['number', 'bool']).corr()

In [17]:
px.imshow(corr, color_continuous_scale='Cividis')

# Plan

- transformations for target and features
- categorical features - encoding required (one-hot / ordinal)
- feature selection
- outliers (that probably mean incorrect data, like zeros in x, y, z) - should be removed
- normalization
- splitting into multiple models
- other suggestions?

# Advanced sklearn


## Encoders

In [18]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [19]:
one_hot_encoder =OneHotEncoder()

In [None]:
df[['color']]

In [21]:
one_hot_encoder.fit(df[['color']])

In [22]:
one_hot_encoder.get_feature_names_out()

array(['color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I',
       'color_J'], dtype=object)

In [24]:
one_hot_encoder.transform(df[['color']]).toarray()

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [25]:
transformed = one_hot_encoder.transform(df[['color']]).toarray()

In [None]:
transformed

In [26]:
pd.DataFrame(transformed, columns=one_hot_encoder.get_feature_names_out())

Unnamed: 0,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.00,1.00,0.00,0.00,0.00,0.00,0.00
1,0.00,1.00,0.00,0.00,0.00,0.00,0.00
2,0.00,1.00,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00,0.00,1.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...
53935,1.00,0.00,0.00,0.00,0.00,0.00,0.00
53936,1.00,0.00,0.00,0.00,0.00,0.00,0.00
53937,1.00,0.00,0.00,0.00,0.00,0.00,0.00
53938,0.00,0.00,0.00,0.00,1.00,0.00,0.00


In [None]:
# X_train['color'].value_counts()

## Transformer

In [28]:
transformer = FunctionTransformer(np.log, validate=True)

In [29]:
transformer.fit(df[['price']])

In [30]:
transformer.transform(df[['price']])

array([[5.79],
       [5.79],
       [5.79],
       ...,
       [7.92],
       [7.92],
       [7.92]])

## Pipeline

In [31]:
pipe = Pipeline([
    ('log', FunctionTransformer(np.log1p, validate=True)),
    ('scaler', StandardScaler())
])

In [32]:
pipe.fit(df[['carat']])

In [33]:
pipe.transform(df[['carat']])

array([[-1.42],
       [-1.49],
       [-1.42],
       ...,
       [-0.1 ],
       [ 0.27],
       [ 0.02]])

## Bringing it all together with ColumnTransformer

In [34]:
# Column groups, by the kind of preprocessing each receives
categorical_features = ['cut', 'color', 'clarity']
numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']


def _log_then_scale(log_func):
    """Return a two-step pipeline: apply ``log_func``, then standardize."""
    return Pipeline([
        ('log', FunctionTransformer(log_func, validate=True)),
        ('scaler', StandardScaler())
    ])


def _price_scale_scores(fitted_model, features, log_target):
    """Predict with ``fitted_model`` and score on the original price scale.

    Both the predictions and ``log_target`` are passed through ``np.exp`` to
    undo the log transformation before scoring.

    Returns:
        tuple: (predictions, mse, mape, r2)
    """
    predicted = np.exp(fitted_model.predict(features))
    actual = np.exp(log_target)
    return (
        predicted,
        mean_squared_error(actual, predicted),
        metrics.mean_absolute_percentage_error(actual, predicted),
        r2_score(actual, predicted),
    )


# Numerical features: log1p + scale for x/y/z, log + scale for carat,
# scale only for depth/table
num_transformers = ColumnTransformer(
    transformers=[
        ('xyz_transform', _log_then_scale(np.log1p), ['x', 'y', 'z']),
        ('carat_scaler', _log_then_scale(np.log), ['carat']),
        ('scaler_only', StandardScaler(), ['depth', 'table']),
    ]
)

# Full preprocessing: numerical transformers + one-hot encoded categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformers, numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ]
)

# Preprocessing followed by a linear model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Features / log-transformed target (alternative: np.log1p(df['price']))
X = df.drop(columns=['price'])
y = np.log(df['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Evaluate on both splits, in the original price scale
y_pred, mse, mape, r2 = _price_scale_scores(pipeline, X_test, y_test)
y_pred_train, mse_train, mape_train, r2_train = _price_scale_scores(pipeline, X_train, y_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])


Unnamed: 0,Test,Train
MSE,676716.07,632977.89
MAPE,0.1,0.1
R2,0.96,0.96


### ❓**Exercise: Remove outliers according to scatter plots and train the model again to see whether there is improvement**

### ❓**Exercise: Try using OrdinalEncoder instead of OneHotEncoder - is there any change to model performance?**

## Adding Feature Selection

In [None]:
# Column groups
categorical_features = ['cut', 'color', 'clarity']
numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Step lists for the two log-then-standardize pipelines
xyz_steps = [
    ('log', FunctionTransformer(np.log1p, validate=True)),
    ('scaler', StandardScaler()),
]
carat_steps = [
    ('log', FunctionTransformer(np.log, validate=True)),
    ('scaler', StandardScaler()),
]

# Numerical preprocessing, one entry per column group
num_transformers = ColumnTransformer(transformers=[
    ('xyz_transform', Pipeline(xyz_steps), ['x', 'y', 'z']),
    ('carat_scaler', Pipeline(carat_steps), ['carat']),
    ('depth_transform', StandardScaler(), ['depth']),
    ('table_scaler', StandardScaler(), ['table']),
])

# Numerical preprocessing + one-hot encoded categoricals
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformers, numerical_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
])

# Feature selection: keep the 8 best features as scored by f_regression
feature_selector = SelectKBest(score_func=f_regression, k=8)

# Model pipeline: preprocess -> select features -> linear regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('regressor', LinearRegression()),
])

# Features and log-transformed target, split 80/20
X = df.drop(columns=['price'])
y = np.log(df['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Undo the log transformation so every metric is in the original price scale
price_test = np.exp(y_test)
price_train = np.exp(y_train)
y_pred = np.exp(pipeline.predict(X_test))
y_pred_train = np.exp(pipeline.predict(X_train))

# Test-set metrics
mse = mean_squared_error(price_test, y_pred)
r2 = r2_score(price_test, y_pred)
mape = metrics.mean_absolute_percentage_error(price_test, y_pred)

# Train-set metrics
mse_train = mean_squared_error(price_train, y_pred_train)
r2_train = r2_score(price_train, y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(price_train, y_pred_train)

metrics_dict = dict(
    Test=[mse, mape, r2],
    Train=[mse_train, mape_train, r2_train],
)

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

## Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
poly_features = PolynomialFeatures(degree=2)

In [None]:
poly_features.fit(X[['carat', 'x', 'y', 'z', 'table', 'depth']])

In [None]:
poly_features.get_feature_names_out()

In [None]:
poly = pd.DataFrame(poly_features.transform(X[['carat', 'x', 'y', 'z', 'table', 'depth']]), columns=poly_features.get_feature_names_out())

In [None]:
poly

In [None]:
poly['price'] = y

In [None]:
poly

In [None]:
px.scatter(poly, x='x^2', y='price')

In [None]:
poly.columns

In [None]:
plot_feature_target_scatter(poly, ['carat table', 'carat depth'], 'price')

### ❓**Exercise: try adding polynomial features to diamonds, and train the model to see whether there are improvements**