In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
import joblib
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# 1. Splitting & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.base import BaseEstimator, TransformerMixin

# 2. Pipeline Construction
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 3. Preprocessing Steps
from sklearn.impute import SimpleImputer   
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder 
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import PowerTransformer
import ast

# 4. Model (Modeling)
from sklearn.linear_model import LinearRegression

# 5. Metrics (Metrics)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 6. Custom Transformer
import sys
import os
project_root = os.path.abspath("../")
sys.path.append(project_root)

from src.Custom_Transformer import *



## **Introduction**

This notebook's main goal is to evaluate how much model performance improves when moving from *raw data* to *fully processed and feature-engineered data*, as well as to obtain more reliable insights into which features most strongly influence anime scores.

The workflow is organized into three parts:

* **Part I: Baseline Model with Raw Data:** We train a simple **Linear Regression** model on minimally processed raw data. This baseline establishes the lower bound of model performance and highlights the limitations of noisy or inconsistent features.

* **Part II: Final Model with Processed Data:** We train a refined model using the transformed dataset produced in `01_DataPreprocessing`. This includes feature engineering, cleaning, encoding, and handling missing values—allowing the model to learn from higher-quality inputs.

* **Part III: Model Comparison & Insights:** We compare baseline and final models using metrics such as **R²** and **MAE**, and analyze feature importance to understand which variables have the strongest effect on anime Score.

This notebook ultimately demonstrates how proper preprocessing and feature engineering significantly enhance predictive accuracy and interpretability in the Score regression task.


## **Part I : Baseline Model with Raw Data**
*Goal: Train a simple **Linear Regression** model on minimally processed raw data.*


### **1. Basic Data Preparing**

In [2]:
# 1. Load Raw Data
path_raw = r'..\data\raw\anime-dataset-2023.csv'
df_raw = pd.read_csv(path_raw)
print('Dataset initial shape:', df_raw.shape)

# 2. Drop Unnecessary Columns
df_raw = df_raw.drop(['Licensors','Premiered', 'English name', 'Other name', 'Image URL','Synopsis','Rank','anime_id', 'Name','Popularity','Favorites','Scored By','Members'], axis = 1)

# 3. Standardized NaN and Drop all rows with NaN
# List of values to be treated as NaN (case-insensitive)
nan_like_values = ['unknown', 'not available', 'n/a', 'na', 'tbd', 'tba', '---']

# Iterate through object columns and replace NaN-like values
for col in df_raw.select_dtypes(include=['object']).columns:
    mask = df_raw[col].str.lower().isin(nan_like_values)
    df_raw.loc[mask.fillna(False), col] = np.nan

df_base = df_raw.dropna()

# 3. Change data type
numerical_cols = ['Score', 'Episodes']
for col in numerical_cols:
    df_base.loc[:, col] = pd.to_numeric(df_base[col], errors='coerce')

print('Dataset shape after basic preparing:', df_base.shape)


Dataset initial shape: (24905, 24)
Dataset shape after basic preparing: (7231, 11)


### **2. Splitting Train and Test Set (80/20)**

In [3]:
# 1. Split X (features) and y (target)
X_base = df_base.drop('Score', axis=1)
y_base = df_base['Score']

# 2. Perform the data split
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, 
                                                    test_size=0.2, 
                                                    random_state=42)

print("Baseline shapes:")
print(f"X_train shape: {X_train_base.shape}")
print(f"X_test shape: {X_test_base.shape}")


Baseline shapes:
X_train shape: (5784, 10)
X_test shape: (1447, 10)


### **3. Baseline Model**

To compute baseline metrics on the raw dataset, all categorical (`object`) columns must be converted into numeric form.  
We use **Ordinal Encoding** as a minimal, lightweight transformation that:

- Keeps the raw structure intact  
- Avoids complex preprocessing  
- Allows Linear Regression to run on the original feature set  

This creates a fair baseline for comparing against the fully processed dataset later.


In [4]:
# 1. Ordinal encode for object features

object_cols = X_train_base.select_dtypes(include=['object']).columns.tolist()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit encoder on train, then transform
X_train_enc = X_train_base.copy()
X_test_enc  = X_test_base.copy()

X_train_enc[object_cols] = encoder.fit_transform(X_train_base[object_cols])
X_test_enc[object_cols]  = encoder.transform(X_test_base[object_cols])

# 2. Train Linear Regression baseline
model_base = LinearRegression()
model_base.fit(X_train_enc, y_train_base)

# 3. Predict
y_pred_base = model_base.predict(X_test_enc)

# 4. Evaluating Metrics
r2_base = r2_score(y_test_base, y_pred_base)
mae_base = mean_absolute_error(y_test_base, y_pred_base)

print("\n=== Baseline Linear Regression ===")
print(f"R²  : {r2_base:.4f}")
print(f"MAE : {mae_base:.4f}")




=== Baseline Linear Regression ===
R²  : 0.0887
MAE : 0.6386


## **Part II : Final Model with Prepared Data**
*Goal: train a refined model using the transformed dataset and pipeline produced in `01_DataPreprocessing`*


### **1. Load Prepared Data**

In [5]:
# 1. Load prepared_data
path = r'..\data\processed\prepared_data.csv'
df1 = pd.read_csv(path)

# Drop rows where target 'Score' is NaN, and Drop redundant column
print('Shape before drop Cols, Rows', df1.shape)
df1 = df1.dropna(subset=['Score'])
df1 = df1.drop(['Aired Date Start','Aired Date End','anime_id', 'Name','Rank','Popularity','Favorites','Scored By','Members'], axis= 1)
print('Shape after drop Cols, Rows', df1.shape)

# Example
df1.head(1)

Shape before drop Cols, Rows (24905, 21)
Shape after drop Cols, Rows (15692, 12)


Unnamed: 0,Score,Genres,Type,Episodes,Status,Producers,Studios,Source,Rating,Aired Year,Aired Month,Duration Minutes
0,8.75,"['Action', 'Award Winning', 'Sci-Fi']",TV,26.0,Finished Airing,['Bandai Visual'],['Sunrise'],Original,R - 17+ (Violence & Profanity),1998.0,4.0,24.0


### **2. Splitting Train and Test Set (80/20)**

In [6]:
# Split X (features) and y (target)
X = df1.drop('Score', axis=1)
y = df1['Score']

# Perform the data split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (12553, 11)
X_test shape: (3139, 11)


### **3. Final Model**

In [7]:
# 1. Load Pipeline
print("--- LOADING PIPELINE ---")

pipeline_filename = r"..\models\processing_pipeline.pkl"
full_pipeline = joblib.load(pipeline_filename)

print("Pipeline loaded successfully!")

--- LOADING PIPELINE ---
Pipeline loaded successfully!


In [8]:
# 2. Evaluating with Cross-Validation
full_pipe = Pipeline(
   [('processor', full_pipeline),
   ('model', LinearRegression())]
)

# 3. Evaluating with CV
kfold = KFold(n_splits = 3, shuffle=True, random_state=42)

# R2
cv_r2 = cross_val_score(full_pipe, X, y, cv=kfold, scoring="r2")

# MAE 
cv_mae = cross_val_score(full_pipe, X, y, cv=kfold, scoring="neg_mean_absolute_error")

print("\n=== Cross Validation (5-fold) ===")
print(f"R² scores: {cv_r2}")
print(f"Mean R²:   {cv_r2.mean():.4f}")
print(f"Std  R²:   {cv_r2.std():.4f}")

print("\nMAE scores (positive):", -cv_mae)
print(f"Mean MAE:  {-cv_mae.mean():.4f}")
print(f"Std  MAE:  {cv_mae.std():.4f}")


=== Cross Validation (5-fold) ===
R² scores: [0.52840399 0.54222221 0.54802207]
Mean R²:   0.5395
Std  R²:   0.0082

MAE scores (positive): [0.48906706 0.48700216 0.48886486]
Mean MAE:  0.4883
Std  MAE:  0.0009


In [9]:
# 3. Final Evaluation on Test Set

X_train_process = full_pipeline.transform(X_train)
X_test_process = full_pipeline.transform(X_test)

# Training Model
model = LinearRegression()
model.fit(X_train_process, y_train)
y_pred = model.predict(X_test_process)

# Evaluating Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n=== Final Model ===")
print(f"R²  : {r2:.4f}")
print(f"MAE : {mae:.4f}")


=== Final Model ===
R²  : 0.5348
MAE : 0.4837


In [10]:
# 4. Save model in same folder as pipeline

model_path = r"..\models\model.pkl"
joblib.dump(model, model_path)

print("Model saved to:", model_path)


Model saved to: ..\models\model.pkl


## **Part III: Model Comparison & Insights**
*Goal: Demonstrates how proper preprocessing and feature engineering significantly enhance predictive accuracy and interpretation*


### **1. Evaluating Metrics**

The comparison chart clearly shows that the data preparation pipeline significantly improves model performance:

- **R² increases from 0.09 to 0.54**, indicating that the final model explains much more variance in the target variable after preprocessing, feature engineering, and encoding.
- **MAE drops from 0.64 to 0.48**, showing that prediction errors become smaller and more stable.
- The improvement suggests that raw data alone is insufficient for linear models; the engineered features, handling of categorical/multi-label variables, and robust scaling are essential for capturing meaningful relationships.


In [11]:
def plot_metric(r2_base, r2, mae_base, mae):
   # Create data
   data = {
      'Metric': ['R²', 'R²', 'MAE', 'MAE'],
      'Model': ['Baseline', 'Final', 'Baseline', 'Final'],
      'Value': [r2_base, r2, mae_base, mae]
   }

   df = pd.DataFrame(data)

   # Create plot
   fig = px.bar(
      df,
      x='Metric',
      y='Value',
      color='Model',
      color_discrete_map={
         'Baseline': "#D2D1D1",
         'Final': "#a82516"
      },
      barmode='group',
      title='<b>Metrics Before and After Data Preparation</b>'
   )

   # Adjust layout - ADD LEGEND SETTINGS
   fig.update_layout(
      bargap=0.5,
      bargroupgap=0.25,
      yaxis_range=[0, 0.8],
      plot_bgcolor='white',
      font=dict(size=13),
      showlegend=True,
      title_x=0.5,  # Center title
      legend=dict(
         x=0.9,        # Horizontal position: 0 is leftmost, 1 is rightmost
         y=0.98,       # Vertical position: 0 is bottom, 1 is top
         xanchor='left', # Anchor to left
         yanchor='top',  # Anchor to top
         bgcolor='rgba(255, 255, 255, 0.8)',  # Semi-transparent white background
      )
   )

   # X-axis formatting: Bold R² and MAE
   fig.update_xaxes(
      title_text='',
      tickangle=0,
      tickfont=dict(size=13, family='Arial', color='black'),
   )

   # Assign bold labels for R² and MAE
   fig.update_xaxes(ticktext=['<b>R²</b>', '<b>MAE</b>'], tickvals=['R²', 'MAE'])

   # Hide Y axis
   fig.update_yaxes(
      title_text='',
      showticklabels=False,
      showgrid=False
   )

   # Display values on bars
   fig.update_traces(
      texttemplate='%{y:.2f}',
      textposition='outside'
   )

   fig.show()

plot_metric(r2_base, r2, mae_base, mae)

### **2. Feature Importance (based on Linear Regression Coefficients)**

After preprocessing, the model shifts from using weak signals (e.g., *Status*, *Rating*) in the baseline to much stronger and more meaningful features:

- `Producers` and `Studios` become the dominant predictors, showing that production teams carry major influence on anime quality and final scores.
- `Genres`, `Source`, and `Type` also gain importance once properly encoded.
- **Data preparation** unlocks the true predictive power of categorical and multi-label features, giving the final model much clearer and stronger feature signals than the baseline.

In [12]:
# Create Subplot with 1 row, 2 columns
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        '<b>Baseline Model</b>',
        '<b>Final Model</b>'
    ),
    horizontal_spacing=0.35,
    vertical_spacing=0.1
)

# === FIGURE 1: BASELINE MODEL ===
feature_name = X_train_base.columns.tolist()
coefs_base = model_base.coef_

coef_df_base = pd.DataFrame({
    "feature": feature_name,
    "coef": coefs_base,
})

coef_df_base["abs_coef"] = coef_df_base["coef"].abs()

original_features = ['Genres', 'Type', 'Episodes', 'Status', 'Producers', 
                     'Studios', 'Source', 'Rating', 'Aired Year', 'Aired Month', 'Duration Minutes']

feature_to_original_group_base = {}
for feature in feature_name:
    assigned = False
    for orig_feat in original_features:
        if (feature == orig_feat or 
            feature.startswith(orig_feat + '_') or 
            feature.startswith(orig_feat + '__') or
            orig_feat in feature):
            feature_to_original_group_base[feature] = orig_feat
            assigned = True
            break
    if not assigned:
        feature_to_original_group_base[feature] = "Other"

coef_df_base["original_group"] = coef_df_base["feature"].map(feature_to_original_group_base)
group_importance_base = coef_df_base.groupby("original_group")["abs_coef"].sum().sort_values(ascending=False)
group_importance_df_base = group_importance_base.head(5).reset_index()
group_importance_df_base.columns = ["original_group", "total_importance"]

# Add baseline bar chart
fig.add_trace(
    go.Bar(
        x=group_importance_df_base['total_importance'],
        y=group_importance_df_base['original_group'],
        orientation='h',
        marker=dict(
            color=group_importance_df_base['total_importance'],
            colorscale=[(0, '#fce8e6'), (0.8, "#b4473b"), (1, '#a82516')],
            showscale=False,
            line=dict(width=0)
        ),
        name='Baseline',
        hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>',
        text=group_importance_df_base['total_importance'].round(3),
        textposition='outside',
        textfont=dict(size=14),
        insidetextanchor='start',
        cliponaxis=False,  
    ),
    row=1, col=1
)

# === FIGURE 2: FINAL MODEL ===
try:
    feature_names = full_pipeline.named_steps["preprocessor"].get_feature_names_out()
except:
    feature_names = np.array([f"feat_{i}" for i in range(X_train_process.shape[1])])

coefs = model.coef_

coef_df_final = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs,
})

coef_df_final["abs_coef"] = coef_df_final["coef"].abs()

feature_to_original_group_final = {}

for feature in feature_names:

    # NEW RULE → Aired Year + Aired Month + SeasonCat → Aired Time Features
    if ("Aired Year" in feature) or ("Aired Month" in feature):
        feature_to_original_group_final[feature] = "Aired Time Features"
        continue

    assigned = False

    # Existing custom rules
    if 'Aired Year_deg' in feature:
        feature_to_original_group_final[feature] = 'Aired Time Features'
        continue

    elif any(x in feature for x in ['Genres_Count', 'Genres__']):
        feature_to_original_group_final[feature] = 'Genres'
        continue

    elif any(x in feature for x in ['Producers_Count', 'Producers__']):
        feature_to_original_group_final[feature] = 'Producers'
        continue

    elif any(x in feature for x in ['Studios_Count', 'Studios__']):
        feature_to_original_group_final[feature] = 'Studios'
        continue

    elif 'Episodes_x_Duration Minutes' in feature:
        feature_to_original_group_final[feature] = 'Episodes, Duration Minutes'
        continue

    elif 'EpisodesCat' in feature:
        feature_to_original_group_final[feature] = 'Episodes'
        continue

    elif 'DurationCat' in feature:
        feature_to_original_group_final[feature] = 'Duration Minutes'
        continue

    elif any(x in feature for x in ['Aired Month_sin', 'Aired Month_cos']):
        feature_to_original_group_final[feature] = 'Aired Time Features'
        continue

    # Fallback rules
    for orig_feat in original_features:
        if (feature == orig_feat or 
            feature.startswith(orig_feat + '_') or 
            feature.startswith(orig_feat + '__') or
            orig_feat in feature):
            feature_to_original_group_final[feature] = orig_feat
            assigned = True
            break

    if not assigned:
        feature_to_original_group_final[feature] = "Other"

coef_df_final["original_group"] = coef_df_final["feature"].map(feature_to_original_group_final)
group_importance_final = coef_df_final.groupby("original_group")["abs_coef"].sum().sort_values(ascending=False)
group_importance_df_final = group_importance_final.head(5).reset_index()
group_importance_df_final.columns = ["original_group", "total_importance"]

# Add final bar chart
fig.add_trace(
    go.Bar(
        x=group_importance_df_final['total_importance'],
        y=group_importance_df_final['original_group'],
        orientation='h',
        marker=dict(
            color=group_importance_df_final['total_importance'],
            colorscale=[(0, "#fcefee"), (0.3, "#f4dfde"), (0.8, "#a52a1c"),(1, '#a82516')],
            showscale=False,
            line=dict(width=0)
        ),
        name='Final',
        hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>',
        text=group_importance_df_final['total_importance'].round(3),
        textposition='outside',
        textfont=dict(size=14)  # Font size for numbers on bars
    ),
    row=1, col=2
)

# Update overall layout 
fig.update_layout(
    title=dict(
        text='<b>Feature Importance Before and After Data Preparation</b>',
        x=0.5,
        y=0.95,
        font=dict(size=24, family='Arial')
    ),
    height=500,
    width=1400,
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin=dict(t=140, b=100, l=180, r=100),
    font=dict(family='Arial')
)

# Common axes - SAME FONT SIZE FOR FEATURE NAMES
fig.update_xaxes(
    title_text="<b>Total Absolute Coefficient</b>",
    title_font=dict(size=18, family='Arial'),
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgrey',
    zeroline=True,
    zerolinewidth=2,
    zerolinecolor='black'
)

fig.update_yaxes(
    title_text="",
    showgrid=False,
    tickfont=dict(size=16),  # Font size for feature names on Y axis
    ticklabelposition="outside",
    ticklabelstandoff=15
)

# Specific subplot settings
fig.update_yaxes(autorange='reversed', row=1, col=1)
fig.update_yaxes(autorange='reversed', row=1, col=2)

# Synchronize X axis
max_x = max(group_importance_df_base['total_importance'].max(), 
            group_importance_df_final['total_importance'].max()) * 1.15

fig.update_xaxes(range=[0, max_x], row=1, col=1)
fig.update_xaxes(range=[0, max_x], row=1, col=2)

# UPDATE SUBPLOT TITLES 
fig.update_annotations(
    font=dict(size=20, family='Arial', color='#2c3e50'),  
    y=1.05
)

fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_xaxes(showticklabels=False, row=1, col=2)
fig.update_xaxes(showgrid=False, row=1, col=1)
fig.update_xaxes(showgrid=False, row=1, col=2)
fig.update_layout(bargap=0.3)

fig.show()