In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import calendar
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Pandas display options for better viewing
pd.set_option('display.max_columns', 50)

## **Introduction**

This notebook serves as the central hub for our entire data preprocessing workflow. The process is strategically divided into two distinct phases to ensure data integrity and prevent data leakage, following best practices in machine learning.

*   **Part I: Pre-Split Cleaning (Execution):** This part involves executing "stateless" cleaning operations that do not depend on the overall dataset statistics (e.g., fixing data types, standardizing text formats, restructuring columns). The output of this part is a structurally sound file (`prepared_data.csv`) intended for human-readable Exploratory Data Analysis (EDA).

*   **Part II: Post-Split Pipeline (Definition):** This part involves **defining** the "stateful" transformations that learn parameters from the data (e.g., Imputation, Scaling). These steps are encapsulated into a Scikit-learn `Pipeline`. This pipeline object is the final deliverable of this notebook, ready to be used for model training in the next phase.

## **Part I : Pre-Split Cleaning**
*Goal: Transform the raw data (`anime-dataset-2023.csv`) into a clean, structurally correct dataset (`prepared_data.csv`).*


### **1. Load Data**

In [8]:
# path = kagglehub.dataset_download("dbdmobile/myanimelist-dataset")
# csv_file_path = f"{path}/anime-dataset-2023.csv"
current_dir = Path.cwd()
project_root = current_dir.parent 
raw_data_path = project_root / "data" / "raw" / "anime-dataset-2023.csv"

df = pd.read_csv(raw_data_path)

print(f"Dataset loaded successfully from: {raw_data_path}")
print(f"Initial shape: {df.shape[0]} rows, {df.shape[1]} columns")

# --- Initial Data Inspection ---
print("\nDataFrame Info:")
df.info()

print("\nFirst 5 rows of the raw data:")
display(df.head())

Dataset loaded successfully from: c:\Users\ADMIN\Documents\KY5\DATA_VID\Project\DSEB65A_Gr1_anime_data_storytelling\data\raw\anime-dataset-2023.csv
Initial shape: 24905 rows, 24 columns

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   anime_id      24905 non-null  int64 
 1   Name          24905 non-null  object
 2   English name  24905 non-null  object
 3   Other name    24905 non-null  object
 4   Score         24905 non-null  object
 5   Genres        24905 non-null  object
 6   Synopsis      24905 non-null  object
 7   Type          24905 non-null  object
 8   Episodes      24905 non-null  object
 9   Aired         24905 non-null  object
 10  Premiered     24905 non-null  object
 11  Status        24905 non-null  object
 12  Producers     24905 non-null  object
 13  Licensors     24905 non-null  object
 14  Studios    

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,Premiered,Status,Producers,Licensors,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",spring 1998,Finished Airing,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",UNKNOWN,Finished Airing,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",spring 1998,Finished Airing,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",summer 2002,Finished Airing,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",fall 2004,Finished Airing,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


### **2. Standardize "NaN-like" Values**

Many object columns use string literals to represent missing data. We'll replace them with np.nan.

In [9]:
# List of values to be treated as NaN (case-insensitive)
nan_like_values = ['unknown', 'not available', 'n/a', 'na', 'tbd', 'tba', '---']

# Store initial null counts
initial_nulls = df.isnull().sum()

# Iterate through object columns and replace NaN-like values
for col in df.select_dtypes(include=['object']).columns:
    # The .str accessor handles NaNs automatically.
    # The 'na' parameter is invalid for .isin() and has been removed.
    # A boolean mask is created for strings that match our NaN-like values.
    mask = df[col].str.lower().isin(nan_like_values)

    # We use .loc with the mask to assign np.nan. .fillna(False) on the mask
    # ensures that existing NaN values are not included in the mask.
    df.loc[mask.fillna(False), col] = np.nan


print("Standardization of 'NaN-like' values complete.")

# --- Check after Standardization ---
final_nulls = df.isnull().sum()
changed_cols = (final_nulls - initial_nulls)[(final_nulls - initial_nulls) > 0].sort_values(ascending=False)
print("\nColumns with increased NaN counts after standardization:")
print(changed_cols)

Standardization of 'NaN-like' values complete.

Columns with increased NaN counts after standardization:
Licensors       20170
Premiered       19399
English name    14577
Producers       13350
Studios         10526
Score            9213
Scored By        9213
Genres           4929
Rank             4612
Source           3689
Aired             915
Rating            669
Duration          663
Episodes          611
Other name        128
Type               74
dtype: int64


### **3. Handle Placeholder Zeros**
The values `0` in `Rank` and `Popularity` don't represent a true zero value but rather a missing or unassigned one. These should be converted to `np.nan`.

In [10]:
cols_with_placeholder_zeros = ['Rank', 'Popularity']
df[cols_with_placeholder_zeros] = df[cols_with_placeholder_zeros].replace(0, np.nan)
print("Placeholder zeros in 'Rank' and 'Popularity' replaced with np.nan.")

# --- Check after Handling Zeros ---
print("\nMissing values count for Rank and Popularity:")
print(df[['Rank', 'Popularity']].isnull().sum())

Placeholder zeros in 'Rank' and 'Popularity' replaced with np.nan.

Missing values count for Rank and Popularity:
Rank          4612
Popularity     187
dtype: int64


### **4. Process Complex & Temporal Columns**


#### **4.1. `Aired` Column**
We'll parse the `Aired` string to extract start and end dates into proper `datetime` columns.

In [11]:
def parse_aired_string_ultimate(aired_str):
    start_date, end_date, start_year, start_month = pd.NaT, pd.NaT, np.nan, np.nan
    if not isinstance(aired_str, str):
        return start_date, end_date, start_year, start_month

    month_map = {name.lower(): i for i, name in enumerate(calendar.month_abbr) if i > 0}

    def parse_part(part_str, is_end_date=False):
        date_val, year_val, month_val = pd.NaT, np.nan, np.nan
        try:
            part_str_lower = part_str.lower()
            year_match = re.search(r'\b(\d{4})\b', part_str_lower)
            if year_match: year_val = int(year_match.group(1))

            found_month_name = None
            for month_name, month_num in month_map.items():
                if month_name in part_str_lower:
                    month_val = month_num
                    found_month_name = month_name
                    break

            day_match = re.search(r'\b(\d{1,2})\b', part_str_lower)
            day_present = bool(day_match) and not (day_match.group(1) == str(year_val) and found_month_name is None)

            if year_val and month_val and day_present:
                date_val = pd.to_datetime(part_str, errors='coerce')
            elif is_end_date and year_val and month_val:
                _, last_day = calendar.monthrange(int(year_val), int(month_val))
                date_val = pd.to_datetime(f"{int(year_val)}-{int(month_val)}-{last_day}", errors='coerce')

        except (ValueError, TypeError): pass
        return date_val, year_val, month_val

    parts = aired_str.strip().split(' to ')
    if len(parts) >= 1: start_date, start_year, start_month = parse_part(parts[0], is_end_date=False)
    if len(parts) == 2: end_date, _, _ = parse_part(parts[1], is_end_date=True)
    return start_date, end_date, start_year, start_month

parsed_info = df['Aired'].apply(parse_aired_string_ultimate)
df[['air_date_start', 'air_date_end', 'air_year', 'air_month']] = pd.DataFrame(parsed_info.tolist(), index=df.index)
df = df.drop(columns=['Aired'])
print("Ultimate parsing of 'Aired' column complete.")
# --- Verification Step ---
print("\nVerifying the new logic by checking data types and null counts:")
print(df[['air_year', 'air_month', 'air_date_start', 'air_date_end','air_month']].info())

Ultimate parsing of 'Aired' column complete.

Verifying the new logic by checking data types and null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   air_year        23990 non-null  float64       
 1   air_month       20750 non-null  float64       
 2   air_date_start  20090 non-null  datetime64[ns]
 3   air_date_end    9491 non-null   datetime64[ns]
 4   air_month       20750 non-null  float64       
dtypes: datetime64[ns](2), float64(3)
memory usage: 973.0 KB
None


#### **4.2. `Duration` Column**
Convert the duration string (e.g., "24 min. per ep.") into a numerical column representing total minutes.

In [12]:
def parse_duration_to_total_minutes(duration_str):
    """
    Parses a string from the 'Duration' column into a single numerical value
    representing the total duration in minutes. It handles hours, minutes, and seconds.
    """
    if pd.isna(duration_str):
        return np.nan

    # Use float for total_minutes to handle fractional values from seconds
    total_minutes = 0.0
    duration_str = str(duration_str).lower()

    # Regex to find numbers associated with 'hr', 'min', and 'sec'
    hour_match = re.search(r'(\d+)\s*hr', duration_str)
    min_match = re.search(r'(\d+)\s*min', duration_str)
    sec_match = re.search(r'(\d+)\s*sec', duration_str)

    # Calculate total minutes by converting all units
    if hour_match:
        total_minutes += float(hour_match.group(1)) * 60
    if min_match:
        total_minutes += float(min_match.group(1))
    if sec_match:
        total_minutes += float(sec_match.group(1)) / 60

    # If total_minutes is 0, it means no valid duration was found. Return NaN.
    return total_minutes if total_minutes > 0 else np.nan

# Apply the function to create the new, unified column
df['duration_minutes'] = df['Duration'].apply(parse_duration_to_total_minutes)

# Drop the original 'Duration' column as it's now redundant
df = df.drop(columns=['Duration'])

print("Optimized processing of 'Duration' column complete.")
print("Created 'duration_minutes' by unifying hours, minutes, and seconds.")

# --- Verification Step ---
print("\nVerifying the new column by checking its statistics:")
print(df['duration_minutes'].describe())


Optimized processing of 'Duration' column complete.
Created 'duration_minutes' by unifying hours, minutes, and seconds.

Verifying the new column by checking its statistics:
count    24242.000000
mean        22.132022
std         27.610721
min          0.050000
25%          4.000000
50%         17.000000
75%         25.000000
max       1500.000000
Name: duration_minutes, dtype: float64


#### **4.3. Multi-Value Text Columns (`Genres`, `Producers`, `Studios`)**
Split comma-separated strings into lists of strings for easier processing later.

In [13]:
multi_value_cols = ['Genres', 'Producers', 'Studios']

for col in multi_value_cols:
    # We use a lambda function within .apply() for a concise operation.
    # 1. First, check if the value 'x' is a string using isinstance(x, str).
    # 2. If it is, split the string by comma: x.split(',')
    # 3. For each resulting part 's' in the list, remove leading/trailing whitespace: s.strip()
    # 4. If 'x' is not a string (i.e., it's NaN), keep it as np.nan.
    df[col] = df[col].apply(lambda x: [s.strip() for s in x.split(',')] if isinstance(x, str) else np.nan)

print("Multi-value columns ('Genres', 'Producers', 'Studios') converted to lists of strings.")

# --- Verification Step ---
print("\nData preview after processing complex columns:")
# Display the original 'Name' column alongside the newly structured columns for context
display(df[['Name', 'Genres', 'Producers', 'Studios']].head(10))



Multi-value columns ('Genres', 'Producers', 'Studios') converted to lists of strings.

Data preview after processing complex columns:


Unnamed: 0,Name,Genres,Producers,Studios
0,Cowboy Bebop,"[Action, Award Winning, Sci-Fi]",[Bandai Visual],[Sunrise]
1,Cowboy Bebop: Tengoku no Tobira,"[Action, Sci-Fi]","[Sunrise, Bandai Visual]",[Bones]
2,Trigun,"[Action, Adventure, Sci-Fi]",[Victor Entertainment],[Madhouse]
3,Witch Hunter Robin,"[Action, Drama, Mystery, Supernatural]","[Bandai Visual, Dentsu, Victor Entertainment, ...",[Sunrise]
4,Bouken Ou Beet,"[Adventure, Fantasy, Supernatural]","[TV Tokyo, Dentsu]",[Toei Animation]
5,Eyeshield 21,[Sports],"[TV Tokyo, Nihon Ad Systems, TV Tokyo Music, S...",[Gallop]
6,Hachimitsu to Clover,"[Comedy, Drama, Romance]","[Dentsu, Genco, Fuji TV, Asmik Ace, Shueisha]",[J.C.Staff]
7,Hungry Heart: Wild Striker,"[Comedy, Slice of Life, Sports]",,[Nippon Animation]
8,Initial D Fourth Stage,"[Action, Drama]","[OB Planning, Studio Jack]",[A.C.G.T.]
9,Monster,"[Drama, Mystery, Suspense]","[VAP, Shogakukan-Shueisha Productions, Nippon ...",[Madhouse]


### **5. Data Type Coercion**
Many numerical columns (`Score`, `Episodes`, etc.) are initially loaded as `object` data types because they contained non-numeric strings like 'UNKNOWN'. Now that those strings have been replaced with `np.nan`, we can safely convert these columns to a numeric format.

We will coerce them to `float`, which is a numeric type that can natively handle `NaN` values. This step is crucial for enabling mathematical operations, statistical analysis, and visualizations on these features. The `errors='coerce'` parameter ensures that any remaining non-numeric value that we might have missed will be turned into `NaN` instead of causing an error.

In [14]:
# List of columns that should be numeric
numerical_cols = ['Score', 'Episodes', 'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members']

print("Data types BEFORE coercion:")
print(df[numerical_cols].dtypes)

for col in numerical_cols:
    # Use pd.to_numeric with errors='coerce' to handle any non-convertible values gracefully
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("\nData types AFTER coercion:")
print(df[numerical_cols].dtypes)

# --- Verification Step ---
print("\nVerifying by checking the overall DataFrame info again:")
df.info()

Data types BEFORE coercion:
Score          object
Episodes       object
Rank           object
Popularity    float64
Favorites       int64
Scored By      object
Members         int64
dtype: object

Data types AFTER coercion:
Score         float64
Episodes      float64
Rank          float64
Popularity    float64
Favorites       int64
Scored By     float64
Members         int64
dtype: object

Verifying by checking the overall DataFrame info again:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   anime_id          24905 non-null  int64         
 1   Name              24905 non-null  object        
 2   English name      10328 non-null  object        
 3   Other name        24777 non-null  object        
 4   Score             15692 non-null  float64       
 5   Genres            19976 non-null  object        
 6   Synop

### **6. Drop Unnecessary Columns**

The final step in our pre-split cleaning process is to remove columns that are not useful for our analysis, are too sparse to be reliable, or are redundant. A clean, focused dataset is easier to work with and leads to more robust insights.

Based on our initial exploration and the project's goals, we will drop the following columns:

*   **`Licensors`**: This column is missing over 80% of its values, making it highly unreliable for any analysis or modeling.
*   **`Premiered`**: This column also has a very high rate of missing values (over 77%). The temporal information it provides (year and season) has already been extracted with higher reliability from the `Aired` column into our new `air_year` and `air_month` features. Therefore, it is now redundant.
*   **`English name`, `Other name`**: These are largely redundant, as the primary `Name` column serves as the main identifier for each anime.
*   **`Image URL`**: This column contains links to images and is not a feature that can be used in our current analytical scope.

*   **`Synopsis`**: While this column contains descriptive text, extracting meaningful features from it requires advanced Natural Language Processing (NLP) techniques that are beyond the scope of this project. To maintain focus on core data preparation and visualization tasks, this column will be removed.

Removing these columns results in a final, streamlined DataFrame that is ready to be saved for the next phases of analysis and modeling.

In [15]:
# Define the list of columns to be dropped
cols_to_drop = ['Licensors','Premiered', 'English name', 'Other name', 'Image URL','Synopsis']

print("DataFrame shape BEFORE dropping columns:", df.shape)
print("Columns to be dropped:", cols_to_drop)

# Drop the specified columns from the DataFrame
df = df.drop(columns=cols_to_drop)

print("\nDataFrame shape AFTER dropping columns:", df.shape)

# --- Verification Step ---
print("\nVerifying the final set of columns and DataFrame info:")
# The .info() summary will now show the reduced column count.
df.info()

print("\nFinal columns in the DataFrame:")
print(df.columns.tolist())

DataFrame shape BEFORE dropping columns: (24905, 27)
Columns to be dropped: ['Licensors', 'Premiered', 'English name', 'Other name', 'Image URL', 'Synopsis']

DataFrame shape AFTER dropping columns: (24905, 21)

Verifying the final set of columns and DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   anime_id          24905 non-null  int64         
 1   Name              24905 non-null  object        
 2   Score             15692 non-null  float64       
 3   Genres            19976 non-null  object        
 4   Type              24831 non-null  object        
 5   Episodes          24294 non-null  float64       
 6   Status            24905 non-null  object        
 7   Producers         11555 non-null  object        
 8   Studios           14379 non-null  object        
 9   Source            21216 n

### **7. Save Pre-Split Data**

This is the final step of our pre-split cleaning phase. We have successfully transformed the raw, messy data into a structured, clean, and consistent format.

We will now save this processed DataFrame to a new file named `prepared_data.csv`. This file will serve as the clean foundation for all subsequent tasks, including the data storytelling, visualization (Part 1), and any further analysis or machine learning modeling (Part 3).

By setting `index=False`, we prevent pandas from writing the DataFrame index as a new column in our CSV file, keeping the output clean.

In [16]:
df[['duration_minutes','Episodes']].value_counts()

duration_minutes  Episodes
2.0               1.0         1092
3.0               1.0         1072
4.0               1.0          943
24.0              12.0         587
23.0              12.0         572
                              ... 
10.0              260.0          1
                  247.0          1
                  240.0          1
                  237.0          1
1500.0            32.0           1
Name: count, Length: 2061, dtype: int64

In [17]:
# # Define the output filename
# output_filename = 'prepared_data.csv'

# # Save the cleaned DataFrame to a CSV file.
# # index=False ensures that the DataFrame index is not saved as a column.
# df.to_csv(output_filename, index=False)

### **8. Data Dictionary for `prepared_data.csv`**
This section serves as the official data dictionary for the output file, `prepared_data.csv`. It details each column's data type, a brief description of its content post-cleaning, and outlines the **required subsequent transformations** for the machine learning modeling phase. This dictionary acts as a clear handover document, ensuring the next phase understands the state of the data and the necessary model-specific preparation steps.

| Feature Name | Data Type | Description & State | Required Post-Split Preprocessing |
| :--- | :--- | :--- | :--- |
| **`anime_id`** | `int64` | Unique identifier for each anime. | Drop (Not a feature). |
| **`Name`** | `object` | The primary name of the anime. | Drop (Not a feature). |
| **`Score`** | `float64` | The average user score. Contains NaNs from 'UNKNOWN' values. | **Target Variable.** Rows with NaN will be dropped pre-split. |
| **`Genres`** | `object` (list) | A list of genres. Split from a string. May contain NaNs. | **Imputation** & **Encoding** (e.g., Multi-Label Binarization). |
| **`Type`** | `object` | The type of anime (e.g., 'tv', 'movie'). Cleaned and standardized. May contain NaNs. | **Imputation** & **Encoding** (e.g., One-Hot). |
| **`Episodes`** | `float64` | The number of episodes. Contains NaNs from 'UNKNOWN' values. | **Imputation** & **Scaling**. |
| **`Status`** | `object` | The airing status. Cleaned and standardized. | **Imputation** & **Encoding**. |
| **`Producers`** | `object` (list) | A list of production companies. Split from a string. May contain NaNs. | **Imputation** & **Encoding**. |
| **`Studios`** | `object` (list) | A list of animation studios. Split from a string. May contain NaNs. | **Imputation** & **Encoding**. |
| **`Source`** | `object` | The source material. Cleaned and standardized. May contain NaNs. | **Imputation** & **Encoding**. |
| **`Rating`** | `object` | The age rating. Cleaned and standardized. May contain NaNs. | **Imputation** & **Encoding**. |
| **`Rank`** | `float64` | The popularity rank. Contains NaNs from placeholder zeros and 'UNKNOWN's. | **Imputation** & **Scaling**. |
| **`Popularity`** | `float64` | The popularity score. Contains NaNs from placeholder zeros. | **Imputation** & **Scaling**. |
| **`Favorites`** | `int64` | Number of users who favorited the anime. No missing values. | **Scaling**. |
| **`Scored By`** | `float64` | The number of users who scored the anime. Contains NaNs. | **Imputation** & **Scaling**. |
| **`Members`** | `int64` | The number of members in the anime's group. No missing values. | **Scaling**. |
| **`air_date_start`** | `datetime64[ns]`| The start date of airing. Parsed from `Aired`. May be NaT. | Drop (Information captured in `air_year`). |
| **`air_date_end`** | `datetime64[ns]`| The end date of airing. Parsed from `Aired`. May be NaT. | Drop. |
| **`air_year`** | `float64` | The year airing started. Extracted from `Aired`. May contain NaNs. | **Imputation** & **Scaling**. |
| **`duration_minutes`**| `float64` | Episode duration in minutes. Parsed from `Duration`. May contain NaNs. | **Imputation** & **Scaling**. |

## **Part II : Post-Split Pipeline**
*Goal: Build a parameter-learning preprocessing pipeline (imputation, scaling, and other stateful steps) and package it into a Scikit-learn Pipeline object that will be exported for use during model training.*


In [18]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
import joblib

# 1. Splitting & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.base import BaseEstimator, TransformerMixin

# 2. Pipeline Construction
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 3. Preprocessing Steps
from sklearn.impute import SimpleImputer       # Data Cleaning
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder # Data Transform
from sklearn.feature_selection import SelectKBest, f_regression # Feature Selection
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import PowerTransformer
import ast

# 4. Model (Modeling)
from sklearn.linear_model import LinearRegression

# 5. Metrics (Metrics)
from sklearn.metrics import accuracy_score

from sklearn.impute import KNNImputer

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import plotly.express as px




ModuleNotFoundError: No module named 'lightgbm'

### **1. Drop Columns and Rows in Requirement**

In [None]:
path = r'prepared_data (1).csv'
df1 = pd.read_csv(path)

# Drop rows where target 'Score' is NaN, and Drop redundant column
print('Shape before drop Cols, Rows', df1.shape)
df1 = df1.dropna(subset=['Score'])
df1 = df1.drop(['Aired Date Start','Aired Date End','anime_id', 'Name'], axis= 1)
print('Shape after drop Cols, Rows', df1.shape)

# Example
df1.head(1)


Shape before drop Cols, Rows (24905, 21)
Shape after drop Cols, Rows (15692, 17)


Unnamed: 0,Score,Genres,Type,Episodes,Status,Producers,Studios,Source,Rating,Rank,Popularity,Favorites,Scored By,Members,Aired Year,Aired Month,Duration Minutes
0,8.75,"['Action', 'Award Winning', 'Sci-Fi']",TV,26.0,Finished Airing,['Bandai Visual'],['Sunrise'],Original,R - 17+ (violence & profanity),41.0,43.0,78525,914193.0,1771505,1998.0,4.0,24.0


### **2. Splitting Train and Test Set (80/20)**

In [None]:
# Split X (features) and y (target)
X = df1.drop('Score', axis=1)
y = df1['Score']

# Perform the data split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# --- Model Performance (Linear Regression) ---
# R² Score : 0.8580
# MAE      : 0.2096

X_train shape: (12553, 16)
X_test shape: (3139, 16)


### **3. Evaluating with Baseline Pipeline**

In [None]:
# 1. Handle Missing by Dropping all NaN rows
df_base = df1.copy()
df_base = df_base.dropna()

# Split X (features) and y (target)
X_base = df_base.drop('Score', axis=1)
y_base = df_base['Score']

# Perform the data split
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base, 
                                                    test_size=0.2, 
                                                    random_state=42)

print("Baseline shapes:")
print(f"X_train shape: {X_train_base.shape}")
print(f"X_test shape: {X_test_base.shape}")

# === Baseline Linear Regression (Ordinal Encoding) ===
# R²  : 0.9541
# MAE : 0.1002

Baseline shapes:
X_train shape: (4896, 16)
X_test shape: (1225, 16)


In [None]:
# 2. Ordinal encode for object features

object_cols = X_train_base.select_dtypes(include=['object']).columns.tolist()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit encoder on train, then transform
X_train_enc = X_train_base.copy()
X_test_enc  = X_test_base.copy()

X_train_enc[object_cols] = encoder.fit_transform(X_train_base[object_cols])
X_test_enc[object_cols]  = encoder.transform(X_test_base[object_cols])

# 3. Train Linear Regression baseline
model_base = LinearRegression()
model_base.fit(X_train_enc, y_train_base)

# Predict
y_pred_base = model_base.predict(X_test_enc)

# 4. Evaluating Metrics
r2_base = r2_score(y_test_base, y_pred_base)
mae_base = mean_absolute_error(y_test_base, y_pred_base)

print("\n=== Baseline Linear Regression (Ordinal Encoding) ===")
print(f"R²  : {r2_base:.4f}")
print(f"MAE : {mae_base:.4f}")




=== Baseline Linear Regression (Ordinal Encoding) ===
R²  : 0.9541
MAE : 0.1002


In [None]:
object_cols = X_train_base.select_dtypes(include=['object']).columns.tolist()
X_train_enc = X_train_base.drop(object_cols, axis = 1)
X_test_enc = X_test_base.drop(object_cols, axis = 1)

# 3. Train Linear Regression baseline
model_base = LinearRegression()
model_base.fit(X_train_enc, y_train_base)

# Predict
y_pred_base = model_base.predict(X_test_enc)

# 4. Evaluating Metrics
r2_base = r2_score(y_test_base, y_pred_base)
mae_base = mean_absolute_error(y_test_base, y_pred_base)

print("\n=== Baseline Linear Regression without category ===")
print(f"R²  : {r2_base:.4f}")
print(f"MAE : {mae_base:.4f}")



=== Baseline Linear Regression without category ===
R²  : 0.9542
MAE : 0.1003


### **4. Custom Transformers**

### **4.1. Custom Transformers for `Genres`, `Producers`, and `Studios`**

These three columns are **multi-label lists**, meaning each entry contains multiple values (e.g., multiple genres or multiple studios).  
Because scikit-learn’s default encoders cannot handle list-based features, we build **custom transformers** to:

- Normalize and clean list values  
- Group or replace rare labels based on column-specific frequency rules  
- Create consistent multi-label vocabularies  
- Convert lists into multi-hot encoded features  

In the **ColumnTransformer** later, these three columns will be handled through a **separate pipeline**, using the custom classes defined in this section.


In [None]:
class MultiListModeImputer(BaseEstimator, TransformerMixin):
    '''    
    Imputes list-like columns by replacing empty or invalid lists with the
    most frequent item (mode) found in that column.
    '''
    def __init__(self, columns):
        self.columns = columns
        self.modes_ = {}

    def _ensure_list(self, x):
        if isinstance(x, list):
            return x
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except:
                return []
        return []

    def fit(self, X, y=None):
        for col in self.columns:
            temp = X[col].apply(self._ensure_list)
            all_items = []
            for lst in temp:
                all_items.extend(lst)

            if len(all_items) == 0:
                self.modes_[col] = None
            else:
                self.modes_[col] = Counter(all_items).most_common(1)[0][0]

        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            mode_item = self.modes_[col]
            X[col] = X[col].apply(self._ensure_list)
            X[col] = X[col].apply(lambda lst: lst if len(lst) > 0 else [mode_item])
        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.columns)
    
class FrequencyGrouper(BaseEstimator, TransformerMixin):
    '''
    Genre/Producer/Studios with frequency ≥ min_freq → keep name.
    Other Genre/Producer/Studio with frequency < min_freq → group into "Other".
    Help reduce Dimension

    '''
    def __init__(self, columns, min_freq):
        """
        columns: list các cột dạng list
        min_freq: dict chứa min_freq riêng cho từng cột
        """
        self.columns = columns
        self.min_freq = min_freq
        self.frequent_items_ = {}  # lưu item phổ biến của từng cột

    def _ensure_list(self, x):
        """Ensure input is in list form"""
        if isinstance(x, list):
            return x
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except:
                return []
        return []

    def fit(self, X, y=None):
        for col in self.columns:
            min_f = self.min_freq.get(col, 10)  # nếu thiếu thì gán mặc định 10

            temp = X[col].apply(self._ensure_list)

            # Flatten
            all_items = []
            for lst in temp:
                all_items.extend(lst)

            counts = Counter(all_items)
            frequent = [k for k, v in counts.items() if v >= min_f]

            self.frequent_items_[col] = set(frequent)

        return self

    def transform(self, X):
        X = X.copy()

        for col in self.columns:
            freq_set = self.frequent_items_[col]

            X[col] = (
                X[col]
                .apply(self._ensure_list)
                .apply(lambda lst: [item if item in freq_set else "Other" 
                                    for item in lst])
            )

        return X
    
    def get_feature_names_out(self, input_features=None):
        return np.array(self.columns)

class MultiLabelBinarizerDF(BaseEstimator, TransformerMixin):
    '''
    A custom transformer for multi-label columns.
    Converts each list-like column into multiple binary features using
    MultiLabelBinarizer, with feature names formatted as <col>__<label>.
    Supports get_feature_names_out() for pipeline compatibility.
    '''
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
        self.output_features_ = []

    def fit(self, X, y=None):
        self.output_features_ = []
        for col in self.columns:
            mlb = MultiLabelBinarizer()
            mlb.fit(X[col])
            self.encoders[col] = mlb

            # lưu tên cột sinh ra
            for c in mlb.classes_:
                self.output_features_.append(f"{col}__{c}")

        return self

    def transform(self, X):
        X = X.copy()
        encoded_list = []

        for col in self.columns:
            mlb = self.encoders[col]
            arr = mlb.transform(X[col])

            df_enc = pd.DataFrame(
                arr,
                index=X.index,
                columns=[f"{col}__{c}" for c in mlb.classes_]
            )
            encoded_list.append(df_enc)

        other_cols = X.drop(columns=self.columns)

        return pd.concat([other_cols] + encoded_list, axis=1)

    def get_feature_names_out(self, input_features=None):
        return np.array(self.output_features_)


### **4.2. Custom Transformer for `Aired Month`**
The `Aired Month` column represents a **repeating cycle** (12 → 1), so encoding it as plain integers would wrongly imply that December and January are far apart.  
To preserve this cyclical structure, the `CyclicalMonthEncoder` converts each month into two continuous features (`month_sin`, `month_cos`) using sine–cosine mapping.

This encoding ensures that:

- Adjacent months remain close in feature space  
- The model learns the natural yearly cycle  
- No artificial jumps occur between 12 → 1  

In the preprocessing pipeline, this column is processed through the custom `CyclicalMonthEncoder` defined in this section.


In [None]:
class CyclicalMonthEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.generated_features_ = []

    def fit(self, X, y=None):
        # Tạo danh sách tên feature sẽ sinh ra
        self.generated_features_ = []
        for col in self.columns:
            self.generated_features_.append(f"{col}_sin")
            self.generated_features_.append(f"{col}_cos")
        return self

    def transform(self, X):
        X = X.copy()

        for col in self.columns:
            # Fill missing
            X[col] = X[col].fillna(X[col].median()).astype(float)

            # Add sin/cos features
            X[col + "_sin"] = np.sin(2 * np.pi * X[col] / 12)
            X[col + "_cos"] = np.cos(2 * np.pi * X[col] / 12)

            X = X.drop(columns=[col])

        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.generated_features_)


### **4.3. New Features**

To enhance the model’s ability to capture deeper patterns within the anime dataset, we introduce several engineered features that provide additional structural, behavioral, and interaction-based insights beyond the original columns.

These new features include:

- **Favorites–Members Ratio** (`Fav_Mem_Ratio`):  
  A proxy for engagement intensity, indicating how actively fans support a title relative to its overall audience size.

- **Polynomial Transformations of Aired Year** (`Aired Year_deg2`, ...):  
  Helps the model learn non-linear trends over time, such as changes in production quality or popularity across different eras.

- **Cross-Feature Interactions** (e.g., `Episodes_x_Duration Minutes`, `Members_x_Favorites`):  
  Capture combined effects between related attributes that may influence the final *Score* more strongly when considered together.

- **List-Based Count Features** (`Genres_Count`, `Producers_Count`, `Studios_Count`):  
  Quantify the complexity of each anime title by counting how many genres, studios, or producers are associated with it.

These engineered features are generated through the custom `FeatureEngineering` transformer and appended to the dataset before the preprocessing pipeline. 


In [None]:
def create_ratio_fav_mem(df, fav_col='Favorites', mem_col='Members'):
    df = df.copy()
    df['Fav_Mem_Ratio'] = df[fav_col] / (df[mem_col] + 1e-6)
    return df[['Fav_Mem_Ratio']]


def create_polynomial_year(df, col='Aired Year', degree=2):
    df = df.copy()
    out = {}
    for d in range(2, degree + 1):
        out[f'{col}_deg{d}'] = df[col] ** d
    return pd.DataFrame(out, index=df.index)


def create_interactions(df, pairs=[
    ('Episodes', 'Duration Minutes'),
    ('Rank', 'Popularity'),
    ('Members', 'Favorites')
]):
    df = df.copy()
    out = {}
    for a, b in pairs:
        out[f'{a}_x_{b}'] = df[a] * df[b]
    return pd.DataFrame(out, index=df.index)


def create_list_counts(df, columns):
    df = df.copy()
    out = {}
    for col in columns:
        out[col + "_Count"] = df[col].apply(lambda x: len(x) if isinstance(x, list) else 0)
    return pd.DataFrame(out, index=df.index)


class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 fav_col='Favorites', 
                 mem_col='Members',
                 year_col='Aired Year',
                 degree=2,
                 list_columns=['Genres', 'Producers', 'Studios'],
                 interaction_pairs=[
                     ('Episodes', 'Duration Minutes'),
                     ('Rank', 'Popularity'),
                     ('Members', 'Favorites')
                 ]):
        
        self.fav_col = fav_col
        self.mem_col = mem_col
        self.year_col = year_col
        self.degree = degree
        self.list_columns = list_columns
        self.interaction_pairs = interaction_pairs

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # --- Create new features ---
        df_ratio = create_ratio_fav_mem(X, self.fav_col, self.mem_col) # Ratio of Favorite/ Member
        df_poly  = create_polynomial_year(X, self.year_col, self.degree) # Polynomial for Aired Year (degree 2)
        df_list  = create_list_counts(X, self.list_columns) # Count number of Genre, Producers, Stuiods
        df_inter = create_interactions(X, self.interaction_pairs) # Cross Feature Interactions

        # Combine all new features
        X_new = pd.concat([X, df_ratio, df_poly, df_list, df_inter], axis=1)

        return X_new


In [None]:
# code chạy thử riêng feature engineer nếu muốn
feature_engineering = FeatureEngineering()
X_train_fe = feature_engineering.fit_transform(X_train)

print("New FE shape:", X_train_fe.shape)
X_train_fe.head(1)



New FE shape: (12553, 24)


Unnamed: 0,Genres,Type,Episodes,Status,Producers,Studios,Source,Rating,Rank,Popularity,Favorites,Scored By,Members,Aired Year,Aired Month,Duration Minutes,Fav_Mem_Ratio,Aired Year_deg2,Genres_Count,Producers_Count,Studios_Count,Episodes_x_Duration Minutes,Rank_x_Popularity,Members_x_Favorites
11511,,Music,1.0,Finished Airing,['Lastrum Music'],,Music,G - All Ages,,16739.0,0,165.0,338,2014.0,9.0,3.0,0.0,4056196.0,0,0,0,3.0,,0


### **5. Pipeline**

### **5.1. Sub-Pipeline**

### **5.1.1. Feature Engineering Pipeline**

In [None]:
feature_engineering = FeatureEngineering()

### **5.1.2. Preprocessor Pipeline**

In [None]:
numeric_features = ['Episodes', 'Rank','Popularity','Favorites','Scored By','Members','Duration Minutes','Aired Year',
                    'Fav_Mem_Ratio','Episodes_x_Duration Minutes','Aired Year_deg2', 'Members_x_Favorites','Rank_x_Popularity',
                    'Genres_Count','Producers_Count','Studios_Count']

nominal_features = ['Type','Status','Source'] 
rating_feature = ['Rating']
multi_label_features = ['Genres','Producers', 'Studios']
cyclidal_features = ['Aired Month']


# 1. Create a sub-pipeline for NUMERICAL data
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', RobustScaler())
])


# 2. Create a sub-pipeline for Multi-label data
multi_label_pipeline = Pipeline(steps=[
    ('imputer', MultiListModeImputer(columns=multi_label_features)),
    ('grouper', FrequencyGrouper(columns=multi_label_features, min_freq={"Genres": 25, "Producers": 60, "Studios": 60})),
    ('multi_encoder', MultiLabelBinarizerDF(columns=multi_label_features))
])

# 3. Create a sub-pipeline for cyclidal data: Aired Month
aired_month_pipeline = Pipeline(steps=[
    ('cyclical', CyclicalMonthEncoder(cyclidal_features))
])


# 4. Create sub-pipeline for nominal category 
nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 5. create sub-pipeline for Rating (ordinal feature)
rating_order = ["G - All Ages","PG - Children", "PG-13 - Teens 13 or older", 
                "R - 17+ (violence & profanity)", "R+ - Mild Nudity", "Rx - Hentai" ]

rating_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[rating_order], handle_unknown='use_encoded_value', unknown_value=-1))
])


# 6. Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # 1. Numerical columns (median impute + robust scaling)
        ('num', numeric_pipeline, numeric_features),

        # 2. Nominal categorical columns (mode impute + OneHotEncoder)
        ('nominal', nominal_pipeline, nominal_features),

        # 3. Multi-label columns (list-impute → frequency grouping → multi-hot encoding)
        ('multi_label', multi_label_pipeline, multi_label_features),

        # 4. Cyclical encoded column (Aired Month → sin/cos)
        ('cyclical', aired_month_pipeline, cyclidal_features),

        # 5. Ordinal encoding for Rating
        ('rating', rating_pipeline, rating_feature)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False   
)


### **5.2. Full Pipeline**

In [None]:
full_pipeline = Pipeline([
    ('feature_engineering', feature_engineering),
    ('preprocessor', preprocessor),
    # ('selector', SelectKBest(score_func=f_regression, k=50)),
    ('model', LinearRegression())
])

print("Full regression pipeline created.")

full_pipeline.fit(X_train, y_train)


Full regression pipeline created.


0,1,2
,steps,"[('feature_engineering', ...), ('preprocessor', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,fav_col,'Favorites'
,mem_col,'Members'
,year_col,'Aired Year'
,degree,2
,list_columns,"['Genres', 'Producers', ...]"
,interaction_pairs,"[('Episodes', ...), ('Rank', ...), ...]"

0,1,2
,transformers,"[('num', ...), ('nominal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,columns,"['Genres', 'Producers', ...]"

0,1,2
,columns,"['Genres', 'Producers', ...]"
,min_freq,"{'Genres': 25, 'Producers': 60, 'Studios': 60}"

0,1,2
,columns,"['Genres', 'Producers', ...]"

0,1,2
,columns,['Aired Month']

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['G - All Ages', 'PG - Children', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
# Predict and Evaluate
y_pred = full_pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n--- Model Performance (Linear Regression) ---")
print(f"R² Score : {r2:.4f}")
print(f"MAE      : {mae:.4f}")



--- Model Performance (Linear Regression) ---
R² Score : 0.9035
MAE      : 0.1857


### **5. Evaluating the Pipeline with Cross-Validation**

### **6. Save Pipeline**

In [None]:
print("--- STARTING TRAINING AND SAVING MODEL ---")

# 1. Train the pipeline on the ENTIRE X_train, y_train
full_pipeline.fit(X_train, y_train)
print("Pipeline has been trained on X_train, y_train.")

# 2. Final evaluation on the Test set (unseen data)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R² Score : {r2:.4f}")
print(f"MAE      : {mae:.4f}")

# 3. Save the pipeline
# Use joblib.dump to 'freeze' the entire pipeline (including imputer, scaler, model...)
model_filename = 'final_model_pipeline.joblib'
joblib.dump(full_pipeline, model_filename)
print(f"Pipeline saved to file: {model_filename}")

--- STARTING TRAINING AND SAVING MODEL ---
Pipeline has been trained on X_train, y_train.
R² Score : 0.8584
MAE      : 0.2094
Pipeline saved to file: final_model_pipeline.joblib


### **7. Feature Importance (base on coefficient of LinearRegrssor)**

In [None]:
# Checking the shape
X_train_pre = preprocessor.transform(X_train)
X_test_pre = preprocessor.transform(X_test)

print("Original X_train shape:", X_train.shape)
print("Transformed X_train shape:", X_train_pre.shape)

print("\nOriginal X_test shape:", X_test.shape)
print("Transformed X_test shape:", X_test_pre.shape)



Original X_train shape: (12553, 16)
Transformed X_train shape: (12553, 168)

Original X_test shape: (3139, 16)
Transformed X_test shape: (3139, 168)


In [None]:
# ---- GET FEATURE NAMES ----
try:
    feature_names = preprocessor.get_feature_names_out()
except:
    feature_names = np.array([f"feat_{i}" for i in range(X_train_pre.shape[1])])

# ---- LINEAR REG COEFFICIENTS ----
coefs = full_pipeline.named_steps["model"].coef_

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs,
})

coef_df["abs_coef"] = coef_df["coef"].abs()
coef_df_sorted = coef_df.sort_values("abs_coef", ascending=False)

# Lấy top 20 features
top_20 = coef_df_sorted.head(20)

# Vẽ biểu đồ bằng Plotly
fig = px.bar(
    top_20,
    x='abs_coef',
    y='feature',
    orientation='h',
    title='Top 20 Most Important Features (Linear Regression Coefficients)',
    labels={'abs_coef': 'Absolute Coefficient Value', 'feature': 'Features'},
    color='abs_coef',
    color_continuous_scale='viridis'
)

# Tùy chỉnh layout
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    showlegend=False,
    height=600,
    width=800
)

# Hiển thị biểu đồ
fig.show()

In [None]:

# ---- PHÂN LOẠI FEATURES VÀO NHÓM GỐC ----
original_features = ['Score', 'Genres', 'Type', 'Episodes', 'Status', 'Producers', 
                     'Studios', 'Source', 'Rating', 'Rank', 'Popularity', 'Favorites', 
                     'Scored By', 'Members', 'Aired Year', 'Aired Month', 'Duration Minutes']

# Tạo mapping từ feature đã xử lý về feature gốc
feature_to_original = {}

for feature in feature_names:
    feature_lower = feature.lower()
    assigned = False
    
    # Kiểm tra từng feature gốc
    for orig_feat in original_features:
        orig_lower = orig_feat.lower()
        
        # Nếu feature chứa tên feature gốc (có thể có tiền tố hậu tố từ preprocessor)
        if orig_lower in feature_lower:
            feature_to_original[feature] = orig_feat
            assigned = True
            break
    
    # Nếu không tìm thấy mapping, gán vào nhóm "unknown"
    if not assigned:
        feature_to_original[feature] = "unknown"

# Thêm thông tin feature gốc vào DataFrame
coef_df["original_feature"] = coef_df["feature"].map(feature_to_original)

# ---- PHÂN TÍCH THEO NHÓM ----
# 1. Tổng hợp importance theo nhóm feature gốc
group_importance = coef_df.groupby("original_feature")["abs_coef"].sum().sort_values(ascending=False)
group_importance_df = group_importance.reset_index()
group_importance_df.columns = ["original_feature", "total_importance"]

# 2. Top features trong mỗi nhóm
top_features_by_group = coef_df.sort_values(["original_feature", "abs_coef"], ascending=[True, False])
top_features_by_group = top_features_by_group.groupby("original_feature").head(3)

# ---- VẼ BIỂU ĐỒ ----
# Biểu đồ 1: Tổng importance theo nhóm
fig1 = px.bar(
    group_importance_df.head(10),
    x='total_importance',
    y='original_feature',
    orientation='h',
    title='Top 10 Most Influential Feature Groups (Total Absolute Coefficients)',
    labels={'total_importance': 'Total Absolute Coefficient', 'original_feature': 'Feature Groups'},
    color='total_importance',
    color_continuous_scale='viridis'
)

fig1.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    showlegend=False,
    height=500,
    width=800
)

fig1.show()


