In [72]:
import pandas as pd
import numpy as np
import re
import calendar
import kagglehub
import warnings
import joblib
import sys
import os
from collections import Counter
import ast

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, OrdinalEncoder, PowerTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Custom imports
project_root = os.path.abspath("../")
sys.path.append(project_root)
from src.Custom_Transformer import *

# Pandas display options for better viewing
pd.set_option('display.max_columns', 50)

# Warning configuration
warnings.filterwarnings("ignore", category=FutureWarning)

## **Introduction**

This notebook serves as the central hub for our entire data preprocessing workflow. The process is strategically divided into two distinct phases to ensure data integrity and prevent data leakage, following best practices in machine learning.

*   **Part I: Pre-Split Cleaning (Execution):** This part involves executing "stateless" cleaning operations that do not depend on the overall dataset statistics (e.g., fixing data types, standardizing text formats, restructuring columns). The output of this part is a structurally sound file (`prepared_data.csv`) intended for human-readable Exploratory Data Analysis (EDA).

*   **Part II: Post-Split Pipeline (Definition):** This part involves **defining** the "stateful" transformations that learn parameters from the data (e.g., Imputation, Scaling). These steps are encapsulated into a Scikit-learn `Pipeline`. This pipeline object is the final deliverable of this notebook, ready to be used for model training in the next phase.

## **Part I : Pre-Split Cleaning**
*Goal: Transform the raw data (`anime-dataset-2023.csv`) into a clean, structurally correct dataset (`prepared_data.csv`).*


### **1. Load Data**

In [73]:
# path = kagglehub.dataset_download("dbdmobile/myanimelist-dataset")
# csv_file_path = f"{path}/anime-dataset-2023.csv"
csv_file_path = r'..\data\raw\anime-dataset-2023.csv'
df = pd.read_csv(csv_file_path)

print(f"Dataset loaded successfully from: {csv_file_path}")
print(f"Initial shape: {df.shape[0]} rows, {df.shape[1]} columns")

# --- Initial Data Inspection ---
print("\nDataFrame Info:")
df.info()

print("\nFirst 5 rows of the raw data:")
display(df.head())

Dataset loaded successfully from: ..\data\raw\anime-dataset-2023.csv
Initial shape: 24905 rows, 24 columns

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   anime_id      24905 non-null  int64 
 1   Name          24905 non-null  object
 2   English name  24905 non-null  object
 3   Other name    24905 non-null  object
 4   Score         24905 non-null  object
 5   Genres        24905 non-null  object
 6   Synopsis      24905 non-null  object
 7   Type          24905 non-null  object
 8   Episodes      24905 non-null  object
 9   Aired         24905 non-null  object
 10  Premiered     24905 non-null  object
 11  Status        24905 non-null  object
 12  Producers     24905 non-null  object
 13  Licensors     24905 non-null  object
 14  Studios       24905 non-null  object
 15  Source        24905 non-null  object
 16  Durati

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,Premiered,Status,Producers,Licensors,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",spring 1998,Finished Airing,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",UNKNOWN,Finished Airing,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",spring 1998,Finished Airing,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",summer 2002,Finished Airing,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",fall 2004,Finished Airing,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...


### **2. Standardize "NaN-like" Values**

Many object columns use string literals to represent missing data. We'll replace them with np.nan.

In [74]:
# Comprehensive list of values to be treated as NaN, identified in Diagnostic EDA
nan_like_values = [
    'unknown', 'not available', 'n/a', 'na', 'tbd', 'tba', '---',
    '', ' ', '-', '?', '.',  # Single punctuation/empty tokens
    'https://cdn.myanimelist.net/img/sp/icon/apple-touch-icon-256.png' # Specific placeholder image
]

# Store initial null counts for comparison
initial_nulls = df.isnull().sum()

# Iterate through object columns and replace NaN-like values
for col in df.select_dtypes(include=['object']).columns:
    # Check for exact matches (case-insensitive)
    mask = df[col].str.strip().str.lower().isin(nan_like_values)
    
    # Replace with np.nan using .loc
    df.loc[mask.fillna(False), col] = np.nan

print("Standardization of 'NaN-like' values complete.")

# --- Verification ---
# Check if specific columns like 'Studios' or 'Licensors' have increased NaN counts
final_nulls = df.isnull().sum()
diff = final_nulls - initial_nulls
print("\nIncrease in NaN values per column:")
print(diff[diff > 0].sort_values(ascending=False).head())

Standardization of 'NaN-like' values complete.

Increase in NaN values per column:
Licensors       20170
Premiered       19399
English name    14577
Producers       13350
Studios         10526
dtype: int64


### **3. Handle Placeholder Zeros**
The values `0` in `Rank` and `Popularity` don't represent a true zero value but rather a missing or unassigned one. These should be converted to `np.nan`.

In [75]:
cols_with_placeholder_zeros = ['Rank', 'Popularity']
df[cols_with_placeholder_zeros] = df[cols_with_placeholder_zeros].replace(0, np.nan)
print("Placeholder zeros in 'Rank' and 'Popularity' replaced with np.nan.")

# --- Check after Handling Zeros ---
print("\nMissing values count for Rank and Popularity:")
print(df[['Rank', 'Popularity']].isnull().sum())

Placeholder zeros in 'Rank' and 'Popularity' replaced with np.nan.

Missing values count for Rank and Popularity:
Rank          4612
Popularity     187
dtype: int64


### **4. Process Complex & Temporal Columns**


#### **4.1. `Aired` Column**
We'll parse the `Aired` string to extract start and end dates into proper `datetime` columns.

In [76]:
def parse_aired_string_ultimate(aired_str):
    start_date, end_date, start_year, start_month = pd.NaT, pd.NaT, np.nan, np.nan
    if not isinstance(aired_str, str):
        return start_date, end_date, start_year, start_month

    month_map = {name.lower(): i for i, name in enumerate(calendar.month_abbr) if i > 0}
    
    def parse_part(part_str, is_end_date=False):
        date_val, year_val, month_val = pd.NaT, np.nan, np.nan
        try:
            part_str_lower = part_str.lower()
            year_match = re.search(r'\b(\d{4})\b', part_str_lower)
            if year_match: year_val = int(year_match.group(1))

            found_month_name = None
            for month_name, month_num in month_map.items():
                if month_name in part_str_lower:
                    month_val = month_num
                    found_month_name = month_name
                    break
            
            day_match = re.search(r'\b(\d{1,2})\b', part_str_lower)
            day_present = bool(day_match) and not (day_match.group(1) == str(year_val) and found_month_name is None)

            if year_val and month_val and day_present:
                date_val = pd.to_datetime(part_str, errors='coerce')
            elif is_end_date and year_val and month_val:
                _, last_day = calendar.monthrange(int(year_val), int(month_val))
                date_val = pd.to_datetime(f"{int(year_val)}-{int(month_val)}-{last_day}", errors='coerce')
        
        except (ValueError, TypeError): pass
        return date_val, year_val, month_val

    parts = aired_str.strip().split(' to ')
    if len(parts) >= 1: start_date, start_year, start_month = parse_part(parts[0], is_end_date=False)
    if len(parts) == 2: end_date, _, _ = parse_part(parts[1], is_end_date=True)
    return start_date, end_date, start_year, start_month

parsed_info = df['Aired'].apply(parse_aired_string_ultimate)
df[['Aired Date Start', 'Aired Date End', 'Aired Year', 'Aired Month']] = pd.DataFrame(parsed_info.tolist(), index=df.index)
df = df.drop(columns=['Aired'])
print("Ultimate parsing of 'Aired' column complete.")
# --- Verification Step ---
print("\nVerifying the new logic by checking data types and null counts:")
print(df[['Aired Date Start', 'Aired Date End', 'Aired Year', 'Aired Month']].info())

Ultimate parsing of 'Aired' column complete.

Verifying the new logic by checking data types and null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Aired Date Start  20090 non-null  datetime64[ns]
 1   Aired Date End    9491 non-null   datetime64[ns]
 2   Aired Year        23990 non-null  float64       
 3   Aired Month       20750 non-null  float64       
dtypes: datetime64[ns](2), float64(2)
memory usage: 778.4 KB
None


#### **4.2. `Duration` Column**
##### **4.2.1** 
Convert the duration string (e.g., "24 min. per ep.") into a numerical column representing total minutes.

In [77]:
def parse_duration_to_total_minutes(duration_str):
    """
    Parses a string from the 'Duration' column into a single numerical value
    representing the total duration in minutes. It handles hours, minutes, and seconds.
    """
    if pd.isna(duration_str):
        return np.nan
    
    # Use float for total_minutes to handle fractional values from seconds
    total_minutes = 0.0
    duration_str = str(duration_str).lower()
    
    # Regex to find numbers associated with 'hr', 'min', and 'sec'
    hour_match = re.search(r'(\d+)\s*hr', duration_str)
    min_match = re.search(r'(\d+)\s*min', duration_str)
    sec_match = re.search(r'(\d+)\s*sec', duration_str)
    
    # Calculate total minutes by converting all units
    if hour_match:
        total_minutes += float(hour_match.group(1)) * 60
    if min_match:
        total_minutes += float(min_match.group(1))
    if sec_match:
        total_minutes += float(sec_match.group(1)) / 60
        
    # If total_minutes is 0, it means no valid duration was found. Return NaN.
    return total_minutes if total_minutes > 0 else np.nan

df['Duration Minutes'] = df['Duration'].apply(parse_duration_to_total_minutes)
df = df.drop(columns=['Duration'])
print("Processing of 'Duration' column complete.")

# --- Verification Step ---
print("\nVerifying the new column by checking its statistics:")
print(df['Duration Minutes'].describe())



Processing of 'Duration' column complete.

Verifying the new column by checking its statistics:
count    24242.000000
mean        22.132022
std         27.610721
min          0.050000
25%          4.000000
50%         17.000000
75%         25.000000
max       1500.000000
Name: Duration Minutes, dtype: float64


##### **4.2.2 Post-Conversion Sanity Check: Detect Hidden Outliers**

Now that we have converted `Duration` from a complex string to a numerical format (`Duration Minutes`), we can perform a statistical check to ensure the data makes sense.

**Logic:**
Feature Engineering often reveals data quality issues that were hidden in text formats. For example, a typo like "1500 min" instead of "25 min" is hard to spot in a string column but becomes obvious in numerical statistics (max value).

In [78]:
# 1. Statistical Summary
print("Statistics for 'Duration Minutes':")
display(df['Duration Minutes'].describe())

# 2. Investigate Extreme Values
# Let's look at anime with unusually long duration (> 200 minutes) that are NOT Movies
potential_errors = df[
    (df['Duration Minutes'] > 200) &
    (df['Type'] != 'Movie')
]

print(f"\nPotential errors found: {len(potential_errors)}")
display(potential_errors[['Name', 'Type', 'Duration Minutes', 'Episodes']])

# 3. Quick Visualization to confirm the Outlier
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(
    go.Box(
        x=df['Duration Minutes'],
        boxpoints='outliers',
        marker=dict(size=4),
        line=dict(width=1.5)
    )
)

fig.update_layout(
    title=dict(
        text="<b>Diagnostic Plot: Duration Minutes (Revealing Outliers)</b>",
        x=0.5,
        xanchor="center",
        font=dict(size=20)
    ),
    height=250,
    margin=dict(l=20, r=20, t=60, b=20),
    showlegend=False,
)

fig.update_xaxes(title="Duration Minutes")

fig.show()



Statistics for 'Duration Minutes':


count    24242.000000
mean        22.132022
std         27.610721
min          0.050000
25%          4.000000
50%         17.000000
75%         25.000000
max       1500.000000
Name: Duration Minutes, dtype: float64


Potential errors found: 2


Unnamed: 0,Name,Type,Duration Minutes,Episodes
24501,Beyblade Burst QuadStrike,ONA,1351.0,
24801,Castlevania (Netflix animated series),TV,1500.0,32.0


#####  **4.4. Hard Cleaning Rule: Removing Impossible Durations**

**Diagnostic Finding:**
We identified a **TV Series** with a duration of **1500 minutes (25 hours) per episode** and **ONA(1351 minutes)**. This is physically impossible for a standard TV anime and is clearly a data entry error (likely total duration mistaken for per-episode duration).

**Action:**
To prevent this outlier from skewing our scaling and imputation steps later, we will replace this specific value with `NaN`. The Pipeline's Imputer will later fill it.


In [79]:
# Define threshold: Any episode longer than 200 minutes 
# for Non-Movie types is considered a data entry error (likely Total Duration).
# Note: Movies can be long, so we exclude them from this check.

mask_invalid_duration = (
    (df['Type'].isin(['TV', 'ONA', 'OVA', 'Special'])) & 
    (df['Duration Minutes'] > 200)
)

# Reporting before removal
print(f"Detected {mask_invalid_duration.sum()} entries with impossible episode duration:")
display(df.loc[mask_invalid_duration, ['Name', 'Type', 'Episodes', 'Duration Minutes']])

# Action: Replace with NaN (to be handled by Imputer later)
df.loc[mask_invalid_duration, 'Duration Minutes'] = np.nan

print("-" * 30)
print(f"Cleaning complete. Max duration for Series/ONA/OVA is now: {df.loc[~df['Type'].eq('Movie'), 'Duration Minutes'].max()} minutes.")

Detected 2 entries with impossible episode duration:


Unnamed: 0,Name,Type,Episodes,Duration Minutes
24501,Beyblade Burst QuadStrike,ONA,,1351.0
24801,Castlevania (Netflix animated series),TV,32.0,1500.0


------------------------------
Cleaning complete. Max duration for Series/ONA/OVA is now: 161.0 minutes.


#### **4.3. Multi-Value Columns (`Genres`, `Producers`, `Studios`)**
These columns contain comma-separated strings. Before splitting them into lists, we perform **Name Normalization** to fix specific typos and inconsistent naming (e.g., merging "Sunrise Inc." into "Sunrise") as identified in the Fuzzy Matching phase of the EDA. Then, we convert them into structured lists.

In [80]:
multi_value_cols = ['Genres', 'Producers', 'Studios']

# 1. Correction Dictionary (Based on EDA Fuzzy Matching Findings)
# Format: { 'Bad Name': 'Correct Name' }
name_corrections = {
    'Congzhuo Animation': 'Chongzhuo',
}

# 2. Apply Corrections and Split
for col in multi_value_cols:
    # Apply string replacement for known typos across the whole column string first
    for bad, good in name_corrections.items():
        if pd.notna(good):
            # Regex replace to ensure we match full words/segments if needed
            df[col] = df[col].str.replace(bad, good, regex=False)
    
    # Split string into list
    df[col] = df[col].apply(lambda x: [s.strip() for s in x.split(',')] if isinstance(x, str) else np.nan)

print("Processing of multi-value columns complete (Normalization + Splitting).")

# --- Verification ---
# Check if 'Congzhuo Animation' (bad name) still exists in Studios
# We explode the list to check individual elements
if 'Studios' in df.columns:
    all_studios = df['Studios'].explode().unique()
    print(f"\nIs 'Congzhuo Animation' present? {'Congzhuo Animation' in all_studios}")
    print(f"Is 'Chongzhuo' present? {'Chongzhuo' in all_studios}")

Processing of multi-value columns complete (Normalization + Splitting).

Is 'Congzhuo Animation' present? False
Is 'Chongzhuo' present? True


### **5. Data Type Coercion & Logic Validation**

In this section, we ensure the numerical integrity and logical consistency of our data.
1.  **Type Coercion:** Convert numerical strings to `float`.
2.  **Sanity Checks (Logic Fixes):** Fix or drop data that violates business rules (e.g., Movies cannot have > 1 episode, End Date cannot be before Start Date), as highlighted in the Diagnostic EDA.
3.  **Advanced Business Logic:** Reclassifying data based on domain knowledge (e.g., fixing 'Movie' types that are too short, correcting ratings based on genres).

In [81]:
# 5.1. DATA TYPE COERCION
# Convert numerical columns stored as objects to float
numerical_cols = ['Score', 'Episodes', 'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members']
for col in numerical_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
print("5.1. Data type coercion complete.\n")


# 5.2. SANITY CHECKS (BASIC LOGIC)
print("--- 5.2. Starting Sanity Checks (Fixing Impossible Values) ---")

# Rule 1: Negative Values -> NaN
cols_must_be_positive = ['Score', 'Episodes', 'Duration Minutes', 'Aired Year', 'Aired Month']
for col in cols_must_be_positive:
    mask_neg = df[col] < 0
    if mask_neg.sum() > 0:
        print(f"  [Fix] Found {mask_neg.sum()} negative values in '{col}'. Replacing with NaN.")
        df.loc[mask_neg, col] = np.nan

# Rule 2: 'Finished Airing' but Episodes is 0 or NaN -> Likely missing data
mask_finished = df['Status'].astype(str).str.lower().str.strip() == 'finished airing'
mask_zero_eps = (df['Episodes'] == 0) | (df['Episodes'].isna())
mask_finished_zero_eps = mask_finished & mask_zero_eps

if mask_finished_zero_eps.sum() > 0:
    print(f"  [Fix] Found {mask_finished_zero_eps.sum()} 'Finished' anime with 0/NaN episodes. Setting Episodes to NaN.")
    df.loc[mask_finished_zero_eps, 'Episodes'] = np.nan

# Rule 3: Type 'Movie' must have exactly 1 Episode
mask_movie = df['Type'].astype(str).str.lower().str.strip() == 'movie'
mask_movie_multi_eps = mask_movie & (df['Episodes'] > 1)

if mask_movie_multi_eps.sum() > 0:
    print(f"  [Fix] Found {mask_movie_multi_eps.sum()} Movies with >1 episodes. Forcing Episodes to 1.")
    df.loc[mask_movie_multi_eps, 'Episodes'] = 1.0

# Rule 4: Date Paradox (End Date < Start Date)
date_paradox_mask = (df['Aired Date End'].notna()) & \
                    (df['Aired Date Start'].notna()) & \
                    (df['Aired Date End'] < df['Aired Date Start'])
if date_paradox_mask.sum() > 0:
    print(f"  [Fix] Found {date_paradox_mask.sum()} rows with Date Paradox (End < Start). Setting End Date to NaT.")
    df.loc[date_paradox_mask, 'Aired Date End'] = pd.NaT

# Rule 5: Ghost Scoring (Score exists but Scored By is 0 or NaN)
mask_ghost_score = (df['Score'].notna()) & ((df['Scored By'] == 0) | (df['Scored By'].isna()))
if mask_ghost_score.sum() > 0:
    print(f"  [Fix] Found {mask_ghost_score.sum()} rows with Score but 0/NaN voters. Setting Score to NaN.")
    df.loc[mask_ghost_score, 'Score'] = np.nan


# 5.3. ADVANCED BUSINESS LOGIC & RECLASSIFICATION

print("\n--- 5.3. Starting Advanced Logic Fixes (Domain Knowledge) ---")

# 1. Fix Logic: Rating vs Genres Conflict
# Problem: Rated 'G'/'PG' but contains 'Hentai' or 'Ecchi'.
light_ratings = ['g - all ages', 'pg - children', 'pg-13 - teens 13 or older']
adult_genres = {'Hentai', 'Ecchi', 'Erotica'}

def fix_rating_logic(row):
    current_rating = str(row['Rating']).lower().strip()
    # Genres is now a list, so we convert to set for intersection check
    current_genres = set(row['Genres']) if isinstance(row['Genres'], list) else set()
    
    has_adult_content = not current_genres.isdisjoint(adult_genres)
    
    if (current_rating in light_ratings) and has_adult_content:
        if 'Hentai' in current_genres:
            return 'rx - hentai'
        else:
            return 'r+ - mild nudity'
    return row['Rating']

original_ratings = df['Rating'].copy()
df['Rating'] = df.apply(fix_rating_logic, axis=1)
diff_rating = (df['Rating'] != original_ratings).sum()
if diff_rating > 0:
    print(f"  [Logic Fix] Updated {diff_rating} rows where Rating conflicted with Adult Genres.")

# 2. Fix Logic: Status vs Aired Date
# Problem: 'Not yet aired' but Start Date is way in the past.
def fix_status_logic(row):
    status = str(row['Status']).lower().strip()
    start_year = row['Aired Year']
    has_score = pd.notna(row['Score'])
    
    # If marked 'not yet aired' but has a score OR aired before 2023
    if status == 'not yet aired':
        if has_score or (pd.notna(start_year) and start_year < 2023):
            return 'finished airing'
    return row['Status']

original_status = df['Status'].copy()
df['Status'] = df.apply(fix_status_logic, axis=1)
diff_status = (df['Status'] != original_status).sum()
if diff_status > 0:
    print(f"  [Logic Fix] Updated {diff_status} rows with 'Not yet aired' status but valid Score/Past Date.")

# 3. Fix Logic: Type Reclassification based on Duration
def fix_type_by_duration(row):
    anime_type = str(row['Type']).lower().strip()
    duration = row['Duration Minutes']
    
    if pd.isna(duration) or duration == 0:
        return row['Type']
    
    if anime_type == 'movie' and duration < 40:
        return 'OVA' # Return standardized Uppercase
    
    if anime_type == 'tv' and duration < 10:
        return 'Special' # Return standardized Title Case
        
    return row['Type']

original_type = df['Type'].copy()
df['Type'] = df.apply(fix_type_by_duration, axis=1)
diff_type = (df['Type'] != original_type).sum()
if diff_type > 0:
    print(f"  [Logic Fix] Reclassified {diff_type} rows (Movie<40m -> OVA, TV<10m -> Special).")
    
## 4. Fix Logic: Consolidate 'Source' Categories (Macro-categorization)
# Problem: Source is too fragmented (e.g., 'Manga', 'Web Manga', '4-koma Manga' are split).
def group_source_category(row):
    s = str(row['Source']).strip()
    
    # 1. Manga Group
    if s in ['Manga', 'Web manga', '4-koma manga', 'Digital manga']:
        return 'Manga'
    
    # 2. Literature Group
    elif s in ['Light novel', 'Novel', 'Web novel', 'Book', 'Picture book']:
        return 'Literature'
    
    # 3. Game Group
    elif s in ['Game', 'Visual novel', 'Card game']:
        return 'Game'
    
    # 4. Original 
    elif s == 'Original':
        return 'Original'
    
    # 5. Others
    else:
        return 'Other/Mixed'

original_source_counts = df['Source'].nunique()
df['Source'] = df.apply(group_source_category, axis=1)
new_source_counts = df['Source'].nunique()

print(f" [Logic Fix] Source column grouped from {original_source_counts} micro-categories to {new_source_counts} macro-categories.")
print("\nNew Source Distribution:")
print(df['Source'].value_counts())

# --- 5. FINAL STANDARDIZATION (FIX MIXED CASING) ---
# This step ensures "special" becomes "Special", "tv" becomes "TV", etc.
print("\n--- Finalizing Text Formatting ---")
for col in ['Type', 'Status', 'Rating', 'Source']:
    # Convert to Title Case first (e.g. "finished airing" -> "Finished Airing")
    df[col] = df[col].astype(str).str.strip().str.title()
    
    # Fix specific acronyms to Uppercase
    df[col] = df[col].replace({
        'Tv': 'TV', 
        'Ova': 'OVA', 
        'Ona': 'ONA',
        'Nan': np.nan # 'nan' string back to real NaN
    })

print("Text columns standardized (Title Case/Uppercase).")

# --- Verification ---
print("\nCheck Type distribution after standardization:")
print(df['Type'].value_counts().head())

# --- Verification ---
print("\nSummary statistics after all logic fixes:")
display(df[['Episodes', 'Score', 'Duration Minutes']].describe())

5.1. Data type coercion complete.

--- 5.2. Starting Sanity Checks (Fixing Impossible Values) ---


  [Fix] Found 9 'Finished' anime with 0/NaN episodes. Setting Episodes to NaN.
  [Fix] Found 96 Movies with >1 episodes. Forcing Episodes to 1.
  [Fix] Found 7 rows with Date Paradox (End < Start). Setting End Date to NaT.

--- 5.3. Starting Advanced Logic Fixes (Domain Knowledge) ---
  [Logic Fix] Updated 977 rows where Rating conflicted with Adult Genres.
  [Logic Fix] Reclassified 3525 rows (Movie<40m -> OVA, TV<10m -> Special).
 [Logic Fix] Source column grouped from 16 micro-categories to 5 macro-categories.

New Source Distribution:
Source
Original       9622
Manga          5448
Other/Mixed    5268
Game           2407
Literature     2160
Name: count, dtype: int64

--- Finalizing Text Formatting ---
Text columns standardized (Title Case/Uppercase).

Check Type distribution after standardization:
Type
TV         6216
OVA        6146
Special    3939
ONA        3533
Music      2686
Name: count, dtype: int64

Summary statistics after all logic fixes:


Unnamed: 0,Episodes,Score,Duration Minutes
count,24294.0,15692.0,24240.0
mean,14.894501,6.38089,22.016233
std,48.581309,0.928696,24.483495
min,1.0,1.85,0.05
25%,1.0,5.73,4.0
50%,2.0,6.39,17.0
75%,13.0,7.06,25.0
max,3057.0,9.1,168.0


### **6. Drop Unnecessary Columns**

The final step in our pre-split cleaning process is to remove columns that are not useful for our analysis, are too sparse to be reliable, or are redundant. A clean, focused dataset is easier to work with and leads to more robust insights.

Based on our initial exploration and the project's goals, we will drop the following columns:

*   **`Licensors`**: This column is missing over 80% of its values, making it highly unreliable for any analysis or modeling.
*   **`Premiered`**: This column also has a very high rate of missing values (over 77%). The temporal information it provides (year and season) has already been extracted with higher reliability from the `Aired` column into our new `air_year` and `air_month` features. Therefore, it is now redundant.
*   **`English name`, `Other name`**: These are largely redundant, as the primary `Name` column serves as the main identifier for each anime.
*   **`Image URL`**: This column contains links to images and is not a feature that can be used in our current analytical scope.

*   **`Synopsis`**: While this column contains descriptive text, extracting meaningful features from it requires advanced Natural Language Processing (NLP) techniques that are beyond the scope of this project. To maintain focus on core data preparation and visualization tasks, this column will be removed.

Removing these columns results in a final, streamlined DataFrame that is ready to be saved for the next phases of analysis and modeling.

In [82]:
# Define the list of columns to be dropped
cols_to_drop = ['Licensors','Premiered', 'English name', 'Other name', 'Image URL','Synopsis']

print("DataFrame shape BEFORE dropping columns:", df.shape)
print("Columns to be dropped:", cols_to_drop)

# Drop the specified columns from the DataFrame
df = df.drop(columns=cols_to_drop)

print("\nDataFrame shape AFTER dropping columns:", df.shape)

# --- Verification Step ---
print("\nVerifying the final set of columns and DataFrame info:")
# The .info() summary will now show the reduced column count.
df.info()

print("\nFinal columns in the DataFrame:")
print(df.columns.tolist())

DataFrame shape BEFORE dropping columns: (24905, 27)
Columns to be dropped: ['Licensors', 'Premiered', 'English name', 'Other name', 'Image URL', 'Synopsis']

DataFrame shape AFTER dropping columns: (24905, 21)

Verifying the final set of columns and DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   anime_id          24905 non-null  int64         
 1   Name              24905 non-null  object        
 2   Score             15692 non-null  float64       
 3   Genres            19976 non-null  object        
 4   Type              24831 non-null  object        
 5   Episodes          24294 non-null  float64       
 6   Status            24905 non-null  object        
 7   Producers         11555 non-null  object        
 8   Studios           14379 non-null  object        
 9   Source            24905 n

### **7. Save Pre-Split Data**

This is the final step of our pre-split cleaning phase. We have successfully transformed the raw, messy data into a structured, clean, and consistent format.

We will now save this processed DataFrame to a new file named `prepared_data.csv`. This file will serve as the clean foundation for all subsequent tasks, including the data storytelling, visualization (Part 1), and any further analysis or machine learning modeling (Part 3).

By setting `index=False`, we prevent pandas from writing the DataFrame index as a new column in our CSV file, keeping the output clean.

In [83]:
df[['Duration Minutes','Episodes']].value_counts()

Duration Minutes  Episodes
2.0               1.0         1100
3.0               1.0         1080
4.0               1.0          949
24.0              12.0         587
23.0              12.0         572
                              ... 
10.0              260.0          1
                  247.0          1
                  240.0          1
                  237.0          1
168.0             1.0            1
Name: count, Length: 2048, dtype: int64

In [84]:
# Define the output filename
output_filename = r'..\data\processed\prepared_data.csv'

# Save the cleaned DataFrame to a CSV file.
# index=False ensures that the DataFrame index is not saved as a column.
df.to_csv(output_filename, index=False)

### **8. Data Dictionary for `prepared_data.csv`**

This section serves as the official documentation for our clean dataset (`prepared_data.csv`). It details the state of each feature after the Pre-Split Cleaning phase and provides **strategic recommendations** for the subsequent Machine Learning pipeline (Phase 3).

**Critical Handover Notes (Based on Diagnostic EDA):**
1.  **Outliers:** We retained statistical outliers (e.g., long-running series like *One Piece* in `Episodes`) because they are genuine data points. **Strategy:** Do not drop them. Use **Power Transformation (Yeo-Johnson)** or **Robust Scaling** in the modeling pipeline to handle skewness.
2.  **Data Leakage:** `Rank` and `Popularity` are highly correlated with the target `Score`. **Strategy:** These columns must be **DROPPED** immediately after loading data for modeling to ensure a fair evaluation.
3.  **Seasonality:** We extracted `Aired Month`. **Strategy:** Use **Cyclical Encoding (Sin/Cos)** for this feature to capture seasonal patterns effectively.

| Feature Name | Data Type | Description & State | Required Post-Split Preprocessing |
| :--- | :--- | :--- | :--- |
| **`anime_id`** | `int64` | Unique identifier. | Drop (Not a feature). |
| **`Name`** | `object` | Primary name. | Drop (Not a feature). |
| **`Score`** | `float64` | Average user score (0-10). **Target Variable.** | Drop rows with NaN pre-split. |
| **`Genres`** | `object` (list) | List of genres (Normalized). | **Multi-label Binarization** or Embedding. |
| **`Type`** | `object` | Type of anime (e.g., 'tv', 'movie'). Cleaned. | **Imputation** & **One-Hot Encoding**. |
| **`Episodes`** | `float64` | Number of episodes. Zeros/Negatives fixed. **Highly Skewed**. | **Imputation** & **Power Transform (Yeo-Johnson)**. |
| **`Status`** | `object` | Airing status. Cleaned. | **Imputation** & **One-Hot Encoding**. |
| **`Producers`** | `object` (list) | List of producers (Normalized). High cardinality. | **Count Encoding** or **Target Encoding**. |
| **`Studios`** | `object` (list) | List of studios (Normalized). | **Multi-label Binarization**. |
| **`Source`** | `object` | Source material. Cleaned. | **Imputation** & **One-Hot Encoding**. |
| **`Rating`** | `object` | Age rating. Cleaned. | **Ordinal** or **One-Hot Encoding**. |
| **`Rank`** | `float64` | Popularity rank. Fixed zeros. | **LEAKAGE RISK. DROP before Modeling.** |
| **`Popularity`** | `float64` | Popularity score. Fixed zeros. | **LEAKAGE RISK. DROP before Modeling.** |
| **`Favorites`** | `int64` | User favorites count. **Highly Skewed**. | **Power Transform (Yeo-Johnson)**. |
| **`Scored By`** | `float64` | Voter count. **Highly Skewed**. | **Power Transform (Yeo-Johnson)**. |
| **`Members`** | `int64` | Group members count. **Highly Skewed**. | **Power Transform (Yeo-Johnson)**. |
| **`Aired Date Start`**| `datetime64[ns]`| Start date. NaT if unknown. | Feature Engineering (e.g., Age of anime) or Drop. |
| **`Aired Date End`** | `datetime64[ns]`| End date. Fixed logic errors. | Drop (captured in Duration/Episodes). |
| **`Aired Year`** | `float64` | Year started. | **Imputation** & **Scaling**. |
| **`Aired Month`** | `float64` | Month started (1-12). | **Cyclical Encoding (Sin/Cos)**. |
| **`Duration Minutes`**| `float64` | Total minutes. includes fractional minutes. | **Imputation** & **Robust Scaling**. |

## **Part II : Post-Split Pipeline**
*Goal: Build a parameter-learning preprocessing pipeline (imputation, scaling, and other stateful steps) and package it into a Scikit-learn Pipeline object that will be exported for use during model training.*


### **1. Drop Columns and Rows in Requirement**

In [85]:
path = r'..\data\processed\prepared_data.csv'
df1 = pd.read_csv(path)

# Drop rows where target 'Score' is NaN, and Drop redundant column
print('Shape before drop Cols, Rows', df1.shape)
df1 = df1.dropna(subset=['Score'])
df1 = df1.drop(['Aired Date Start','Aired Date End','anime_id', 'Name','Rank','Popularity','Favorites','Scored By','Members'], axis= 1)
print('Shape after drop Cols, Rows', df1.shape)

# Example
df1.head(1)


Shape before drop Cols, Rows (24905, 21)
Shape after drop Cols, Rows (15692, 12)


Unnamed: 0,Score,Genres,Type,Episodes,Status,Producers,Studios,Source,Rating,Aired Year,Aired Month,Duration Minutes
0,8.75,"['Action', 'Award Winning', 'Sci-Fi']",TV,26.0,Finished Airing,['Bandai Visual'],['Sunrise'],Original,R - 17+ (Violence & Profanity),1998.0,4.0,24.0


### **2. Splitting Train and Test Set (80/20)**

In [86]:
# Split X (features) and y (target)
X = df1.drop('Score', axis=1)
y = df1['Score']

# Perform the data split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (12553, 11)
X_test shape: (3139, 11)


### **3. Custom Transformers**

### **3.1. Custom Transformers for `Genres`, `Producers`, and `Studios`**

These three columns are **multi-label lists**, meaning each entry contains multiple values (e.g., multiple genres or multiple studios).  
Because scikit-learn’s default encoders cannot handle list-based features, we build **custom transformers** to:

- Normalize and clean list values  
- Group or replace rare labels based on column-specific frequency rules  
- Create consistent multi-label vocabularies  
- Convert lists into multi-hot encoded features  

In the **ColumnTransformer** later, these three columns will be handled through a **separate pipeline**, using the custom classes defined in this section.


In [87]:
class MultiListModeImputer(BaseEstimator, TransformerMixin):
    '''
    Imputes list-like columns by replacing empty or invalid lists with the
    most frequent item (mode) found in that column.
    '''
    def __init__(self, columns):
        self.columns = columns
        self.modes_ = {}

    def _ensure_list(self, x):
        if isinstance(x, list):
            return x
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except:
                return []
        return []

    def fit(self, X, y=None):
        for col in self.columns:
            temp = X[col].apply(self._ensure_list)
            all_items = []
            for lst in temp:
                all_items.extend(lst)

            if len(all_items) == 0:
                self.modes_[col] = None
            else:
                self.modes_[col] = Counter(all_items).most_common(1)[0][0]

        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            mode_item = self.modes_[col]
            X[col] = X[col].apply(self._ensure_list)
            X[col] = X[col].apply(lambda lst: lst if len(lst) > 0 else [mode_item])
        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.columns)

class FrequencyGrouper(BaseEstimator, TransformerMixin):
    '''
    Genre/Producer/Studios with frequency ≥ min_freq → keep name.
    Other Genre/Producer/Studio with frequency < min_freq → group into "Other".
    Help reduce Dimension

    '''
    def __init__(self, columns, min_freq):
        """
        columns: list các cột dạng list
        min_freq: dict chứa min_freq riêng cho từng cột
        """
        self.columns = columns
        self.min_freq = min_freq
        self.frequent_items_ = {}  # lưu item phổ biến của từng cột

    def _ensure_list(self, x):
        """Ensure input is in list form"""
        if isinstance(x, list):
            return x
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except:
                return []
        return []

    def fit(self, X, y=None):
        for col in self.columns:
            min_f = self.min_freq.get(col, 10)  # nếu thiếu thì gán mặc định 10

            temp = X[col].apply(self._ensure_list)

            # Flatten
            all_items = []
            for lst in temp:
                all_items.extend(lst)

            counts = Counter(all_items)
            frequent = [k for k, v in counts.items() if v >= min_f]

            self.frequent_items_[col] = set(frequent)

        return self

    def transform(self, X):
        X = X.copy()

        for col in self.columns:
            freq_set = self.frequent_items_[col]

            X[col] = (
                X[col]
                .apply(self._ensure_list)
                .apply(lambda lst: [item if item in freq_set else "Other"
                                    for item in lst])
            )

        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.columns)

class MultiLabelBinarizerDF(BaseEstimator, TransformerMixin):
    '''
    A custom transformer for multi-label columns.
    Converts each list-like column into multiple binary features using
    MultiLabelBinarizer, with feature names formatted as <col>__<label>.
    Supports get_feature_names_out() for pipeline compatibility.
    '''
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
        self.output_features_ = []

    def fit(self, X, y=None):
        self.output_features_ = []
        for col in self.columns:
            mlb = MultiLabelBinarizer()
            mlb.fit(X[col])
            self.encoders[col] = mlb

            # lưu tên cột sinh ra
            for c in mlb.classes_:
                self.output_features_.append(f"{col}__{c}")

        return self

    def transform(self, X):
        X = X.copy()
        encoded_list = []

        for col in self.columns:
            mlb = self.encoders[col]
            arr = mlb.transform(X[col])

            df_enc = pd.DataFrame(
                arr,
                index=X.index,
                columns=[f"{col}__{c}" for c in mlb.classes_]
            )
            encoded_list.append(df_enc)

        other_cols = X.drop(columns=self.columns)

        return pd.concat([other_cols] + encoded_list, axis=1)

    def get_feature_names_out(self, input_features=None):
        return np.array(self.output_features_)


### **3.2. Custom Transformer for `Aired Month`**
The `Aired Month` column represents a **repeating cycle** (12 → 1), so encoding it as plain integers would wrongly imply that December and January are far apart.  
To preserve this cyclical structure, the `CyclicalMonthEncoder` converts each month into two continuous features (`month_sin`, `month_cos`) using sine–cosine mapping.

This encoding ensures that:

- Adjacent months remain close in feature space  
- The model learns the natural yearly cycle  
- No artificial jumps occur between 12 → 1  

In the preprocessing pipeline, this column is processed through the custom `CyclicalMonthEncoder` defined in this section.


In [88]:
class CyclicalMonthEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.generated_features_ = []

    def fit(self, X, y=None):
        # Tạo danh sách tên feature sẽ sinh ra
        self.generated_features_ = []
        for col in self.columns:
            self.generated_features_.append(f"{col}_sin")
            self.generated_features_.append(f"{col}_cos")
        return self

    def transform(self, X):
        X = X.copy()

        for col in self.columns:
            # Fill missing
            X[col] = X[col].fillna(X[col].median()).astype(float)

            # Add sin/cos features
            X[col + "_sin"] = np.sin(2 * np.pi * X[col] / 12)
            X[col + "_cos"] = np.cos(2 * np.pi * X[col] / 12)

            X = X.drop(columns=[col])

        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.generated_features_)


### **3.3. New Features**

To enhance the model’s ability to capture deeper patterns within the anime dataset, we introduce several engineered features that provide additional structural, behavioral, and interaction-based insights beyond the original columns.

These new features include:

- **Cross-Feature Interactions** (e.g., `Episodes_x_Duration Minutes`):  
  Capture combined effects between related attributes that may influence the final *Score* more strongly when considered together.

- **List-Based Count Features** (`Genres_Count`, `Producers_Count`, `Studios_Count`):  
  Quantify the complexity of each anime title by counting how many genres, studios, or producers are associated with it.

- **Duration-Based Quantile Category** (`DurationCat`):  
  A categorical feature created by dividing *Duration Minutes* into quantile-based bins (e.g., Q1–Q4).  
  This allows the model to learn non-linear relationships in watch-time length without relying solely on continuous scaling.

- **Episodes-Based Quantile Category** (`EpisodesCat`):  
  A quantile-based categorical version of *Episodes*, representing ranges such as Q1–Q4.  
  This captures structural differences between short-format, mid-length, and long-running series in a more interpretable form.

These engineered features are generated through the custom `FeatureEngineering` transformer and appended to the dataset before the preprocessing pipeline.


In [89]:
def create_interactions(df, pairs=[
    ('Episodes', 'Duration Minutes')
]):
    df = df.copy()
    out = {}
    for a, b in pairs:
        out[f'{a}_x_{b}'] = df[a] * df[b]
    return pd.DataFrame(out, index=df.index)


def create_list_counts(df, columns):
    df = df.copy()
    out = {}
    for col in columns:
        out[col + "_Count"] = df[col].apply(lambda x: len(x) if isinstance(x, list) else 0)
    return pd.DataFrame(out, index=df.index)

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self,
                 year_col='Aired Year',
                 degree=2,
                 list_columns=['Genres', 'Producers', 'Studios'],
                 interaction_pairs=[('Episodes', 'Duration Minutes')],
                 duration_col='Duration Minutes',
                 duration_q=4,
                 episodes_col='Episodes',
                 episodes_q=4,
                 month_col='Aired Month'):

        # Inputs
        self.year_col = year_col
        self.degree = degree
        self.list_columns = list_columns
        self.interaction_pairs = interaction_pairs
        self.duration_col = duration_col
        self.duration_q = duration_q
        self.episodes_col = episodes_col
        self.episodes_q = episodes_q
        self.month_col = month_col

        # Will be populated in fit()
        self.duration_bins = None
        self.episodes_bins = None

        # Labels
        self.duration_labels = ["Very Short", "Short", "Medium", "Long"]
        self.episodes_labels = ["Mini_Series", "Short_Series",
                                "Standard_Series", "Long_Running"]

    def fit(self, X, y=None):
        X = X.copy()

        # --- Compute quantile bins (train only) ---
        self.duration_bins = np.unique(
            np.quantile(
                X[self.duration_col].dropna(),
                np.linspace(0, 1, self.duration_q + 1)
            )
        )

        self.episodes_bins = np.unique(
            np.quantile(
                X[self.episodes_col].dropna(),
                np.linspace(0, 1, self.episodes_q + 1)
            )
        )

        return self

    def transform(self, X):
        X = X.copy()

        # Count list features
        df_list = create_list_counts(X, self.list_columns)

        # Interaction features
        df_inter = create_interactions(X, self.interaction_pairs)

        # --- DurationCat ---
        dur_numeric = pd.cut(
            X[self.duration_col],
            bins=self.duration_bins,
            labels=False,
            include_lowest=True
        )

        dur_labels = dur_numeric.map(
            lambda x: self.duration_labels[int(x)] if pd.notna(x) else np.nan
        )

        df_dur = pd.DataFrame({'DurationCat': dur_labels}, index=X.index)

        # --- EpisodesCat ---
        ep_numeric = pd.cut(
            X[self.episodes_col],
            bins=self.episodes_bins,
            labels=False,
            include_lowest=True
        )

        ep_labels = ep_numeric.map(
            lambda x: self.episodes_labels[int(x)] if pd.notna(x) else np.nan
        )

        df_ep = pd.DataFrame({'EpisodesCat': ep_labels}, index=X.index)

        # === Combine all new features ===
        X_new = pd.concat(
            [X, df_list, df_inter, df_dur, df_ep],
            axis=1
        )

        return X_new


### **4. Sub-Pipeline**

### **4.1. Feature Engineering Pipeline**

In [90]:
feature_engineering = FeatureEngineering()

### **4.2. Preprocessor Pipeline**

In [91]:
numeric_features = ['Episodes','Duration Minutes','Aired Year',
                   'Episodes_x_Duration Minutes','Genres_Count','Producers_Count','Studios_Count']

nominal_features = ['Type','Status','Source','Rating','EpisodesCat','DurationCat'] 
multi_label_features = ['Genres','Producers', 'Studios']
cyclidal_features = ['Aired Month']


# 1. Create a sub-pipeline for NUMERICAL data
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', RobustScaler())
])


# 2. Create a sub-pipeline for Multi-label data
multi_label_pipeline = Pipeline(steps=[
    ('imputer', MultiListModeImputer(columns=multi_label_features)),
    ('grouper', FrequencyGrouper(columns=multi_label_features, min_freq={"Genres": 25, "Producers": 40, "Studios":40})),
    ('multi_encoder', MultiLabelBinarizerDF(columns=multi_label_features))
])

# 3. Create a sub-pipeline for cyclidal data: Aired Month
aired_month_pipeline = Pipeline(steps=[
    ('cyclical', CyclicalMonthEncoder(cyclidal_features))
])


# 4. Create sub-pipeline for nominal category 
nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# 5. Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # 1. Numerical columns (median impute + power transform + robust scaling)
        ('num', numeric_pipeline, numeric_features),

        # 2. Nominal categorical columns (mode impute + OneHotEncoder)
        ('nominal', nominal_pipeline, nominal_features),

        # 3. Multi-label columns (list-impute → frequency grouping → multi-hot encoding)
        ('multi_label', multi_label_pipeline, multi_label_features),

        # 4. Cyclical encoded column (Aired Month → sin/cos)
        ('cyclical', aired_month_pipeline, cyclidal_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False   
)



### **5. Full Pipeline**

In [92]:
full_pipeline = Pipeline([
    ('feature_engineering', feature_engineering),
    ('preprocessor', preprocessor)
])

print("Full regression pipeline created.")

full_pipeline.fit(X_train, y_train)


Full regression pipeline created.


0,1,2
,steps,"[('feature_engineering', ...), ('preprocessor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,year_col,'Aired Year'
,degree,2
,list_columns,"['Genres', 'Producers', ...]"
,interaction_pairs,"[('Episodes', ...)]"
,duration_col,'Duration Minutes'
,duration_q,4
,episodes_col,'Episodes'
,episodes_q,4
,month_col,'Aired Month'

0,1,2
,transformers,"[('num', ...), ('nominal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,columns,"['Genres', 'Producers', ...]"

0,1,2
,columns,"['Genres', 'Producers', ...]"
,min_freq,"{'Genres': 25, 'Producers': 40, 'Studios': 40}"

0,1,2
,columns,"['Genres', 'Producers', ...]"

0,1,2
,columns,['Aired Month']


In [93]:
X_train_process = full_pipeline.transform(X_train)
X_test_process = full_pipeline.transform(X_test)
print('X train shape after preprocessing pipeline:', X_train_process.shape)
print('X test shape after preprocessing pipeline:', X_test_process.shape)

X train shape after preprocessing pipeline: (12553, 220)
X test shape after preprocessing pipeline: (3139, 220)


### **6. Save Pipeline**

After fitting the preprocessing pipeline on the training data, we save the entire object using `joblib` so it can be reused later in `03_Modeling_Comparison` without refitting. This ensures consistency, prevents data leakage.

**Note:** When loading this pipeline, all custom transformers (e.g., `CyclicalMonthEncoder`, `FeatureEngineering`, etc.) must be imported before calling `joblib.load()`. Otherwise, Python will not be able to deserialize the pipeline.


In [94]:
print("--- SAVING PIPELINE ---")

# 1. Train the pipeline on the ENTIRE X_train, y_train
full_pipeline.fit(X_train, y_train)
print("Pipeline has been fit on X_train, y_train.")

# 2. Save the pipeline
# Use joblib.dump to 'freeze' the entire pipeline (including imputer, scaler, ...)
pipeline_filename = r'..\models\processing_pipeline.pkl'
joblib.dump(full_pipeline, pipeline_filename)
print(f"Pipeline saved to file: {pipeline_filename}")

--- SAVING PIPELINE ---
Pipeline has been fit on X_train, y_train.
Pipeline saved to file: ..\models\processing_pipeline.pkl
