# Model Training Notebook

* Author: Finian O'Neill
* Purpose: Perform feature engineering on the training dataset, then build an ML pipeline to train a regression model.

### Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
def _select_elbow_k(distortions):
    """
    Pick the cluster count at the "elbow" of a K-means inertia curve

    Parameters:
    -----------
    distortions : sequence of float
        Inertia values for k = 1, 2, 3, ... (in that order)

    Returns:
    --------
    k : int
        The k with the largest second difference; always at least 2
    """
    if len(distortions) < 3:
        # Not enough points to measure curvature
        return 2
    # The first differences of a decreasing curve are all negative, so their
    # argmax only finds the *smallest* drop (usually the tail of the range).
    # The elbow is where the drop slows down the most: the largest second
    # difference.
    curvature = np.diff(distortions, n=2)
    # curvature[i] is centred on distortions[i + 1], which belongs to k = i + 2
    return int(np.argmax(curvature)) + 2


def cluster_similar_categories(df, column_name, n_clusters=None, similarity_threshold=0.7):
    """
    Groups similarly spelled categories using character n-gram TF-IDF and K-means

    Parameters:
    -----------
    df : pandas DataFrame
        The input dataframe (not modified; a copy is returned)
    column_name : str
        Name of the categorical column to process
    n_clusters : int, optional
        Number of clusters to create. If None, it is chosen from the elbow of
        the K-means inertia curve (k = 1..14, capped by the category count)
        and the curve is plotted
    similarity_threshold : float, optional
        Currently unused; kept so existing callers keep working

    Returns:
    --------
    df_copy : pandas DataFrame
        Copy of df with a new "<column_name>_clustered" column. Missing values
        are labelled 'other'. With 5 or fewer categories the new column simply
        repeats the original values
    mapping : dict
        Mapping from original categories to cluster labels. Keys are the
        string form of each category, except in the 5-or-fewer case where
        they are the original values
    """
    df_copy = df.copy()
    new_col_name = f"{column_name}_clustered"
    source = df_copy[column_name]

    # Get unique categories
    categories = df[column_name].dropna().unique().tolist()

    # If very few categories, no need to cluster
    if len(categories) <= 5:
        print(f"Only {len(categories)} categories found. No clustering needed.")
        # Still add the clustered column so callers can rely on it existing
        df_copy[new_col_name] = source.astype(object).where(source.notna(), 'other')
        return df_copy, {cat: cat for cat in categories}

    # Convert categories to strings
    categories = [str(cat) for cat in categories]

    # Create TF-IDF vectors for the categories
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
    tfidf_matrix = vectorizer.fit_transform(categories)

    # Determine number of clusters if not provided
    if n_clusters is None:
        distortions = []
        K_range = range(1, min(15, len(categories)))
        for k in K_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(tfidf_matrix)
            distortions.append(kmeans.inertia_)

        n_clusters = _select_elbow_k(distortions)

        # Plot elbow curve
        plt.figure(figsize=(10, 6))
        plt.plot(K_range, distortions, 'bx-')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.title('Elbow Method For Optimal Clusters')
        plt.axvline(x=n_clusters, color='r', linestyle='--')
        plt.show()

        print(f"Auto-selected {n_clusters} clusters based on elbow method")

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(tfidf_matrix)

    # Group categories by the cluster they were assigned to
    cluster_groups = defaultdict(list)
    for cat, label in zip(categories, cluster_labels):
        cluster_groups[label].append(cat)

    # Count each category once, on the string form, so the lookup below also
    # works when the column holds non-string values (ints, floats, ...)
    frequency = source.dropna().astype(str).value_counts()

    # Create meaningful names for clusters
    mapping = {}
    for label, members in cluster_groups.items():
        if len(members) == 1:
            # If only one member, keep original name
            cluster_name = members[0]
        else:
            # Name the cluster after its most frequent member
            most_common = frequency.loc[members].idxmax()
            cluster_name = f"{most_common}_cluster_{label}"

        # Map all members to this cluster name
        for member in members:
            mapping[member] = cluster_name

    # Apply mapping to create new column. Missing values are masked out first
    # so that NaN (which astype(str) turns into 'nan') is never mistaken for a
    # real category spelled "nan".
    clustered = source.astype(str).map(mapping)
    df_copy[new_col_name] = clustered.where(source.notna()).fillna('other')

    # Print cluster statistics
    original_count = len(categories)
    new_count = len(set(mapping.values()))
    print(f"Reduced cardinality from {original_count} to {new_count} categories")

    # Print samples from each cluster to verify semantic similarity
    print("\nSample clusters:")
    for label, members in list(cluster_groups.items())[:5]:  # Show first 5 clusters
        print(f"Cluster {label}: {', '.join(members[:5])}" +
              (f" ... and {len(members)-5} more" if len(members) > 5 else ""))

    return df_copy, mapping

### Data Import

In [None]:
# Load the training dataset from data/train.csv and inspect its first rows.
train_df = pd.read_csv('data/train.csv')
train_df.head()