In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [16]:
def run_rf(df):
    # Encode genre labels
    le_genre = LabelEncoder()
    df['genre_encoded'] = le_genre.fit_transform(df['genre'])
    print("Train genres encoded:\n", set(df['genre_encoded'].values))
    
    # Split into train (Clara, Kenji) and test (Aimen)
    train_df = df[df['participant'].isin(['aimen', 'kenji'])]
    test_df = df[df['participant'] == 'clara']
    
    print("\nTrain genres:\n", train_df['genre'].value_counts())
    print("\nTest genres:\n", test_df['genre'].value_counts())
    
    # Automatically select numeric feature columns (excluding timestamp, target, person)
    excluded_cols = ['timestamp', 'genre', 'genre_encoded', 'participant']
    feature_cols = [col for col in df.columns if col not in excluded_cols and pd.api.types.is_numeric_dtype(df[col])]
    
    # Normalize all feature columns per person
    df[feature_cols] = df.groupby('participant')[feature_cols].transform(lambda x: (x - x.mean()) / x.std())
    
    X_train = train_df[feature_cols]
    y_train = train_df['genre_encoded']
    X_test = test_df[feature_cols]
    y_test = test_df['genre_encoded']
    
    # Train classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = clf.predict(X_test)
    print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le_genre.classes_, zero_division=0))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [17]:
df = pd.read_csv('data_cleaning/data/imputed_data.csv')
df.rename({'person': 'participant'}, axis='columns', inplace=True)
run_rf(df)

Train genres encoded:
 {0, 1, 2}

Train genres:
 genre
horror         2140
comedy         1621
documentary    1479
Name: count, dtype: int64

Test genres:
 genre
documentary    1560
comedy         1340
horror         1216
Name: count, dtype: int64

Classification Report:
               precision    recall  f1-score   support

      comedy       0.33      0.17      0.22      1340
 documentary       0.48      0.55      0.51      1560
      horror       0.66      0.89      0.76      1216

    accuracy                           0.53      4116
   macro avg       0.49      0.54      0.50      4116
weighted avg       0.48      0.53      0.49      4116


Confusion Matrix:
 [[ 224  834  282]
 [ 417  862  281]
 [  37   98 1081]]


In [18]:
run_rf(pd.read_csv('feature_engineering/data_with_new_features_v2.csv'))

Train genres encoded:
 {0, 1, 2}

Train genres:
 genre
horror         2132
comedy         1613
documentary    1471
Name: count, dtype: int64

Test genres:
 genre
documentary    1556
comedy         1336
horror         1212
Name: count, dtype: int64

Classification Report:
               precision    recall  f1-score   support

      comedy       1.00      1.00      1.00      1336
 documentary       0.00      0.00      0.00      1556
      horror       0.44      1.00      0.61      1212

    accuracy                           0.62      4104
   macro avg       0.48      0.67      0.54      4104
weighted avg       0.45      0.62      0.51      4104


Confusion Matrix:
 [[1336    0    0]
 [   0    0 1556]
 [   0    0 1212]]


In [20]:
run_rf(pd.read_csv('feature_engineering/data_with_new_features_v3.csv'))

Train genres encoded:
 {0, 1, 2}

Train genres:
 genre
horror         2132
comedy         1613
documentary    1471
Name: count, dtype: int64

Test genres:
 genre
documentary    1556
comedy         1336
horror         1212
Name: count, dtype: int64

Classification Report:
               precision    recall  f1-score   support

      comedy       1.00      1.00      1.00      1336
 documentary       0.00      0.00      0.00      1556
      horror       0.44      1.00      0.61      1212

    accuracy                           0.62      4104
   macro avg       0.48      0.67      0.54      4104
weighted avg       0.45      0.62      0.51      4104


Confusion Matrix:
 [[1336    0    0]
 [   0    0 1556]
 [   0    0 1212]]


In [6]:
def run_rf_70_30(df):
    # Encode genre labels
    le_genre = LabelEncoder()
    df['genre_encoded'] = le_genre.fit_transform(df['genre'])
    
    # Drop non-feature columns
    feature_cols = [col for col in df.columns if col not in ['timestamp', 'genre', 'genre_encoded', 'participant']]
    X = df[feature_cols]
    X = X.select_dtypes(include='number')
    y = df['genre_encoded']
    
    # Train-test split (70% train, 30% test, stratified by genre)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    
    # Train classifier
    clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
    clf.fit(X_train, y_train)
    
    # Predictions and evaluation
    y_pred = clf.predict(X_test)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(
        y_test, y_pred, target_names=le_genre.classes_, zero_division=0))

In [7]:
df = pd.read_csv('data_cleaning/data/imputed_data.csv')
df.rename({'person': 'participant'}, axis='columns', inplace=True)
run_rf_70_30(df)

Confusion Matrix:
 [[856  10  22]
 [  2 898  12]
 [ 25  22 960]]

Classification Report:
               precision    recall  f1-score   support

      comedy       0.97      0.96      0.97       888
 documentary       0.97      0.98      0.98       912
      horror       0.97      0.95      0.96      1007

    accuracy                           0.97      2807
   macro avg       0.97      0.97      0.97      2807
weighted avg       0.97      0.97      0.97      2807



In [8]:
run_rf_70_30(pd.read_csv('feature_engineering/data_with_new_features_v2.csv'))

Confusion Matrix:
 [[ 885    0    0]
 [   0  908    0]
 [   0    0 1003]]

Classification Report:
               precision    recall  f1-score   support

      comedy       1.00      1.00      1.00       885
 documentary       1.00      1.00      1.00       908
      horror       1.00      1.00      1.00      1003

    accuracy                           1.00      2796
   macro avg       1.00      1.00      1.00      2796
weighted avg       1.00      1.00      1.00      2796

