In [3]:
from src.utils.results_utils import *
from src.utils.data_utils import str_dict_to_values
from src.utils.ml_utils import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pickle

## Load and clean the dataset

In [4]:
# Read the cleaned dataset and convert the 'Country' and 'Genres' columns with
# the project helper `str_dict_to_values`.
# NOTE(review): presumably each cell holds a stringified dict whose values are
# extracted — confirm in src.utils.data_utils.
df_ml = pd.read_csv('data/cleaned.csv').assign(
    Country=lambda frame: frame['Country'].apply(str_dict_to_values),
    Genres=lambda frame: frame['Genres'].apply(str_dict_to_values),
)

In [5]:
# Discard every column that is not used for genre prediction.
df_ml = df_ml.drop(
    columns=[
        'Wikipedia_ID', 'Name', 'Sex', 'Actor_age',
        'Country', 'Languages', 'Release_date',
    ]
)
df_ml.head()

Unnamed: 0,Genres,Character_name
0,"[Thriller, Science Fiction, Horror, Adventure,...",Akooshay
1,"[Thriller, Science Fiction, Horror, Adventure,...",Melanie
2,"[Thriller, Science Fiction, Horror, Adventure,...",Williams
3,"[Thriller, Science Fiction, Horror, Adventure,...",Jericho
4,"[Thriller, Science Fiction, Horror, Adventure,...",Bashira


In [6]:
# Run the project's GenreCategorizer over the frame.
# NOTE(review): presumably this adds the `Genre_Category` column that the
# following cells read — confirm in the GenreCategorizer implementation.
categorizer = GenreCategorizer()
df_ml = categorizer.categorize_genres_in_df(df_ml)

In [7]:
# The raw 'Genres' column is no longer needed once 'Genre_Category' exists.
df_ml = df_ml.drop(columns=['Genres'])
df_ml.head()

Unnamed: 0,Character_name,Genre_Category
0,Akooshay,"[Action & Adventure, Horror & Thriller, Fantas..."
1,Melanie,"[Action & Adventure, Horror & Thriller, Fantas..."
2,Williams,"[Action & Adventure, Horror & Thriller, Fantas..."
3,Jericho,"[Action & Adventure, Horror & Thriller, Fantas..."
4,Bashira,"[Action & Adventure, Horror & Thriller, Fantas..."


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encoding of the genre categories.
# Passing the class list explicitly fixes the order of the encoded columns.
genres_list = ['Action & Adventure', 'Drama', 'Comedy', 'Horror & Thriller',
              'Fantasy & Sci-Fi', 'Historical & War', 'Romance', 'Documentary',
              'Music & Performance', 'Cult & B-Movies', 'Other']

# Encode each row's list of categories as one 0/1 indicator per class.
mlb = MultiLabelBinarizer(classes=genres_list)
genre_encoded = mlb.fit_transform(df_ml['Genre_Category'])

# `genre_encoded` is positional: row i encodes row i of df_ml. Build the frame
# on df_ml's own index so that the column-wise concat below, which aligns on
# index labels, pairs every name with its own genres even when df_ml's index
# is not 0..n-1 (a mismatch would otherwise silently create NaN rows).
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=df_ml.index)

# Keep only the name next to its encoded genres.
df_ml = pd.concat([df_ml['Character_name'], genre_df], axis=1)

# Normalise to a clean 0..n-1 index for the positional concats that follow.
df_ml = df_ml.reset_index(drop=True)


We add the most frequent special characters to the alphabet: 'é', 'è', 'í' and 'á'.

In [9]:
# Plain lowercase Latin letters followed by the extra accented characters.
augmented_alphabet = 'abcdefghijklmnopqrstuvwxyz' + 'éèíá'

In [10]:
character_processor = NameFeatureProcessor('Character_name', ngram_range=(2, 2))

# Derive name features over the augmented alphabet; the boolean switches
# select which feature groups the project processor generates.
# NOTE(review): `ngram_range` is passed while `ngram=False` — presumably it is
# unused in this configuration; confirm in NameFeatureProcessor.
df_ml = character_processor.process(
    df_ml,
    alphabet=augmented_alphabet,
    analyze_name=True,
    diacritic=False,
    phonetics=False,
    first_last=True,
    ngram=False,
)
df_ml.head()

Unnamed: 0,Character_name,Action & Adventure,Drama,Comedy,Horror & Thriller,Fantasy & Sci-Fi,Historical & War,Romance,Documentary,Music & Performance,...,u_l,v_l,w_l,x_l,y_l,z_l,é_l,è_l,í_l,á_l
0,Akooshay,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Melanie,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Williams,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Jericho,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bashira,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Hash the character 2-grams and 3-grams of each name into 100 feature columns.
vectorizer = HashingVectorizer(analyzer='char', ngram_range=(2, 3), n_features=100)

# Persist the vectorizer: `feature_creation` loads 'hashing_vectorizer.pkl' at
# prediction time, and it must use exactly the configuration the model was
# trained with. Without this step the file is never produced by this notebook.
with open('hashing_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

ngram_features = vectorizer.fit_transform(df_ml['Character_name'])

# The hashed matrix is positional, so reuse df_ml's index; the column-wise
# concat aligns on index labels and would otherwise introduce NaN rows if
# df_ml's index were not 0..n-1.
n_gram_df = pd.DataFrame(ngram_features.toarray(), index=df_ml.index)
df_ml = pd.concat([df_ml, n_gram_df], axis=1)

In [12]:
# Preview ten random rows; the fixed seed keeps the displayed sample
# identical across re-runs (same seed as the split and the model).
df_ml.sample(10, random_state=42)

Unnamed: 0,Character_name,Action & Adventure,Drama,Comedy,Horror & Thriller,Fantasy & Sci-Fi,Historical & War,Romance,Documentary,Music & Performance,...,90,91,92,93,94,95,96,97,98,99
57145,Deloris,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73014,George,1,1,0,0,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156063,Torres,0,1,1,0,0,0,0,0,0,...,0.0,0.333333,0.0,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0
32692,Kara,0,0,1,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153102,Cleopatra,1,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.258199,0.0,0.0
126181,Ed,0,0,1,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83006,Kees,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0
81260,Zack,0,1,1,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83834,Pyotr,1,0,0,1,0,1,0,0,0,...,0.0,-0.377964,0.0,0.0,0.0,0.0,0.0,0.377964,0.0,0.0
98338,Lester,0,0,1,1,0,0,0,0,0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Genre Predictions

In [13]:
# Work on an independent copy. The hashed n-gram columns carry integer labels,
# so every column label is converted to a string.
df_ml_genres = df_ml.copy()
df_ml_genres.columns = df_ml_genres.columns.map(str)

In [14]:
# Number of rows that contain at least one missing value.
rows_with_nans = df_ml_genres.isnull().any(axis='columns').sum()
print(rows_with_nans)

0


In [15]:
# Remove any row with a missing value.
df_ml_genres = df_ml_genres.dropna(axis='index', how='any')

In [16]:
# Target matrix (y): one 0/1 indicator column per genre category.
y = df_ml_genres[[
    'Action & Adventure', 'Drama', 'Comedy', 'Horror & Thriller',
    'Fantasy & Sci-Fi', 'Historical & War', 'Romance', 'Documentary',
    'Music & Performance', 'Cult & B-Movies', 'Other',
]]
# Feature matrix (X): everything except the raw name and the target columns.
X = df_ml_genres.drop(columns=['Character_name', *y.columns])

In [17]:
# Hold out 10% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [18]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Wrap a class-balanced random forest so that every genre column is predicted.
clf = MultiOutputClassifier(
    RandomForestClassifier(
        n_estimators=50,
        max_depth=10,
        class_weight="balanced",
        random_state=42,
    )
)
clf.fit(X_train, y_train)

In [19]:
# Predict the genre indicators for the held-out test rows.
y_pred = clf.predict(X_test)

In [20]:
from sklearn.metrics import classification_report
# Evaluate the model: per-genre precision/recall/F1 plus the averaged rows.
# zero_division=0 scores ill-defined cases (e.g. a sample with no predicted
# label in the "samples avg" row) as 0 explicitly; this is the value the
# default "warn" setting already uses, but without emitting the warning.
print(classification_report(y_test, y_pred, target_names=y.columns, zero_division=0))

                     precision    recall  f1-score   support

 Action & Adventure       0.44      0.53      0.48      6457
              Drama       0.66      0.65      0.65     10232
             Comedy       0.45      0.44      0.44      6280
  Horror & Thriller       0.29      0.67      0.40      4009
   Fantasy & Sci-Fi       0.16      0.42      0.23      2010
   Historical & War       0.13      0.48      0.20      1615
            Romance       0.25      0.48      0.33      3512
        Documentary       0.05      0.48      0.08       523
Music & Performance       0.13      0.44      0.21      1234
    Cult & B-Movies       0.15      0.61      0.24      1920
              Other       0.05      0.44      0.10       404

          micro avg       0.29      0.55      0.38     38196
          macro avg       0.25      0.51      0.31     38196
       weighted avg       0.40      0.55      0.44     38196
        samples avg       0.29      0.55      0.36     38196



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
# Serialise the fitted classifier to disk.
with open('model_genres.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

In [22]:
def feature_creation(name):
    """Build the one-row feature frame for a single name.

    The name is placed in a 'Name' column, run through a NameFeatureProcessor
    with the same switches as the training cell, and then extended with the
    hashed n-gram columns produced by the pickled HashingVectorizer.

    Args:
        name: The name to turn into features.

    Returns:
        A DataFrame holding the 'Name' column, the processor's feature
        columns and the hashed n-gram columns.
    """
    name_frame = pd.DataFrame([name], columns=['Name'])
    processor = NameFeatureProcessor('Name', ngram_range=(2,2))
    name_frame = processor.process(
        name_frame,
        alphabet=augmented_alphabet,
        analyze_name=True,
        diacritic=False,
        phonetics=False,
        first_last=True,
        ngram=False,
    )

    # NOTE(review): pickle.load can execute arbitrary code, so this file must
    # come from a trusted source.
    with open('hashing_vectorizer.pkl', 'rb') as vectorizer_file:
        hashing_vectorizer = pickle.load(vectorizer_file)

    # Hash the name's n-grams and append them column-wise.
    hashed_ngrams = hashing_vectorizer.transform(name_frame['Name'])
    hashed_frame = pd.DataFrame(hashed_ngrams.toarray())
    return pd.concat([name_frame, hashed_frame], axis=1)

# Function for prediction using the trained model
def predict(df):
    """Predict genre indicators for a feature frame with the trained model.

    The 'Name' column is removed and the remaining column labels are cast to
    strings so they match the feature names the model was fitted on. The
    caller's frame is left untouched: the work is done on a new frame, so the
    same input can be inspected or passed again after the call.

    Args:
        df: Feature frame containing a 'Name' column plus the model features.

    Returns:
        Whatever the module-level `clf.predict` returns for the features.

    Raises:
        KeyError: If `df` has no 'Name' column.
    """
    # drop() without inplace returns a new frame instead of mutating `df`.
    features = df.drop(columns=['Name'])
    # Ensure column names are strings to match the model's feature names.
    features.columns = features.columns.astype(str)
    return clf.predict(features)

# Function to process the name, extract features, and predict genres
def create_and_predict(name):
    """Print the genre labels predicted for `name`.

    Builds the features with `feature_creation`, runs `predict`, and decodes
    the binary prediction back to genre names with the module-level `mlb`.

    Args:
        name: The name to predict genres for.

    Returns:
        None; the decoded labels are printed.
    """
    prediction = predict(feature_creation(name))
    # Map the 0/1 indicator rows back to their genre names and display them.
    print(mlb.inverse_transform(prediction))

In [23]:
# Example: print the genre categories predicted for a single name.
create_and_predict('fanny')

[('Drama', 'Comedy', 'Fantasy & Sci-Fi', 'Historical & War', 'Romance')]


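Note: the model and the vectorizer are stored with `pickle`; see the scikit-learn documentation on the security and maintainability limitations of this approach: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations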
