In [1]:
from src.utils.results_utils import *
from src.utils.data_utils import str_dict_to_values
from src.utils.ml_utils import *
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
import ast

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\amaur\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\amaur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


## Load and clean the dataset

In [2]:
# Read the cleaned character table, then run every 'Country' cell through
# str_dict_to_values (project helper; presumably unpacks a stringified dict -- confirm).
df_ml = pd.read_csv('data/cleaned.csv')
df_ml = df_ml.assign(Country=df_ml['Country'].apply(str_dict_to_values))

In [3]:
# Discard the metadata columns; only the character name and its genre labels are used below.
unused_columns = ['Wikipedia_ID', 'Name', 'Sex', 'Actor_age', 'Country', 'Languages', 'Release_date']
df_ml = df_ml.drop(columns=unused_columns)
df_ml.head()

Unnamed: 0,Character_name,Genre_Category
0,Akooshay,"['Action & Adventure', 'Horror & Thriller', 'F..."
1,Melanie,"['Action & Adventure', 'Horror & Thriller', 'F..."
2,Williams,"['Action & Adventure', 'Horror & Thriller', 'F..."
3,Jericho,"['Action & Adventure', 'Horror & Thriller', 'F..."
4,Bashira,"['Action & Adventure', 'Horror & Thriller', 'F..."


In [4]:
# Multi-hot encoding of the genre labels.
# The order of this list fixes the column order of the encoded matrix.
genres_list = [
    'Action & Adventure',
    'Drama',
    'Comedy',
    'Horror & Thriller',
    'Fantasy & Sci-Fi',
    'Historical & War',
    'Romance',
    'Documentary',
    'Music & Performance',
    'Cult & B-Movies',
    'Other',
]

# After the CSV round-trip each cell holds the textual repr of a list; parse it back.
df_ml['Genre_Category'] = df_ml['Genre_Category'].apply(ast.literal_eval)

# One 0/1 indicator column per genre, in the order given by genres_list.
mlb = MultiLabelBinarizer(classes=genres_list)
genre_encoded = mlb.fit_transform(df_ml['Genre_Category'])

# Wrap the indicator matrix in a DataFrame labelled with the genre names.
genre_df = pd.DataFrame(data=genre_encoded, columns=mlb.classes_)

In [5]:
# Put the character names next to their multi-hot genre columns (aligned on the row index),
# then renumber the rows from 0.
df_ml = pd.concat(
    [df_ml['Character_name'], genre_df],
    axis='columns',
).reset_index(drop=True)
df_ml.sample()

We add the most frequent special characters to the alphabet: 'é', 'è', 'á' and 'í'.

In [6]:
# The 26 lowercase ASCII letters followed by the four accented letters kept as extra symbols.
augmented_alphabet = 'abcdefghijklmnopqrstuvwxyz' + 'éèíá'

In [7]:
# Derive name-based features from the 'Character_name' column.
# NOTE(review): the boolean flags presumably toggle individual feature groups inside
# NameFeatureProcessor (project helper) -- confirm against its implementation.
character_processor = NameFeatureProcessor('Character_name', ngram_range=(2, 2))

df_ml = character_processor.process(
    df_ml,
    alphabet=augmented_alphabet,
    analyze_name=True,
    diacritic=False,
    phonetics=False,
    first_last=True,
    ngram=False,
)
df_ml.head()

Unnamed: 0,Character_name,Action & Adventure,Drama,Comedy,Horror & Thriller,Fantasy & Sci-Fi,Historical & War,Romance,Documentary,Music & Performance,...,u_l,v_l,w_l,x_l,y_l,z_l,é_l,è_l,í_l,á_l
0,Akooshay,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Melanie,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Williams,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Jericho,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bashira,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Hash the character 2- and 3-grams of every name into a fixed-width vector of 100 features.
vectorizer = HashingVectorizer(analyzer='char', ngram_range=(2, 3), n_features=100)

# Persist the configured vectorizer: feature_creation() reloads it from this exact path at
# prediction time, and the file must exist after a fresh "Restart & Run All".
with open('hashing_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

ngram_features = vectorizer.fit_transform(df_ml['Character_name'])

# Reuse df_ml's index so the column-wise concat below lines rows up one-to-one
# instead of depending on df_ml happening to carry a 0..n-1 index.
n_gram_df = pd.DataFrame(ngram_features.toarray(), index=df_ml.index)
df_ml = pd.concat([df_ml, n_gram_df], axis=1)

In [9]:
# Spot-check ten random rows of the assembled feature table.
df_ml.sample(n=10)

Unnamed: 0,Character_name,Action & Adventure,Drama,Comedy,Horror & Thriller,Fantasy & Sci-Fi,Historical & War,Romance,Documentary,Music & Performance,...,90,91,92,93,94,95,96,97,98,99
28253,Ash,1,0,1,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.57735
136229,Vimal,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62829,Mollie,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,-0.333333,0.0,0.0,0.0,0.0
80534,Olivia,0,1,0,0,0,1,0,1,0,...,0.0,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71374,Pete,1,1,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70278,Veronica,0,1,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.27735,0.0,0.0,0.27735,0.0,0.0,0.0
145622,Dean,0,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124197,Frank,0,1,0,0,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97055,Richard,1,1,0,0,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.301511,0.0,0.0,0.0
11378,Othello,0,1,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.301511,0.0,-0.301511,0.0,0.301511,-0.301511,0.0


## Genre Predictions

In [10]:
# Independent copy for the genre model, with every column label converted to str
# (the hashed n-gram columns carry integer labels).
df_ml_genres = df_ml.rename(columns=str)

In [11]:
# Number of rows that contain at least one missing value.
row_has_nan = df_ml_genres.isna().any(axis='columns')
rows_with_nans = row_has_nan.sum()
print(rows_with_nans)

0


In [12]:
# Remove every row that has a missing value in any column.
df_ml_genres = df_ml_genres.dropna(axis=0, how='any')

In [13]:
# Define feature matrix (X) and target matrix (y).
# The targets are exactly the genre classes used for the multi-hot encoding, so they are
# taken from `genres_list` rather than retyped: the label columns (and their order, which
# mlb.inverse_transform relies on) can then never drift from the encoder.
target_columns = list(genres_list)

# X: everything except the raw name and the genre indicator columns.
X = df_ml_genres.drop(columns=['Character_name', *target_columns])
# y: one 0/1 column per genre, in encoder order.
y = df_ml_genres[target_columns]

In [14]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [15]:
# Multi-label model: MultiOutputClassifier fits one random forest per genre column.
clf = MultiOutputClassifier(
    estimator=RandomForestClassifier(
        n_estimators=50,
        max_depth=10,
        class_weight="balanced",
        random_state=42,
    )
)
clf.fit(X_train, y_train)

In [16]:
# Predicted genre indicators for the held-out rows.
y_pred = clf.predict(X=X_test)

In [17]:
# Evaluate the model: per-genre precision / recall / F1 plus the averaged scores.
# zero_division=0 states explicitly that an undefined ratio (e.g. a sample with no
# predicted label in the "samples avg" row) counts as 0. This is the value scikit-learn
# already substitutes by default, but without raising an undefined-metric warning.
print(classification_report(y_test, y_pred, target_names=y.columns, zero_division=0))

                     precision    recall  f1-score   support

 Action & Adventure       0.44      0.59      0.51      6457
              Drama       0.66      0.62      0.64     10232
             Comedy       0.44      0.43      0.44      6280
  Horror & Thriller       0.28      0.69      0.40      4009
   Fantasy & Sci-Fi       0.16      0.45      0.23      2010
   Historical & War       0.13      0.49      0.20      1615
            Romance       0.25      0.45      0.32      3512
        Documentary       0.04      0.44      0.08       523
Music & Performance       0.14      0.41      0.21      1234
    Cult & B-Movies       0.15      0.61      0.24      1920
              Other       0.05      0.40      0.09       404

          micro avg       0.29      0.55      0.38     38196
          macro avg       0.25      0.51      0.31     38196
       weighted avg       0.40      0.55      0.44     38196
        samples avg       0.29      0.55      0.36     38196



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Serialise the fitted multi-output classifier to disk.
with open('model_genres.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [19]:
def feature_creation(name):
    """Build the one-row feature frame for a single name.

    The name is placed in a 'Name' column, run through a NameFeatureProcessor with
    the same options as the training data, and extended with the hashed character
    n-gram columns produced by the vectorizer stored in 'hashing_vectorizer.pkl'.

    Args:
        name: The name to featurise.

    Returns:
        A one-row DataFrame holding 'Name', the processor's feature columns and
        the hashed n-gram columns (integer-labelled).

    Note:
        Relies on the notebook-level `augmented_alphabet` and expects
        'hashing_vectorizer.pkl' in the working directory. Only unpickle
        files you trust.
    """
    name_frame = pd.DataFrame([name], columns=['Name'])
    processor = NameFeatureProcessor('Name', ngram_range=(2, 2))
    name_frame = processor.process(
        name_frame,
        alphabet=augmented_alphabet,
        analyze_name=True,
        diacritic=False,
        phonetics=False,
        first_last=True,
        ngram=False,
    )

    # Reload the stored HashingVectorizer.
    with open('hashing_vectorizer.pkl', 'rb') as vectorizer_file:
        hashing_vectorizer = pickle.load(vectorizer_file)

    # Hash the name's character n-grams and append them as extra columns.
    hashed_ngrams = hashing_vectorizer.transform(name_frame['Name']).toarray()
    return pd.concat([name_frame, pd.DataFrame(hashed_ngrams)], axis=1)

# Function for prediction using the trained model
def predict(df):
    """Predict the genre indicators for a feature frame with the trained `clf`.

    Args:
        df: Feature frame as returned by feature_creation(); must contain a
            'Name' column, which is excluded from the model input.

    Returns:
        The array returned by clf.predict: one row per input row, one 0/1
        column per genre.

    Raises:
        KeyError: If `df` has no 'Name' column.
    """
    # Work on a new frame instead of dropping in place: the caller's DataFrame is left
    # untouched, so calling predict() twice on the same frame no longer fails.
    features = df.drop(columns=['Name'])
    # Ensure column names are strings to match the feature names seen during training.
    features.columns = features.columns.astype(str)
    return clf.predict(features)

# Function to process the name, extract features, and predict genres
def create_and_predict(name):
    """Featurise `name`, predict its genres and print the decoded labels.

    Args:
        name: The name to classify.

    Returns:
        None. The decoded genre tuples are printed, not returned.
    """
    # Features -> binary genre indicators from the trained model.
    binary_prediction = predict(feature_creation(name))

    # Map the indicator row(s) back to genre names via the fitted binarizer and show them.
    print(mlb.inverse_transform(binary_prediction))

In [20]:
# Example: predicted genres for a single name.
create_and_predict(name='Fanny')

[('Drama', 'Comedy', 'Horror & Thriller', 'Fantasy & Sci-Fi', 'Historical & War', 'Romance', 'Documentary', 'Cult & B-Movies')]
