In [245]:
from src.utils.results_utils import *
from src.utils.data_utils import str_dict_to_values
from src.utils.ml_utils import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pickle

## Load and clean the dataset

In [246]:
# Read the cleaned dataset, then run str_dict_to_values over the two columns
# that need parsing (the head() output further down shows Genres as lists).
df_ml = pd.read_csv('data/cleaned.csv')
for dict_column in ('Country', 'Genres'):
    df_ml[dict_column] = df_ml[dict_column].apply(str_dict_to_values)

In [247]:
# Only the genre lists and the character names are kept for this model;
# every other column is discarded here.
unused_columns = ['Wikipedia_ID', 'Name', 'Sex', 'Actor_age', 'Country', 'Languages', 'Release_date']
df_ml = df_ml.drop(columns=unused_columns)
df_ml.head()

Unnamed: 0,Genres,Character_name
0,"[Thriller, Science Fiction, Horror, Adventure,...",Akooshay
1,"[Thriller, Science Fiction, Horror, Adventure,...",Melanie
2,"[Thriller, Science Fiction, Horror, Adventure,...",Williams
3,"[Thriller, Science Fiction, Horror, Adventure,...",Jericho
4,"[Thriller, Science Fiction, Horror, Adventure,...",Bashira


In [248]:
class GenreCategorizer:
    """Collapse fine-grained movie genre names into a few broad categories.

    Every broad category is backed by a list attribute holding the raw genre
    names that feed into it. A raw genre may be listed under several
    categories, in which case it contributes to each of them.
    """

    def __init__(self):
        # Raw genre names, grouped by the broad category they map to
        self.action_adventure = [
            'Action', 'Adventure', 'Thriller', 'War film', 'Action/Adventure',
            'Martial Arts Film', 'Wuxia', 'Superhero movie', 'Western',
            'Sword and sorcery', 'Spy', 'Supernatural',
        ]
        self.drama = [
            'Drama', 'Biographical film', 'Crime Drama', 'Family Film',
            'Family Drama', 'Historical fiction', 'Biopic [feature]',
            'Courtroom Drama', 'Political drama', 'Family-Oriented Adventure',
            'Psychological thriller',
        ]
        self.comedy = [
            'Comedy', 'Romantic comedy', 'Comedy-drama', 'Comedy film',
            'Black comedy', 'Slapstick', 'Romantic comedy', 'Musical',
            'Satire', 'Parody', 'Comedy horror',
        ]
        self.horror_thriller = [
            'Horror', 'Psychological horror', 'Horror Comedy', 'Slasher',
            'Thriller', 'Crime Thriller', 'Sci-Fi Horror', 'Suspense',
            'Zombie Film', 'Natural horror films',
        ]
        self.fantasy_sci = [
            'Fantasy', 'Science Fiction', 'Space western', 'Fantasy Adventure',
            'Fantasy Comedy', 'Sci-Fi Horror', 'Sci-Fi Thriller',
            'Fantasy Drama', 'Dystopia', 'Alien Film', 'Cyberpunk',
            'Time travel',
        ]
        self.historical_war = [
            'Historical drama', 'Historical fiction', 'Historical Epic',
            'Epic', 'War effort', 'War film', 'Period piece',
            'Courtroom Drama',
        ]
        self.romance = [
            'Romance Film', 'Romantic drama', 'Romance', 'Romantic fantasy',
            'Marriage Drama',
        ]
        self.documentary = [
            'Documentary', 'Docudrama', 'Biography',
            'Historical Documentaries', 'Mondo film', 'Patriotic film',
            'Educational',
        ]
        self.music_performance = [
            'Musical', 'Music', 'Musical Drama', 'Musical comedy', 'Dance',
            'Jukebox musical', 'Concert film',
        ]
        self.cult_b_movies = [
            'Cult', 'B-movie', 'Indie', 'Experimental film', 'Surrealism',
            'Avant-garde', 'Grindhouse', 'Blaxploitation', 'Camp',
        ]

    def _category_groups(self) -> list:
        """Return (category label, raw genre list) pairs in checking order."""
        return [
            ('Action & Adventure', self.action_adventure),
            ('Drama', self.drama),
            ('Comedy', self.comedy),
            ('Horror & Thriller', self.horror_thriller),
            ('Fantasy & Sci-Fi', self.fantasy_sci),
            ('Historical & War', self.historical_war),
            ('Romance', self.romance),
            ('Documentary', self.documentary),
            ('Music & Performance', self.music_performance),
            ('Cult & B-Movies', self.cult_b_movies),
        ]

    def _categorize_genre(self, genres_movies) -> list:
        """Return the broad categories matched by an iterable of raw genres.

        Categories appear in first-match order and are never repeated. When
        no raw genre matches any category, the result is ['Other'].
        """
        groups = self._category_groups()
        matched = []

        # Walk the genres in order; each one may light up several categories
        for raw_genre in genres_movies:
            for label, members in groups:
                if raw_genre in members and label not in matched:
                    matched.append(label)

        return matched or ['Other']

    def categorize_genres_in_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a 'Genre_Category' column derived from the 'Genres' column.

        The frame is modified in place and the same object is returned.
        """
        broad_categories = df['Genres'].apply(self._categorize_genre)
        df['Genre_Category'] = broad_categories
        return df
    
# Instantiate the categorizer and add the 'Genre_Category' column to df_ml
categorizer = GenreCategorizer()
df_ml = categorizer.categorize_genres_in_df(df_ml)

In [249]:
# The raw genre lists are not needed once 'Genre_Category' has been derived
df_ml = df_ml.drop(columns=['Genres'])
df_ml.head()

Unnamed: 0,Character_name,Genre_Category
0,Akooshay,"[Action & Adventure, Horror & Thriller, Fantas..."
1,Melanie,"[Action & Adventure, Horror & Thriller, Fantas..."
2,Williams,"[Action & Adventure, Horror & Thriller, Fantas..."
3,Jericho,"[Action & Adventure, Horror & Thriller, Fantas..."
4,Bashira,"[Action & Adventure, Horror & Thriller, Fantas..."


In [250]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-Hot Encoding
# The order of this list fixes the column order of the encoded frame; the same
# order is what mlb.inverse_transform relies on when predictions are decoded.
genres_list = ['Action & Adventure', 'Drama', 'Comedy', 'Horror & Thriller', 
              'Fantasy & Sci-Fi', 'Historical & War', 'Romance', 'Documentary', 
              'Music & Performance', 'Cult & B-Movies', 'Other']

# Apply MultiLabelBinarizer to encode the genres
mlb = MultiLabelBinarizer(classes=genres_list)
genre_encoded = mlb.fit_transform(df_ml['Genre_Category'])
# Create a DataFrame for the encoded genres. It carries df_ml's index so the
# column-wise concat below lines rows up one-to-one even when df_ml's index is
# not a plain 0..n-1 range (otherwise concat would align on labels and
# introduce NaN rows).
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=df_ml.index)

# Combine the name with the genre DataFrame
df_ml = pd.concat([df_ml['Character_name'], genre_df], axis=1)

df_ml = df_ml.reset_index(drop=True)


We will add the most frequent special characters to the alphabet: 'é', 'è', 'á' and 'í'.

In [251]:
# Lowercase a-z plus the accented letters 'é', 'è', 'í', 'á'; passed as the
# `alphabet` argument of NameFeatureProcessor.process.
augmented_alphabet = 'abcdefghijklmnopqrstuvwxyzéèíá'

In [252]:
# Derive per-name features from 'Character_name' with the project's
# NameFeatureProcessor. NOTE(review): the `*_l` columns in the output look like
# last-letter indicators produced by first_last=True — confirm against
# NameFeatureProcessor.
character_processor = NameFeatureProcessor('Character_name', ngram_range=(2, 2))

name_feature_options = dict(
    alphabet=augmented_alphabet,
    analyze_name=True,
    diacritic=False,
    phonetics=False,
    first_last=True,
    ngram=False,
)
df_ml = character_processor.process(df_ml, **name_feature_options)
df_ml.head()

Unnamed: 0,Character_name,Action & Adventure,Drama,Comedy,Horror & Thriller,Fantasy & Sci-Fi,Historical & War,Romance,Documentary,Music & Performance,...,u_l,v_l,w_l,x_l,y_l,z_l,é_l,è_l,í_l,á_l
0,Akooshay,1,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Melanie,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Williams,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Jericho,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bashira,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [253]:
# Hash character 2- and 3-grams of every name into 100 feature columns.
# HashingVectorizer keeps no fitted state, so this same instance can later
# transform unseen names with an identical feature layout.
vectorizer = HashingVectorizer(analyzer='char', ngram_range=(2, 3), n_features=100)
ngram_features = vectorizer.fit_transform(df_ml['Character_name'])
# Carry df_ml's index so the axis=1 concat pairs each name with its own hashed
# row instead of relying on both frames happening to share a 0..n-1 index.
n_gram_df = pd.DataFrame(ngram_features.toarray(), index=df_ml.index)
df_ml = pd.concat([df_ml, n_gram_df], axis=1)

In [254]:
# Spot-check 10 random rows of the combined frame (no random_state is passed,
# so the rows shown differ from run to run)
df_ml.sample(10)

Unnamed: 0,Character_name,Action & Adventure,Drama,Comedy,Horror & Thriller,Fantasy & Sci-Fi,Historical & War,Romance,Documentary,Music & Performance,...,90,91,92,93,94,95,96,97,98,99
41155,Jeff,1,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136523,Tom,0,1,0,0,0,0,0,0,0,...,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44124,Elizabeth,1,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36071,Jothi,0,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.377964,0.0,0.0,0.0,0.377964,0.0,0.0
49779,Morty,1,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147571,Kennedy,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76233,Johnathan,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.258199,0.0,0.0,0.0,-0.258199,0.0,0.0
45423,Memel,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155481,Paul,1,0,1,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141811,Komiya,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0


## Genre Predictions

In [255]:
# Work on a copy so df_ml stays untouched, and give every column a string
# label (the hashed n-gram columns are integer-labelled).
df_ml_genres = df_ml.copy()
df_ml_genres.columns = df_ml_genres.columns.map(str)

In [256]:
# Count the rows that contain at least one missing value
rows_with_nans = df_ml_genres.isnull().any(axis='columns').sum()
print(rows_with_nans)

0


In [257]:
# Drop any row that still contains a missing value before modelling
df_ml_genres = df_ml_genres.dropna()

In [258]:
# Define feature matrix (X) and target matrix (y)
# The label columns come from genres_list (the class order handed to the
# MultiLabelBinarizer) instead of being typed out twice, so y's column order
# always matches what mlb.inverse_transform expects when decoding predictions.
target_columns = list(genres_list)
X = df_ml_genres.drop(columns=['Character_name'] + target_columns)
y = df_ml_genres[target_columns]

In [277]:
# Split the data: hold out 10% of the rows for testing, with a fixed seed
# NOTE(review): rows are split at random; if the same character name occurs in
# several rows it can end up in both train and test — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [278]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# MultiOutputClassifier fits one copy of the base forest per genre column of
# y_train; class_weight="balanced" is applied within each of those fits.
genre_forest = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
)
clf = MultiOutputClassifier(genre_forest)
clf.fit(X_train, y_train)

In [279]:
# Make predictions: multi-label genre indicators for the held-out test rows
y_pred = clf.predict(X_test)

In [280]:
from sklearn.metrics import classification_report
# Evaluate the model
# zero_division=0 makes the undefined-metric case explicit: scores that would
# divide by zero are reported as 0 (the same value the default "warn" setting
# uses) without emitting the warning.
print(classification_report(y_test, y_pred, target_names=y.columns, zero_division=0))

                     precision    recall  f1-score   support

 Action & Adventure       0.44      0.53      0.48      6457
              Drama       0.66      0.65      0.65     10232
             Comedy       0.45      0.44      0.44      6280
  Horror & Thriller       0.29      0.67      0.40      4009
   Fantasy & Sci-Fi       0.16      0.42      0.23      2010
   Historical & War       0.13      0.48      0.20      1615
            Romance       0.25      0.48      0.33      3512
        Documentary       0.05      0.48      0.08       523
Music & Performance       0.13      0.44      0.21      1234
    Cult & B-Movies       0.15      0.61      0.24      1920
              Other       0.05      0.44      0.10       404

          micro avg       0.29      0.55      0.38     38196
          macro avg       0.25      0.51      0.31     38196
       weighted avg       0.40      0.55      0.44     38196
        samples avg       0.29      0.55      0.36     38196



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [281]:
# Save the model
# The fitted multi-output classifier is pickled as 'model_genres.pkl' in the
# working directory.
with open('model_genres.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [264]:
def feature_creation(name):
    """Build the single-row feature frame for one character name.

    The name goes through the same two steps used for the training data:
    NameFeatureProcessor with the augmented alphabet, then the hashed
    character n-gram columns.

    Args:
        name: The character name to featurize.

    Returns:
        A one-row DataFrame containing the 'Name' column, the processor's
        feature columns and the hashed n-gram columns.
    """
    df_pred = pd.DataFrame([name], columns=['Name'])
    pred_processor = NameFeatureProcessor('Name', ngram_range=(2,2))
    df_pred = pred_processor.process(df_pred, alphabet=augmented_alphabet, analyze_name=True, diacritic=False, phonetics=False, first_last=True, ngram=False)

    # Reuse the notebook's in-memory HashingVectorizer rather than unpickling
    # 'hashing_vectorizer.pkl': that file is never written by this notebook,
    # and this instance is the one that produced the training features, so the
    # hashed columns are guaranteed to have the same layout.
    ngram_name = vectorizer.transform(df_pred['Name'])
    ngram_name_df = pd.DataFrame(ngram_name.toarray(), index=df_pred.index)

    return pd.concat([df_pred, ngram_name_df], axis=1)

# Function for prediction using the trained model
def predict(df):
    """Predict genre indicators for a feature frame built by feature_creation.

    Args:
        df: Feature DataFrame that still contains the 'Name' column.

    Returns:
        The output of clf.predict on the feature columns.
    """
    # Drop 'Name' on a new frame so the caller's DataFrame is left untouched
    # and the function can safely be called again on the same input.
    features = df.drop(columns=['Name'])
    # Ensure column names are strings to match the names the model was fitted on
    features.columns = features.columns.astype(str)
    return clf.predict(features)

# Function to process the name, extract features, and predict genres
def create_and_predict(name):
    """Featurize a name, predict its genres and print the decoded labels.

    The binary prediction is turned back into genre names with
    mlb.inverse_transform; the result is printed and nothing is returned.
    """
    name_features = feature_creation(name)
    binary_prediction = predict(name_features)

    # Decode the 0/1 indicator row(s) into tuples of genre labels and show them
    print(mlb.inverse_transform(binary_prediction))

In [265]:
# Try the end-to-end pipeline on a single example name
create_and_predict('fanny')

[('Drama', 'Comedy', 'Fantasy & Sci-Fi', 'Historical & War', 'Romance', 'Cult & B-Movies')]


See scikit-learn's notes on the security and maintainability limitations of pickled models: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
