In [1]:
#from src.utils.results_utils import *
from src.utils.data_utils import str_dict_to_values
from src.utils.results_utils import *
from src.utils.ml_utils import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import pickle

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\amaur\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\amaur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


## Load and clean the dataset

In [2]:
# Load the labelled name/origin table, then show how many names each
# origin group ('Country' column) contains.
df_ethnicity = pd.read_csv(filepath_or_buffer='data/name_ethnicity.csv')
df_ethnicity.Country.value_counts()

Country
English-Speaking               2812
Germanic                       2359
Romance                        1847
Eastern Slavic                 1749
Southern and Western Slavic    1286
Hispanic                        891
Nordic                          882
East Asian                      673
Name: count, dtype: int64

We will add the most frequent special characters to the alphabet: 'é', 'è', 'á' and 'í'.

In [3]:
# The 26 unaccented lowercase letters followed by the four accented letters
# we keep as extra symbols (30 characters in total).
augmented_alphabet = 'abcdefghijklmnopqrstuvwxyz' + 'éèíá'

In [4]:
# Preview the first five rows of the labelled names.
df_ethnicity.head(n=5)

Unnamed: 0,Name,Country
0,Vaclav,Southern and Western Slavic
1,Allan,Eastern Slavic
2,Kristine,Nordic
3,Matteo,Romance
4,Isao,East Asian


In [5]:
# Build the hand-crafted name features from the 'Name' column.
# Judging by the resulting columns, analyze_name adds name_length /
# vowel_count / consonant_count and first_last adds one indicator per
# alphabet letter for the first (`*_f`) and last (`*_l`) character
# -- TODO confirm against NameFeatureProcessor.
origin_processor = NameFeatureProcessor('Name', ngram_range=(2, 3))

df_ml = origin_processor.process(
    df_ethnicity,
    alphabet=augmented_alphabet,
    analyze_name=True,
    diacritic=False,
    phonetics=False,
    first_last=True,
    ngram=False,  # n-grams are added separately with a HashingVectorizer
)
df_ml.head()

Unnamed: 0,Name,Country,name_length,vowel_count,consonant_count,a_f,b_f,c_f,d_f,e_f,...,u_l,v_l,w_l,x_l,y_l,z_l,é_l,è_l,í_l,á_l
0,Vaclav,Southern and Western Slavic,6,2,4,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,Allan,Eastern Slavic,5,2,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Kristine,Nordic,8,3,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Matteo,Romance,6,3,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Isao,East Asian,4,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Hash character 2- and 3-grams of each name into 1000 numeric columns.
# HashingVectorizer is stateless, so the "fit" part of fit_transform learns
# nothing; transform() on new names later yields the same column layout.
vectorizer = HashingVectorizer(analyzer='char', ngram_range=(2, 3), n_features=1000)
ngram_features = vectorizer.fit_transform(df_ml['Name'])

# Reuse df_ml's index: pd.concat(axis=1) aligns rows on index labels, so a
# fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever df_ml
# is not indexed 0..n-1.
n_gram_df = pd.DataFrame(ngram_features.toarray(), index=df_ml.index)

# NOTE: this reassigns df_ml, so re-running the cell without re-running the
# previous one appends a second set of n-gram columns.
df_ml = pd.concat([df_ml, n_gram_df], axis=1)


In [7]:
# Persist the vectorizer so the prediction code further down can reload the
# exact same n-gram configuration from disk.
with open('hashing_vectorizer_origin.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

## Origin Prediction

In [8]:
# Build the predictor from the full feature table; 'Country' is presumably
# the target column -- TODO confirm against PredictorModel_o.
origin_predictor = PredictorModel_o(df_ml, 'Country')

# Training table: everything except the raw name string.
df_origin = df_ml.drop('Name', axis=1)

In [9]:
# Count the rows that contain at least one missing value.
row_has_nan = df_origin.isna().any(axis=1)
rows_with_nans = row_has_nan.sum()
print(rows_with_nans)

0


We drop any rows containing NaNs (the check above found none, so this is only a safeguard).

In [10]:
# Remove incomplete rows and turn every column label into a string: the
# hashed n-gram columns are labelled with integers while the other feature
# columns use string labels.
df_origin = df_origin.dropna().rename(columns=str)

In [11]:
# Fit the origin classifier on the cleaned feature table. The call prints a
# per-class precision/recall/F1 report (shown below).
# NOTE(review): `balancing=True` presumably rebalances the classes before
# fitting, and the model is presumably saved to 'model_Country.pkl' (loaded
# later in this notebook) -- confirm both in PredictorModel_o.train.
origin_predictor.train(df_origin, balancing=True)

                             precision    recall  f1-score   support

                 East Asian       0.79      0.86      0.82        64
             Eastern Slavic       0.75      0.40      0.52       194
           English-Speaking       0.65      0.45      0.53       315
                   Germanic       0.46      0.55      0.50       224
                   Hispanic       0.55      0.47      0.51        93
                     Nordic       0.47      0.56      0.51        71
                    Romance       0.64      0.63      0.63       178
Southern and Western Slavic       0.34      0.77      0.47       111

                   accuracy                           0.54      1250
                  macro avg       0.58      0.59      0.56      1250
               weighted avg       0.59      0.54      0.54      1250





In [12]:
# Sanity check on a single name: build its features and print the predicted
# origin group.
origin_predictor.create_and_predict('John')

Prediction:  ['Germanic']


### Prediction for the names of the dataset

In [13]:
# Columns of the cleaned movie-character table that are not needed for
# origin prediction.
to_be_dropped = [
    'Wikipedia_ID',
    'Languages',
    'Country',
    'Name',
    'Sex',
    'Actor_age',
    'Release_date',
    'Genre_Category',
]

# Keep only the character names (first 10000 rows) and expose them under
# the 'Name' column that the feature pipeline expects.
df_names = (
    pd.read_csv('data/cleaned.csv')
    .drop(columns=to_be_dropped)
    .head(10000)
    .rename(columns={'Character_name': 'Name'})
)
df_names.head()


Unnamed: 0,Name
0,Akooshay
1,Melanie
2,Williams
3,Jericho
4,Bashira


In [14]:
# Reload the pickled origin classifier from disk.
# NOTE: unpickling runs arbitrary code -- only load files produced locally.
path_origin = 'model_Country.pkl'

with open(path_origin, 'rb') as model_file:
    predict_origin = pickle.load(model_file)

def feature_creation_o(df_pred, vectorizer_path='hashing_vectorizer_origin.pkl'):
    """Build the feature table used by the origin classifier.

    Runs the same NameFeatureProcessor configuration as the training cells,
    then appends the hashed character n-gram columns produced by the pickled
    vectorizer.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Frame with a 'Name' column holding the names to featurise.
    vectorizer_path : str, optional
        Path of the pickled vectorizer; defaults to the file written earlier
        in this notebook.

    Returns
    -------
    pandas.DataFrame
        The processed frame (still containing 'Name') with one extra column
        per hashed n-gram feature, labelled with integers.
    """
    augmented_alphabet = 'abcdefghijklmnopqrstuvwxyzéèíá'
    pred_processor = NameFeatureProcessor('Name', ngram_range=(2, 3))
    df_pred = pred_processor.process(
        df_pred,
        alphabet=augmented_alphabet,
        analyze_name=True,
        diacritic=False,
        phonetics=False,
        first_last=True,
        ngram=False,
    )

    # NOTE: unpickling runs arbitrary code -- only load a file this project
    # wrote itself.
    with open(vectorizer_path, 'rb') as f:
        vectorizer = pickle.load(f)

    ngram_name = vectorizer.transform(df_pred['Name'])
    # Reuse df_pred's index: concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index would misalign rows and create NaNs for any input
    # that is not indexed 0..n-1 (e.g. a filtered frame).
    ngram_name_df = pd.DataFrame(ngram_name.toarray(), index=df_pred.index)
    return pd.concat([df_pred, ngram_name_df], axis=1)

def predict_one_o(df, model):
    """Predict the origin group for every row of a feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that still contains the raw 'Name' column. It is left
        untouched; the column is removed on a copy.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    Whatever ``model.predict`` returns for the feature columns.

    Raises
    ------
    KeyError
        If ``df`` has no 'Name' column.
    """
    # drop() without inplace returns a new frame, so the caller's DataFrame
    # keeps its 'Name' column and its original column labels.
    features = df.drop(columns=['Name'])
    # Hashed n-gram columns carry integer labels; make all labels strings so
    # they are uniform before calling the model.
    features.columns = features.columns.astype(str)
    return model.predict(features)
    
def create_and_predict_origin(df, model):
    """Predict an origin group for every name in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column; extra columns are allowed. The frame
        itself is not modified here.
    model : object
        Fitted estimator passed on to ``predict_one_o``.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'Name' (the original names) and 'Ethnicity' (the
        predicted label), indexed like the feature table.
    """
    # Keep the original names as a Series. Assigning a whole DataFrame to a
    # single column only works when that frame has exactly one column.
    original_names = df['Name'].copy()

    # Work on a copy in case the feature pipeline modifies its input.
    features = feature_creation_o(df.copy())
    predictions = predict_one_o(features, model)

    result = pd.DataFrame(index=features.index)
    result['Name'] = original_names   # aligned on index labels
    result['Ethnicity'] = predictions  # positional, one label per feature row
    return result

In [15]:
# Predict an origin group for every movie-character name and export the
# (Name, Ethnicity) table for later use.
df_prediction = create_and_predict_origin(df=df_names, model=predict_origin)
df_prediction.to_csv(path_or_buf='data/movie_character_ethnicity.csv')