In [1]:
import sys
import os

notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
sys.path.append(parent_dir)

from src.utils.data_utils import str_dict_to_values
from src.utils.results_utils import *
from src.utils.ml_utils import *
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
import pickle

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\amaur\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\amaur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


## Load and clean the dataset

In [2]:
file_name = 'data/sentimental_analysis.csv'

for path in sys.path:
    file_path = os.path.join(path, file_name)

df_sentiment_analysis_textblob = pd.read_csv(file_path)
df_sentiment_analysis_textblob.head()

Unnamed: 0,Character_Name,Wikipedia_id,Polarity,Subjectivity
0,Williams,3217,0.196667,0.45
1,Sheila,3217,0.042172,0.308838
2,Arthur,3217,0.040338,0.295411
3,Ash,3217,0.040338,0.295411
4,Duke,3217,0.04418,0.323545


In [3]:
file_name = 'data/cleaned.csv'

for path in sys.path:
    file_path = os.path.join(path, file_name)

df_ml = pd.read_csv(file_path)
df_ml['Country'] = df_ml['Country'].apply(str_dict_to_values)

In [4]:
df_ml = pd.merge(df_ml, df_sentiment_analysis_textblob, left_on=['Wikipedia_ID', 'Character_name'], right_on=['Wikipedia_id', 'Character_Name'], how='inner')
df_ml['kindness'] = df_ml['Polarity'].apply(good_guy_detector)
df_ml.head()

Unnamed: 0,Wikipedia_ID,Name,Languages,Country,Character_name,Sex,Actor_age,Release_date,Genre_Category,Character_Name,Wikipedia_id,Polarity,Subjectivity,kindness
0,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}",[United States of America],Melanie,F,27.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Melanie,975900,-0.021759,0.169599,Not significant
1,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}",[United States of America],Williams,M,32.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Williams,975900,-0.021759,0.169599,Not significant
2,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}",[United States of America],Jericho,M,33.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Jericho,975900,-0.021759,0.169599,Not significant
3,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}",[United States of America],Helena,F,52.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Helena,975900,-0.096875,0.352014,Not significant
4,975900,Ghosts of Mars,"{""/m/02h40lc"": ""English Language""}",[United States of America],Mars,M,,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Mars,975900,0.041667,0.208333,Not significant


In [5]:
df_ml.drop(columns=['Wikipedia_ID','Wikipedia_id','Name','Languages','Subjectivity','Character_Name','Polarity'],inplace=True)
df_ml.head()

Unnamed: 0,Country,Character_name,Sex,Actor_age,Release_date,Genre_Category,kindness
0,[United States of America],Melanie,F,27.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant
1,[United States of America],Williams,M,32.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant
2,[United States of America],Jericho,M,33.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant
3,[United States of America],Helena,F,52.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant
4,[United States of America],Mars,M,,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant


## Prepare the data for the model with the different features

In [6]:
df_ml['Release_date'] = pd.to_datetime(df_ml['Release_date'], errors='coerce')
df_ml['Year'] = df_ml['Release_date'].dt.year

min_year = df_ml['Year'].min()
max_year = df_ml['Year'].max()

print("Minimum Year:", min_year)
print("Maximum Year:", max_year)

df_ml.head()

Minimum Year: 1896.0
Maximum Year: 2014.0


Unnamed: 0,Country,Character_name,Sex,Actor_age,Release_date,Genre_Category,kindness,Year
0,[United States of America],Melanie,F,27.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,2001.0
1,[United States of America],Williams,M,32.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,2001.0
2,[United States of America],Jericho,M,33.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,2001.0
3,[United States of America],Helena,F,52.0,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,2001.0
4,[United States of America],Mars,M,,2001-08-24,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,2001.0


In [7]:
#Replace add labels for the year of birth of the character

age_bins = [1896, 1935, 1975, 2014]
age_labels = ['1896-1935', '1935-1975', '1975-2014']

df_ml['Year'] = df_ml['Year']-df_ml['Actor_age']
df_ml['birth_category'] = pd.cut(df_ml['Year'], bins=age_bins, labels=age_labels, right=False)
df_ml.drop(columns=['Actor_age', 'Year', 'Release_date'],inplace=True)

df_ml.head()

Unnamed: 0,Country,Character_name,Sex,Genre_Category,kindness,birth_category
0,[United States of America],Melanie,F,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
1,[United States of America],Williams,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
2,[United States of America],Jericho,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
3,[United States of America],Helena,F,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
4,[United States of America],Mars,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,


In [8]:
find_unusual_characters(df_ml,'Character_name')

Number of rows containing special characters: 7
Unusual Characters Found: {'-'}


In [9]:
augmented_alphabet = 'abcdefghijklmnopqrstuvwxyzéèíá'

In [10]:
df_ml = df_ml.dropna(subset=['birth_category'])
df_ml.head()

Unnamed: 0,Country,Character_name,Sex,Genre_Category,kindness,birth_category
0,[United States of America],Melanie,F,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
1,[United States of America],Williams,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
2,[United States of America],Jericho,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
3,[United States of America],Helena,F,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975
6,[United States of America],Rebecca,F,"['Comedy', 'Drama', 'Romance']",Not significant,1935-1975


In [11]:
character_processor = NameFeatureProcessor('Character_name', ngram_range = (2,2))

df_ml = character_processor.process(df_ml,alphabet = augmented_alphabet,analyze_name = True, diacritic = False, phonetics = False, first_last = True, ngram=False)
df_ml.head()

Unnamed: 0,Country,Character_name,Sex,Genre_Category,kindness,birth_category,name_length,vowel_count,consonant_count,a_f,...,u_l,v_l,w_l,x_l,y_l,z_l,é_l,è_l,í_l,á_l
0,[United States of America],Melanie,F,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975,7,4,3,0,...,0,0,0,0,0,0,0,0,0,0
1,[United States of America],Williams,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975,8,3,5,0,...,0,0,0,0,0,0,0,0,0,0
2,[United States of America],Jericho,M,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975,7,3,4,0,...,0,0,0,0,0,0,0,0,0,0
3,[United States of America],Helena,F,"['Action & Adventure', 'Horror & Thriller', 'F...",Not significant,1935-1975,6,3,3,0,...,0,0,0,0,0,0,0,0,0,0
6,[United States of America],Rebecca,F,"['Comedy', 'Drama', 'Romance']",Not significant,1935-1975,7,3,4,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
vectorizer = HashingVectorizer(analyzer='char', ngram_range=(2, 3), n_features=500)  
ngram_features = vectorizer.fit_transform(df_ml['Character_name'])
n_gram_df = pd.DataFrame(ngram_features.toarray())
df_ml = pd.concat([df_ml, n_gram_df], axis=1)


In [13]:
with open('hashing_vectorizer_birth.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

## Birth Year Prediction Model

In [14]:
birth_predictor = PredictorModel(df_ml,'birth_category')

df_age = birth_predictor.clean_df()

KeyError: "['age_category'] not found in axis"

In [201]:
df_age = df_age.dropna()

Train

In [202]:
birth_predictor.train(df_age, balancing=True)

40413    1935-1975
21838    1935-1975
37243    1935-1975
9091     1935-1975
8980     1935-1975
           ...    
30687    1935-1975
7772     1935-1975
1249     1935-1975
22413    1975-2014
33691    1935-1975
Name: birth_category, Length: 25989, dtype: category
Categories (3, object): ['1896-1935' < '1935-1975' < '1975-2014']
              precision    recall  f1-score   support

   1896-1935       0.17      0.28      0.21       419
   1935-1975       0.61      0.41      0.49      1727
   1975-2014       0.30      0.41      0.35       742

    accuracy                           0.39      2888
   macro avg       0.36      0.37      0.35      2888
weighted avg       0.47      0.39      0.41      2888

