In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm # Progress bar

from fuzzywuzzy import process
from nltk.tokenize import word_tokenize

In [2]:
# Load the per-character table (movie id, character name, gender, associated text)
output_characters_genders_context = pd.read_csv(
    'DATA/output_characters_genders_context.csv'
)

In [3]:
# Part-of-speech tags (as returned by nltk.pos_tag) grouped by word class;
# extract_words() uses these lists to sort tokens into verbs, adjectives and nouns.
verb_tags = [
    'VB', 'VBD', 'VBG',
    'VBN', 'VBP', 'VBZ',
]
adj_tags = [
    'JJ', 'JJR', 'JJS',
]
noun_tags = [
    'NN', 'NNS',
    'NNP', 'NNPS',
]

In [22]:
# Rows whose 'associated_words' value is missing (NaN) or an empty string
associated = output_characters_genders_context['associated_words']
is_empty = associated.isna() | associated.eq('')
empty_rows = output_characters_genders_context.loc[is_empty]
count_empty_rows = len(empty_rows)
print(f"Number of rows with empty 'associated words' column: {count_empty_rows}")

Number of rows with empty 'associated words' column: 10513


The initial function was modified so that it keeps the ID column, which is needed for the later merges.

In [4]:
def extract_words(df, id_col, char_name_col, to_extract):
    """Extract verbs, adjectives and nouns from a text column, row by row.

    Every string found in ``df[to_extract]`` is tokenized with
    ``nltk.word_tokenize``, stripped of English stopwords (case-insensitive
    comparison), POS-tagged with ``nltk.pos_tag`` and chunked with
    ``nltk.ne_chunk``. Tagged words are then sorted into verbs, adjectives and
    nouns according to the module-level ``verb_tags``, ``adj_tags`` and
    ``noun_tags`` lists; words with any other tag are ignored.

    Rows whose ``to_extract`` value is not a ``str`` (e.g. NaN) are skipped, so
    the returned lists can be shorter than ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the three columns named below.
    id_col : str
        Column whose value is carried through as the first element of every
        returned tuple.
    char_name_col : str
        Column whose value is carried through as the second element of every
        returned tuple.
    to_extract : str
        Column containing the text to analyse.

    Returns
    -------
    tuple
        ``(verbs_list, adjs_list, nouns_list, chunks_array)``. Each is a list
        of ``(id, name, payload)`` tuples, one per processed row and in the same
        order across all four lists. The payload is the list of verbs,
        adjectives or nouns, or the ``nltk.ne_chunk`` result for that row.
    """
    stop_words = set(stopwords.words('english'))

    # First pass: tokenize, drop stopwords and POS-tag every row that has text.
    # Columns are read directly (instead of df.iterrows()) to avoid building a
    # Series object for every row.
    tagged_tokens = []
    rows = zip(df[id_col], df[char_name_col], df[to_extract])
    for movie_id, associated_w_text, text in tqdm(rows, total=len(df), desc="Processing Movies"):
        if not isinstance(text, str):  # Skip rows without text (NaN)
            continue
        token = [word for word in nltk.word_tokenize(text) if word.lower() not in stop_words]
        tagged_tokens.append((movie_id, associated_w_text, nltk.pos_tag(token)))

    # Second pass: chunk the tagged tokens and split the words by word class.
    verbs_list = []
    adjs_list = []
    nouns_list = []
    chunks_array = []
    for movie_id, associated_w_text, tagged_token in tqdm(tagged_tokens, desc="Processing Tokens", leave=False):
        chunks_array.append((movie_id, associated_w_text, nltk.ne_chunk(tagged_token)))

        verbs = []
        adjs = []
        nouns = []
        for word, pos_tag in tagged_token:
            if pos_tag in verb_tags:
                verbs.append(word)
            elif pos_tag in adj_tags:
                adjs.append(word)
            elif pos_tag in noun_tags:
                nouns.append(word)

        verbs_list.append((movie_id, associated_w_text, verbs))
        adjs_list.append((movie_id, associated_w_text, adjs))
        nouns_list.append((movie_id, associated_w_text, nouns))

    return verbs_list, adjs_list, nouns_list, chunks_array

In [5]:
# Run the extraction on every character's associated text
# (slow: the recorded run took about 18 minutes for 176334 rows)
verbs, adjs, nouns, chunks = extract_words(
    output_characters_genders_context,
    id_col="IMDB_ID",
    char_name_col="character_name",
    to_extract="associated_words",
)

Processing Movies: 100%|██████████| 176334/176334 [18:12<00:00, 161.36it/s] 
                                                                            

In [6]:
# Turn each list of (IMDB_ID, character_name, payload) tuples into a DataFrame
key_columns = ['IMDB_ID', 'character_name']
verbs_df = pd.DataFrame(verbs, columns=key_columns + ['Verbs'])
adjs_df = pd.DataFrame(adjs, columns=key_columns + ['Adjectives'])
nouns_df = pd.DataFrame(nouns, columns=key_columns + ['Nouns'])
chunks_df = pd.DataFrame(chunks, columns=key_columns + ['Chunks'])

In [7]:
# Combine the four frames into one table with the columns
# IMDB_ID, character_name, Verbs, Adjectives, Nouns, Chunks.
# verbs_df, adjs_df, nouns_df and chunks_df are built from the lists returned by a
# single extract_words() call, which appends to all four lists in the same loop
# iteration, so row i of every frame describes the same character. They are
# therefore combined by position: joining on ['IMDB_ID', 'character_name'] instead
# would multiply rows whenever that pair occurs more than once.
final_df = pd.concat(
    [verbs_df, adjs_df['Adjectives'], nouns_df['Nouns'], chunks_df['Chunks']],
    axis=1,
)

In [8]:
# Attach each character's gender from the source table (the left join keeps every
# row of final_df). The resulting frame can be used on its own if merging with
# characters_data and saving to CSV is too expensive.
# A 'gender' column left over from a previous run of this cell is dropped first,
# so re-running the cell does not produce duplicated 'gender_x' / 'gender_y' columns.
final_df = pd.merge(
    final_df.drop(columns='gender', errors='ignore'),
    output_characters_genders_context[['IMDB_ID', 'character_name', 'gender']],
    on=['IMDB_ID', 'character_name'],
    how='left',
)

In [14]:
# Preview the per-character word lists together with the merged gender column
final_df

Unnamed: 0,IMDB_ID,character_name,Verbs,Adjectives,Nouns,Chunks,gender
0,tt0228333,Sgt Jericho Butler,"[walk, wearing, sent, opened, released, posses...","[second, pick, disembodied, possible, Unfortun...","[half, humans, surface, pressure, suits, team,...","[(second, JJ), (half, NN), (22nd, CD), (humans...",M
1,tt0228333,Bashira Kincaid,"[killed, returning, blame, cot, escapes, leaving]",[massacre],"[pick, transport, prisoner, Desolation, Willia...","[(pick, NN), (transport, NN), (prisoner, NN), ...",F
2,tt0228333,Michael Descanso,"[planet, finds, missing, discovered, discovere...","[second, 22nd, 22nd, doorway, ancient, Martian...","[Set, century, film, century, film, Mars, mini...","[[(Set, NNP)], (second, JJ), (22nd, JJ), (cent...",M
3,tt0228333,Big Daddy Mars,"[planet, terraformed]",[],"[century, film, depicts]","[(century, NN), (film, NN), (depicts, NNS), (p...",M
4,tt0228333,Akooshay,"[discovered, created, wiped]","[second, 22nd, ancient, fierce]","[Set, century, film, miners, Martian, miners, ...","[[(Set, NNP)], (second, JJ), (22nd, JJ), (cent...",F
...,...,...,...,...,...,...,...
165816,tt9913288,Trent Osborne,"[come, mail, mail, mail, unmarked]","[piece, unmarked, red]","[day, home, bills, get, bills, mail, stands, s...","[(day, NN), (come, VB), (home, NN), (bills, NN...",M
165817,tt9914522,Mackenzie,"[murdered, regarding, make, regarding]","[new, niece]","[Holden, sister, brother, law, husband, Evan, ...","[[(Holden, NNP)], (sister, NN), (brother, NN),...",F
165818,tt9914522,Evan's dad,"[make, regarding, regarding]",[new],"[Mackenzie, husband, decision, make, niece]","[[(Mackenzie, NNP)], (new, JJ), (husband, NN),...",M
165819,tt9914522,Jade,[make],[new],"[honeymoon, Mackenzie, Evan, decision, make, r...","[(honeymoon, NN), [(Mackenzie, NNP)], (new, JJ...",F


The previous cell yields a DataFrame with the columns `IMDB_ID`, `character_name`, `Verbs`, `Adjectives`, `Nouns`, `Chunks` and `gender`.

In [10]:
# Load the full character table that the extracted word lists are merged into
characters_data = pd.read_csv(
    'DATA/characters_data.csv',
    low_memory=False,
)

In [11]:
# Left join: keep every character of characters_data; characters without
# extracted words get NaN in the columns coming from final_df
to_save_df = characters_data.merge(
    final_df,
    how='left',
    on=['IMDB_ID', 'character_name'],
)

In [12]:
# Preview the character table enriched with the extracted word lists
to_save_df

Unnamed: 0,character_ID,wikipedia_ID,freebase_ID,actor_ethnicity,actor_name,personnas,IMDB_ID,character_name,actor_gender,box_office_revenue,name,release_date,actor_age,Verbs,Adjectives,Nouns,Chunks,gender
0,1,975900.0,/m/03vyhn,,wanda de jesus,,tt0228333,Akooshay,F,,Ghosts of Mars,2001.0,42.0,"[discovered, created, wiped]","[second, 22nd, ancient, fierce]","[Set, century, film, miners, Martian, miners, ...","[[(Set, NNP)], (second, JJ), (22nd, JJ), (cent...",F
1,2,975900.0,/m/03vyhn,/m/044038p,natasha henstridge,,tt0228333,Lieutenant Melanie Ballard,F,14010832.0,Ghosts of Mars,2001.0,27.0,"[terraformed, allowing, become, authority, sen...","[matriarchal, police, second, second, small, s...","[film, depicts, Mars, positions, story, concer...","[(film, NN), (depicts, NNS), (Mars, NNP), (84,...",F
2,3,975900.0,/m/03vyhn,/m/0x67,ice cube,,tt0228333,Desolation Williams,M,14010832.0,Ghosts of Mars,2001.0,32.0,"[wearing, become, named, named, held, mining, ...","[transport, remote, ancient, horrific, team, d...","[pressure, suits, society, prisoner, Williams,...","[(wearing, VBG), (pressure, NN), (suits, NNS),...",M
3,4,975900.0,/m/03vyhn,,jason statham,,tt0228333,Sgt Jericho Butler,M,14010832.0,Ghosts of Mars,2001.0,34.0,"[walk, wearing, sent, opened, released, posses...","[second, pick, disembodied, possible, Unfortun...","[half, humans, surface, pressure, suits, team,...","[(second, JJ), (half, NN), (22nd, CD), (humans...",M
4,5,975900.0,/m/03vyhn,,clea duvall,,tt0228333,Bashira Kincaid,F,,Ghosts of Mars,2001.0,23.0,"[killed, returning, blame, cot, escapes, leaving]",[massacre],"[pick, transport, prisoner, Desolation, Willia...","[(pick, NN), (transport, NN), (prisoner, NN), ...",F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336453,336454,,,,caleb silvers,,tt9914522,Evan,M,,The Holden Family Plan,2019.0,24.0,"[make, regarding]",[new],"[Mackenzie, husband, decision]","[[(Mackenzie, NNP)], (new, JJ), (husband, NN),...",M
336454,336455,,,,bethany hazelitt,,tt9914522,Mackenzie,F,,The Holden Family Plan,2019.0,29.0,"[murdered, regarding, make, regarding]","[new, niece]","[Holden, sister, brother, law, husband, Evan, ...","[[(Holden, NNP)], (sister, NN), (brother, NN),...",F
336455,336456,,,,joshua bootz,,tt9914522,Evan's dad,M,,The Holden Family Plan,2019.0,,"[make, regarding, regarding]",[new],"[Mackenzie, husband, decision, make, niece]","[[(Mackenzie, NNP)], (new, JJ), (husband, NN),...",M
336456,336457,,,,vince camaj,,tt9914522,Todd,M,,The Holden Family Plan,2019.0,,,,,,


In [13]:
# Persist the merged table without the index column.
# NOTE(review): the Verbs/Adjectives/Nouns/Chunks columns hold Python lists and
# chunk trees; CSV stores them as text, so they will presumably need to be
# parsed again after reading the file back — confirm in the consuming notebook.
to_save_df.to_csv('DATA/characters_personas_data.csv', index=False)

### This should work!

### Checking sizes before and after extract_words

In [25]:
# Row counts going into extract_words() and coming out of the merged result,
# shown next to the number of rows that had no associated words
before = output_characters_genders_context.shape[0]
after = final_df.shape[0]
print(
    'before:', before,
    'after:', after,
    'before-after:', before - after,
    'number of empty rows:', count_empty_rows,
)


before: 176334 after: 165821 before-after: 10513 number of empty rows: 10513


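The number of rows lost between the input and the output of `extract_words` equals the number of rows with an empty `associated_words` value, which fully explains the difference in size.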