In [None]:
!pip install stanza
!pip install thefuzz


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os 
import pandas as pd
import numpy as np
import stanza
from thefuzz import process
# Folder prefix used by the cells below for every file they read or write.
data_folder = '/content/sample_data/'



In [None]:
# Read the three tab-separated chunks written by the NLP pipeline. Each file is
# parsed with header=None and its first column (label 0) is discarded.
nlp_chunk_files = (
    "Plot_NLP_Analysis_0_14100.csv",
    "Plot_NLP_Analysis_14101_23201.csv",
    "Plot_NLP_Analysis23202-42302.csv",
)
df_1, df_2, df_3 = [
    pd.read_csv(data_folder + chunk_file, sep='\t', header=None).drop(columns=0)
    for chunk_file in nlp_chunk_files
]

In [None]:
# Stack the three chunks on top of each other under a fresh 0..n-1 index, then
# give the five remaining columns readable names.
cluster_chunks = [df_1, df_2, df_3]
clusters_df = pd.concat(cluster_chunks, ignore_index=True)
clusters_df.columns = (
    'Wikipedia movie ID',
    'Character partial name',
    'Agent Verbs',
    'Patient Verbs',
    'Attributes',
)

In [None]:
# Loading the data of character metadata
# Column names for the character metadata table, in file order.
character_columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie release date', 'Character name',
                     'Actor date of birth', 'Actor gender', 'Actor height (in meters)',
                     'Actor ethnicity (Freebase ID)', 'Actor name', 'Actor age at movie release',
                     'Freebase character/actor map ID', 'Freebase character ID', 'Freebase actor ID']

# header=None + names=...: without header=None pandas uses the first line of the
# file as the header, and renaming the columns afterwards silently throws that
# first record away.
# NOTE(review): assumes the file is the raw corpus TSV, which has no header row;
# if a header line was added locally, use header=0 here instead.
df_characters = pd.read_csv(data_folder + 'character.metadata.tsv', sep='\t',
                            header=None, names=character_columns)

In [None]:
# Loading the data of movie metadata
# Column names for the movie metadata table, in file order.
movie_columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name',
                 'Movie release date', 'Movie box office revenue', 'Movie runtime',
                 'Movie languages (Freebase ID)', 'Movie countries (Freebase ID)',
                 'Movie genres (Freebase ID)']

# header=None + names=...: without header=None pandas uses the first line of the
# file as the header, and renaming the columns afterwards silently throws that
# first record away.
# NOTE(review): assumes the file is the raw corpus TSV, which has no header row;
# if a header line was added locally, use header=0 here instead.
df_movie = pd.read_csv(data_folder + 'movie.metadata.tsv', sep='\t',
                       header=None, names=movie_columns)


In [None]:
# Attach the movie-level metadata to every character row. Only rows whose three
# key columns agree in both tables are kept (inner join).
movie_merge_keys = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie release date']
df_char_movie = df_characters.merge(df_movie, how='inner', on=movie_merge_keys)


In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old index
# instead of keeping it as a column.
df_char_movie = df_char_movie.reset_index(drop = True)

In [None]:
# Build one row per movie holding the list of all its (non-missing) character names.
# The names are collected straight into Python lists. Joining them with ',' and
# splitting the string again would cut every name that itself contains a comma,
# and strip('()') would remove a '(' / ')' belonging to the first / last name.
# sort=False keeps the movies in order of first appearance.
df_char_movie_names = (
    df_char_movie[['Wikipedia movie ID', 'Character name']]
    .dropna()
    .groupby('Wikipedia movie ID', sort=False)['Character name']
    .agg(list)
    .rename('All_names')
    .reset_index()
)


In [None]:
# Give every cluster row the list of candidate full names of its movie.
# An 'All_names' column left over from a previous execution of this cell is
# dropped first, so re-running the cell does not create 'All_names_x' /
# 'All_names_y' columns (which would break the x['All_names'] lookup later on).
clusters_df = (
    clusters_df
    .drop(columns='All_names', errors='ignore')
    .merge(df_char_movie_names, how='inner', on='Wikipedia movie ID')
)
clusters_df.head(3)

Unnamed: 0,Wikipedia movie ID,Character partial name,Agent Verbs,Patient Verbs,Attributes,All_names
0,20663735,Maranchery,0.0,0.0,son,"[M.K. Menon, Bharathan, Nandagopal Marar, Mara..."
1,20663735,Manapally,"['accepts', 'arrives', 'tries']",0.0,case crony post help,"[M.K. Menon, Bharathan, Nandagopal Marar, Mara..."
2,20663735,DYSP,0.0,0.0,crony,"[M.K. Menon, Bharathan, Nandagopal Marar, Mara..."


In [None]:
def matching(col1, col2, threshold=80, not_found='Not found'):
    """Return the best fuzzy match of a partial character name among candidates.

    Parameters
    ----------
    col1 : str
        Partial character name to look up.
    col2 : iterable of str
        Candidate full names to search in.
    threshold : int, optional
        A candidate is accepted only if its score is strictly greater than
        this value (default 80).
    not_found : str, optional
        Value returned when no candidate qualifies (default 'Not found').

    Returns
    -------
    str
        The best-scoring candidate, or ``not_found`` when the query is not a
        non-blank string, there are no candidates, or the best score does not
        exceed ``threshold``.
    """
    # A missing (NaN/None) or blank query cannot match anything.
    if not isinstance(col1, str) or not col1.strip():
        return not_found
    # A missing candidate collection (None/NaN) is treated like an empty one.
    try:
        choices = list(col2)
    except TypeError:
        return not_found
    if not choices:
        return not_found
    # Score the candidates once; extractOne yields (choice, score) or None.
    best = process.extractOne(col1, choices)
    if best is None or best[1] <= threshold:
        return not_found
    return best[0]

def _match_cluster_row(row):
    """Look up one cluster row's partial name among the names of its movie."""
    return matching(row['Character partial name'], row['All_names'])

# Row-wise lookup; rows without a good enough candidate get 'Not found'.
clusters_df['Full_name'] = clusters_df.apply(_match_cluster_row, axis=1)
clusters_df.tail(50)

Unnamed: 0,Wikipedia movie ID,Character partial name,Agent Verbs,Patient Verbs,Attributes,All_names,Full_name
35362,13348400,Falcon,"['heads', 'fears', 'finds', 'home', 'arrives',...",0.0,owner home dog relationship older goodbye band,"[Martin Falcon, Jennie Lee, Daryle]",Martin Falcon
35363,13348400,Doberman,['chases'],0.0,dog,"[Martin Falcon, Jennie Lee, Daryle]",Not found
35364,20402406,Maribor,0.0,0.0,name,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Not found
35365,20402406,Behmen,"['enter', 'tries', 'continues', 'asks']",['wounded'],sword bond actions,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Not found
35366,20402406,Debelzeq,"['returns', 'comes', 'beginning', 'comes', 'th...",0.0,priest swindler altar fear,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Debelzaq
35367,20402406,Kay,"['returns', 'picks', 'able', 'bury', 'tell']",0.0,priest swindler altar sword able,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Kay
35368,20402406,Hagamar,['attempts'],0.0,swindler,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Hagamar
35369,20402406,Stephen,0.0,0.0,swindler,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Not found
35370,20402406,Eckhart,"['watch', 'gets', 'has']",0.0,volunteers,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Eckhardt
35371,20402406,Mila,0.0,0.0,daughter,"[Kay, Behman von Bleibruck, Felson, Debelzaq, ...",Not found


In [None]:
# Save the matched clusters as a tab-separated file in data_folder.
# NOTE(review): header=None appears to act like header=False (no column-name
# row is written) — confirm; the index is still written as the first column.
clusters_df.to_csv(data_folder + "Characters_Matched_DF.csv", sep='\t', header=None)

# TO DO:

- DROP ALL NOT FOUND CHARACTERS
- PUT INFO TOGETHER FOR ALL ROWS CORRESPONDING TO THE SAME ACTOR
- START ANALYSIS FROM HERE:
    - WORK ON GOOGLE COLAB!
    - EMPATH
    - RATIO PATIENT/AGENT VERBS
    - SEPARATE ANALYSIS PER GENRE
    - TRY TO ADD THE MAIN CHARACTER
    

In [None]:
# Keep only the clusters that were matched to a full character name.
# .copy() makes char_df an independent frame, so assigning columns to it later
# does not trigger SettingWithCopyWarning.
char_df = clusters_df.loc[clusters_df['Full_name'] != 'Not found'].copy()

In [None]:
# Number of rows currently in char_df.
len(char_df)

17488

In [None]:
# Number of rows in clusters_df, for comparison with len(char_df).
len(clusters_df)

35412

In [None]:
# Preview the first ten rows of char_df.
char_df.head(10)

Unnamed: 0,Wikipedia movie ID,Character partial name,Agent Verbs,Patient Verbs,Attributes,All_names,Full_name
0,20663735,Maranchery,0.0,0.0,son,"[M.K. Menon, Bharathan, Nandagopal Marar, Mara...",Marancheri Induchoodan
3,20663735,Menon,"['refuses', 'regrets', 'suffers']","['accused', 'judged']",judge jail corruption funeral,"[M.K. Menon, Bharathan, Nandagopal Marar, Mara...",M.K. Menon
5,20663735,Induchoodan,"['loses', 'passes', 'thwarts', 'follow', 'fall...",['framed'],dead years state confrontations friend court,"[M.K. Menon, Bharathan, Nandagopal Marar, Mara...",Marancheri Induchoodan
12,595909,Michael,"['holiday', 'uses', 'disintegrate']",['found'],pastor day faith,"[Michael Chamberlain, Lindy Chamberlain, Magaz...",Michael Chamberlain
13,595909,Lindy,"['returns', 'saw', 'seems', 'insisted']","['charged', 'found']",wife,"[Michael Chamberlain, Lindy Chamberlain, Magaz...",Lindy Chamberlain
15,595909,Chamberlains,['overturned'],0.0,convictions,"[Michael Chamberlain, Lindy Chamberlain, Magaz...",Michael Chamberlain
20,5272176,Stevens,['handed'],0.0,home,"[Kate Crawford, Vaughn Stevens, The First Lady...",Vaughn Stevens
21,5272176,Steven,0.0,0.0,house,"[Kate Crawford, Vaughn Stevens, The First Lady...",Vaughn Stevens
22,1952976,Dahlia,"['stands', 'wants', 'unstable', 'see', 'agrees...","['reached', 'left']",girl unstable,"[Mediator, Dahlia, Mr. Murray, Jeff Platzer, K...",Dahlia
23,1952976,Kyle,"['wants', 'threatens', 'picks', 'picks', 'takes']",0.0,ex-husband,"[Mediator, Dahlia, Mr. Murray, Jeff Platzer, K...",Kyle Williams


In [None]:
categories = ['Agent Verbs', 'Patient Verbs', 'Attributes']


def _join_non_empty(values):
    """Comma-join the non-empty string entries of one character's group."""
    return ','.join(v for v in values if isinstance(v, str) and v)


# '0.0' appears to be the placeholder for "nothing extracted" (see the previews
# above); blank it so it does not end up inside the pooled strings.
for cat in categories:
    char_df.loc[char_df[cat] == '0.0', cat] = ''

# Pool the information of all clusters that resolve to the same character.
# A character is identified by movie AND name: grouping on the name alone would
# mix up same-named characters that belong to different movies.
character_key = ['Wikipedia movie ID', 'Full_name']
for cat in categories:
    # 'Agent Verbs' -> 'All_Agent_Verbs', etc.
    pooled_column = 'All_' + cat.replace(' ', '_')
    # Skipping the blanked entries avoids stray leading/trailing commas.
    char_df[pooled_column] = char_df.groupby(character_key)[cat].transform(_join_non_empty)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [None]:
# Keep the identifying columns and the pooled columns only. Every cluster row of
# a character carries the same pooled values, so after dropping the per-cluster
# columns those rows are exact duplicates; drop_duplicates() keeps one of each.
char_df = char_df[['Wikipedia movie ID', 'Full_name', 'All_Agent_Verbs',
                   'All_Patient_Verbs', 'All_Attributes']].drop_duplicates()

In [None]:
# Display char_df as the cell output.
char_df

Unnamed: 0,Wikipedia movie ID,Full_name,All_Agent_Verbs,All_Patient_Verbs,All_Attributes
0,20663735,Marancheri Induchoodan,",['loses', 'passes', 'thwarts', 'follow', 'fal...",",['framed']","son,dead years state confrontations friend court"
3,20663735,M.K. Menon,"['refuses', 'regrets', 'suffers']","['accused', 'judged']",judge jail corruption funeral
5,20663735,Marancheri Induchoodan,",['loses', 'passes', 'thwarts', 'follow', 'fal...",",['framed']","son,dead years state confrontations friend court"
12,595909,Michael Chamberlain,"['holiday', 'uses', 'disintegrate'],['overturn...","['found'],","pastor day faith,convictions"
13,595909,Lindy Chamberlain,"['returns', 'saw', 'seems', 'insisted']","['charged', 'found']",wife
...,...,...,...,...,...
35405,26261438,Kor,,['overjoyed'],townspeople
35407,26261438,Xenos,"['attempts', 'unsuccessful', 'ends']",,priest attempts lack
35408,26261438,Midget Hup,,,sidekick
35409,26261438,Marlenus,['announces'],,death killer


In [None]:
# Save char_df as a tab-separated file in data_folder.
# NOTE(review): header=None appears to act like header=False (no column-name
# row is written) — confirm; the index is still written as the first column.
char_df.to_csv(data_folder + "Characters_Matched_DF_Clean.csv", sep='\t', header=None)

Everything from here on is exploratory testing code.


In [None]:
# Small sample (first 150 rows) for experimenting with the matching logic.
# Only the original cluster columns are kept: at this point clusters_df already
# carries 'All_names' and 'Full_name', and keeping them would make the merge
# with the per-movie name lists produce 'All_names_x' / 'All_names_y', which
# breaks the later x['All_names'] lookup when the notebook is run top to bottom.
raw_cluster_columns = ['Wikipedia movie ID', 'Character partial name',
                       'Agent Verbs', 'Patient Verbs', 'Attributes']
clusters_df_test = clusters_df[raw_cluster_columns].head(150)

In [None]:
# Python env: pip install thefuzz
# Anaconda env: pip install thefuzz
# -> thefuzz is not yet available on Anaconda (2021-09-18)
# -> you can use the old package: conda install -c conda-forge fuzzywuzzy

#df_char_movie_test = df_char_movie[df_char_movie['Wikipedia movie ID' == 31186339]]#'23890098']]or 'Wikipedia movie ID' =='31186339'or 'Wikipedia movie ID' =='20663735'or 'Wikipedia movie ID' =='2231378']
#df_char_movie_test
# Restrict the character/movie table to the movies present in the test sample.
ids = clusters_df_test['Wikipedia movie ID'].unique()
in_test_sample = df_char_movie['Wikipedia movie ID'].isin(ids)
df_char_movie_test = df_char_movie[in_test_sample]


In [None]:
# One row per movie of the test sample, holding the list of its character names.
# The names are aggregated directly into lists: calling drop_duplicates() on a
# column that already contains lists raises "unhashable type: 'list'", and a
# ','.join / split(',') round trip would cut names that contain a comma.
# sort=False keeps the movies in order of first appearance.
df_char_movie_test2 = (
    df_char_movie_test[['Wikipedia movie ID', 'Character name']]
    .dropna()
    .groupby('Wikipedia movie ID', sort=False)['Character name']
    .agg(list)
    .rename('All_names')
    .reset_index()
)

In [None]:
# Right join: every movie of the name table is kept, and each of its cluster
# rows receives that movie's list of candidate names.
clusters_df_test2 = pd.merge(
    clusters_df_test,
    df_char_movie_test2,
    how='right',
    on='Wikipedia movie ID',
)
clusters_df_test2.head(15)

Unnamed: 0,Wikipedia movie ID,Character partial name,Agent Verbs,Patient Verbs,Attributes,All_names
0,20663735,Maranchery,0.0,0.0,son,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
1,20663735,Manapally,"['accepts', 'arrives', 'tries']",0.0,case crony post help,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
2,20663735,DYSP,0.0,0.0,crony,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
3,20663735,Menon,"['refuses', 'regrets', 'suffers']","['accused', 'judged']",judge jail corruption funeral,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
4,20663735,Moopil,0.0,0.0,son,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
5,20663735,Induchoodan,"['loses', 'passes', 'thwarts', 'follow', 'fall...",['framed'],dead years state confrontations friend court,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
6,20663735,Nambiar,['Ramakrishnan'],0.0,rituals funeral,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
7,20663735,Mooppil,0.0,0.0,daughter minded,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
8,20663735,Kanaka,0.0,0.0,girl,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."
9,20663735,Chandrabhanu,0.0,0.0,truth,"M.K. Menon,Bharathan,Nandagopal Marar,Maranche..."


In [None]:
# NOTE(review): `matching` is already defined in an earlier cell of this
# notebook; this second definition shadows it. Consider keeping a single one.
def matching(col1, col2, threshold=80, not_found='Not found'):
    """Return the best fuzzy match of a partial character name among candidates.

    Parameters
    ----------
    col1 : str
        Partial character name to look up.
    col2 : iterable of str
        Candidate full names to search in.
    threshold : int, optional
        A candidate is accepted only if its score is strictly greater than
        this value (default 80).
    not_found : str, optional
        Value returned when no candidate qualifies (default 'Not found').

    Returns
    -------
    str
        The best-scoring candidate, or ``not_found`` when the query is not a
        non-blank string, there are no candidates, or the best score does not
        exceed ``threshold``.
    """
    # A missing (NaN/None) or blank query cannot match anything.
    if not isinstance(col1, str) or not col1.strip():
        return not_found
    # A missing candidate collection (None/NaN) is treated like an empty one.
    try:
        choices = list(col2)
    except TypeError:
        return not_found
    if not choices:
        return not_found
    # Score the candidates once; extractOne yields (choice, score) or None.
    best = process.extractOne(col1, choices)
    if best is None or best[1] <= threshold:
        return not_found
    return best[0]

def _match_test_row(row):
    """Look up one test row's partial name among the names of its movie."""
    return matching(row['Character partial name'], row['All_names'])

# Row-wise lookup; rows without a good enough candidate get 'Not found'.
clusters_df_test2['Full_name'] = clusters_df_test2.apply(_match_test_row, axis=1)
clusters_df_test2.tail(50)

Unnamed: 0,Wikipedia movie ID,Character partial name,Agent Verbs,Patient Verbs,Attributes,All_names,Full_name
85,447194,Ahtur,['decides'],0.0,high chariot war defeat,"[Haisham, Delilah, Samson, The Saran of Gaza, ...",Not found
86,447194,Samson,"['loses', 'becomes', 'rips', 'asks']","['engaged', 'taken', 'blinded']",hands love passing,"[Haisham, Delilah, Samson, The Saran of Gaza, ...",Samson
87,447194,Dagon,"['collapses', 'lies']",0.0,temple,"[Haisham, Delilah, Samson, The Saran of Gaza, ...",Not found
88,31186339,Primrose,0.0,['chosen'],old,"[Foxface, Katniss Everdeen, Peeta Mellark, Eff...",Primrose Everdeen
89,31186339,Katniss,"['volunteers', 'survives', 'drops', 'has', 'ru...","['gave', 'taken', 'warned', 'presented']",sister love,"[Foxface, Katniss Everdeen, Peeta Mellark, Eff...",Katniss Everdeen
90,31186339,Peeta,"['tribute', 'reveals', 'meant', 'forms', 'begs...",0.0,tribute mobile victorious,"[Foxface, Katniss Everdeen, Peeta Mellark, Eff...",Peeta Mellark
91,31186339,Haymitch,"['able', 'warns']",0.0,victor advice able star rule,"[Foxface, Katniss Everdeen, Peeta Mellark, Eff...",Haymitch Abernathy
92,31186339,Caesar,0.0,0.0,interview TV,"[Foxface, Katniss Everdeen, Peeta Mellark, Eff...",Caesar Flickerman
93,31186339,Rue,"['draws', 'cares', 'draw']",0.0,body,"[Foxface, Katniss Everdeen, Peeta Mellark, Eff...",Rue
94,8388648,Madhav,"['playing', 'friends', 'escapes', 'works', 'se...",0.0,friends neighbours football love love,"[Madhav, Amrutha, Hero friend]",Madhav


In [None]:
# Other tests to use in case we want to improve the code


''' 1- TRYING THE MAP FUNCTION
best_char = lambda x: process.extractOne(x, df_char_movie_test["Character name"])[2]  # See note below
clusters_df_test['Char'] = df_char_movie_test.loc[clusters_df_test["Character partial name"].map(best_char).values, "Character name"].values

2- GROUPBY AND DO LIKE CAMILLE
#res = [process.extractOne(elem, char_movie_31186339) for elem in clusters_df_31186339[0:6]]
dfm = pd.DataFrame(clusters_df_31186339.apply(lambda x: process.extractOne(x, char_movie_31186339))
                               .tolist(), columns=["Character name", "ratio", "best_id"])
# Need to find the correlation between both dataframes : in clusters_df, there are only names and in df_char_movie there are full character names:
# However, process.extractOne is comparing one string with a list of strings : we need to group by movie and compare each element of the group with each element of the other group:
clusters = clusters_df.groupby(['Wikipedia movie ID'])['Character partial name']
char_movies = df_char_movie.groupby(['Wikipedia movie ID'])['Character name']
dfm = pd.DataFrame(clusters.apply(lambda x: [process.extractOne(elem, [group.values for name, group in char_movies]) for elem in x])
                               .tolist(), columns=['Character name', 'ratio', 'best_id']) #score_cutoff = 50?
'''


' 1- TRYING THE MAP FUNCTION\nbest_char = lambda x: process.extractOne(x, df_char_movie_test["Character name"])[2]  # See note below\nclusters_df_test[\'Char\'] = df_char_movie_test.loc[clusters_df_test["Character partial name"].map(best_char).values, "Character name"].values\n\n2- GROUPBY AND DO LIKE CAMILLE\n#res = [process.extractOne(elem, char_movie_31186339) for elem in clusters_df_31186339[0:6]]\ndfm = pd.DataFrame(clusters_df_31186339.apply(lambda x: process.extractOne(x, char_movie_31186339))\n                               .tolist(), columns=["Character name", "ratio", "best_id"])\n# Need to find the correlation between both dataframes : in clusters_df, there are only names and in df_char_movie there are full character names:\n# However, process.extractOne is comparing one string with a list of strings : we need to group by movie and compare each element of the group with each element of the other group:\nclusters = clusters_df.groupby([\'Wikipedia movie ID\'])[\'Character par

In [None]:
# Inspect the candidate-name list stored at index label 127.
# NOTE(review): clusters_df_test3 is not defined in any cell visible in this
# notebook — presumably a leftover from a deleted cell; confirm whether
# clusters_df_test2 was intended, otherwise this fails on a fresh kernel.
clusters_df_test3['All_names'][127]

['Henrietta Lowell',
 'Charlie Chan',
 'Alice Lowell',
 'Dick Williams',
 'Fred Gage',
 'Janice Gage',
 'Baxter',
 'Warren T. Phelps',
 'Ulrich',
 'Carlotta']