In [302]:
import pandas as pd
import json
import networkx as nx
from networkx.algorithms import bipartite
from tqdm import tqdm
import statsmodels.api as sm
import statsmodels.formula.api as smf
pd.options.mode.chained_assignment = None 

In [303]:
# get the clusters of the charcters
clusters = pd.read_csv('classification_result.csv')
clusters

Unnamed: 0,name,movie_id,classification
0,Shlykov,23890098,22
1,Flickerman,31186339,13
2,Everdeen,31186339,13
3,Thresh,31186339,4
4,Crane,31186339,16
...,...,...,...
182484,Eden,6040782,1
182485,Godfrey,6040782,17
182486,Jones,6040782,23
182487,Fullard,6040782,20


In [304]:
# read the charcter metadata dataset
character_metadata_header = [
    'movie_id',
    'Freebase movie ID',
    'Movie release date',
    'Character Name',
    'Actor DOB',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor Name',
    'Actor age at movie release',
    'Freebase character map',
    'Freebase character ID',
    'Freebase actor ID'
]
character_metadata = pd.read_csv(
    'MovieSummaries/character.metadata.tsv',
    delimiter='\t',
    names=character_metadata_header
)
characters = character_metadata[['movie_id','Actor Name','Character Name']]
characters

Unnamed: 0,movie_id,Actor Name,Character Name
0,975900,Wanda De Jesus,Akooshay
1,975900,Natasha Henstridge,Lieutenant Melanie Ballard
2,975900,Ice Cube,Desolation Williams
3,975900,Jason Statham,Sgt Jericho Butler
4,975900,Clea DuVall,Bashira Kincaid
...,...,...,...
450664,913762,Dorothy Elias-Fahn,Elensh
450665,913762,Jonathan Fahn,Hibiki
450666,28308153,David Hemmings,
450667,28308153,Roberta Paterson,


In [305]:
# drop all missing charcter name
characters = characters[characters['Character Name'].notna()]
# get the list of all potential matches of characters for every movie
actors_with_clusters = pd.merge(characters,clusters,on=['movie_id'],how = 'inner')

filter the potential character matches by removing all the charcters that either don't correspond
or if a charcter name includes more than one clustered name
(Since clustered character names are only one word. So for example two charcters having the same family name contain both the same name)

In [306]:
# A filter to see if a character name in cluster(as we are taking only one word when clustering)
# is included in the character Name in that movie 
character_filter = actors_with_clusters.apply(lambda x: str(x['name']) in str(x['Character Name']), axis=1)
## remove all rows with a classified character matching more than one Character
filtered_characters = actors_with_clusters[character_filter].drop_duplicates(subset =['movie_id' ,'name'], keep = False)
## remove all rows with a character matching more than one classified Character
filtered_characters = actors_with_clusters[character_filter].drop_duplicates(subset =['movie_id' ,'Character Name'], keep = False)
filtered_characters

KeyboardInterrupt: 

In [None]:
classifications = filtered_characters.groupby('Actor Name')['classification'].apply(list).reset_index(name = 'classifications')
classifications['max_occurences_character'] = classifications['classifications'].apply( lambda x : x.count(max(x,key=x.count)))
classifications['roles played'] = classifications['classifications'].apply( lambda x : len(x))
classifications = classifications[classifications['roles played']>1].sort_values(by= 'max_occurences_character',ascending= False)
classifications

Unnamed: 0,Actor Name,classifications,max_occurences_character,roles played
10742,Mel Blanc,"[3, 6, 3, 23, 3, 20, 3, 1, 3, 8, 23, 13, 3, 17...",72,157
630,Amitabh Bachchan,"[13, 3, 6, 14, 15, 6, 5, 22, 6, 21, 8, 6, 6, 1...",13,77
9962,Mammootty,"[17, 6, 3, 5, 4, 23, 3, 21, 14, 11, 1, 21, 19,...",13,91
6682,James Stewart,"[7, 13, 20, 2, 8, 18, 22, 22, 3, 12, 3, 7, 9, ...",12,55
14079,Samuel L. Jackson,"[16, 4, 17, 15, 17, 17, 17, 7, 6, 6, 6, 6, 6, ...",12,42
...,...,...,...,...
7212,Jesse Garcia,"[5, 23]",1,2
7214,Jesse James,"[6, 20, 9, 19, 0]",1,5
7218,Jesse Metcalfe,"[20, 18, 23]",1,3
7219,Jesse Moss,"[17, 16, 9, 3]",1,4


In [None]:
## get the dataset with oscar nominees and count the wins and nominations
oscar_nominees = pd.read_csv('oscar_nominees.csv')
oscar_actors = oscar_nominees.groupby('Actor Name').agg({'ceremony': 'count',
                                                         'winner'  : 'sum'}).reset_index()
oscar_actors.sort_values(by = 'winner',ascending= False)

Unnamed: 0,Actor Name,ceremony,winner
498,Katharine Hepburn,12,4
912,Walter Brennan,4,3
638,Meryl Streep,21,3
370,Jack Nicholson,12,3
360,Ingrid Bergman,7,3
...,...,...,...
491,June Squibb,1,0
492,Justin Henry,1,0
493,Karen Black,1,0
495,Kate Hudson,1,0


In [None]:
merge_oscar_character = pd.merge(oscar_actors,classifications,on = 'Actor Name',how = 'right', indicator= True)
merge_oscar_character

Unnamed: 0,Actor Name,ceremony,winner,classifications,max_occurences_character,roles played,_merge
0,Mel Blanc,,,"[3, 6, 3, 23, 3, 20, 3, 1, 3, 8, 23, 13, 3, 17...",72,157,right_only
1,Amitabh Bachchan,,,"[13, 3, 6, 14, 15, 6, 5, 22, 6, 21, 8, 6, 6, 1...",13,77,right_only
2,Mammootty,,,"[17, 6, 3, 5, 4, 23, 3, 21, 14, 11, 1, 21, 19,...",13,91,right_only
3,James Stewart,5.0,1.0,"[7, 13, 20, 2, 8, 18, 22, 22, 3, 12, 3, 7, 9, ...",12,55,both
4,Samuel L. Jackson,1.0,0.0,"[16, 4, 17, 15, 17, 17, 17, 7, 6, 6, 6, 6, 6, ...",12,42,both
...,...,...,...,...,...,...,...
5957,Jesse Garcia,,,"[5, 23]",1,2,right_only
5958,Jesse James,,,"[6, 20, 9, 19, 0]",1,5,right_only
5959,Jesse Metcalfe,,,"[20, 18, 23]",1,3,right_only
5960,Jesse Moss,,,"[17, 16, 9, 3]",1,4,right_only


In [None]:
merge_oscar_character['is_oscar_actor'] =  (merge_oscar_character['_merge']== 'both').astype(int)
oscar_nominees_characters = merge_oscar_character[merge_oscar_character['_merge']== 'both'][['Actor Name','max_occurences_character', 'roles played','is_oscar_actor']]
non_oscar_characters = merge_oscar_character[merge_oscar_character['_merge']== 'right_only'][['Actor Name','max_occurences_character', 'roles played','is_oscar_actor']]
oscar_nominees_characters.columns

Index(['Actor Name', 'max_occurences_character', 'roles played',
       'is_oscar_actor'],
      dtype='object')

In [None]:
comparator = pd.merge(oscar_nominees_characters,non_oscar_characters,on = ['roles played'],suffixes=("_1", "_2"))
comparator

Unnamed: 0,Actor Name_1,max_occurences_character_1,roles played,is_oscar_actor_1,Actor Name_2,max_occurences_character_2,is_oscar_actor_2
0,Samuel L. Jackson,12,42,1,Vincent Price,4,0
1,Robert Duvall,10,42,1,Vincent Price,4,0
2,Ben Kingsley,10,29,1,Ray Liotta,5,0
3,Ben Kingsley,10,29,1,Val Kilmer,5,0
4,Ben Kingsley,10,29,1,Dennis Quaid,5,0
...,...,...,...,...,...,...,...
347857,Jennifer Lawrence,1,2,1,Jermaine Williams,1,0
347858,Jennifer Lawrence,1,2,1,Jerry Jewell,1,0
347859,Jennifer Lawrence,1,2,1,Jerry Reed,1,0
347860,Jennifer Lawrence,1,2,1,Jessalyn Wanlim,1,0


In [None]:
## we use perfect matching here in order to have a maximum 1-on-1 matching between movies with oscar winners 
## and movies without any oscar nomination

# Initialise graph
graph = nx.Graph()
graph.add_nodes_from(comparator['Actor Name_1'], bipartite=0)
graph.add_nodes_from(comparator['Actor Name_2'], bipartite=1)
# add edges between movies with at least one oscar nominee and movies with no oscar nomination
# with weight equal the number of genres in common between the two movies
graph.add_weighted_edges_from(
    [(row['Actor Name_1'], row['Actor Name_2'], 1) for index,row in comparator.iterrows()], weight='weight')
# maximize the weight in order to have the best matching between movies 
# matching them with most genres in common while trying to have maximum cardinality
perfect_matching = nx.max_weight_matching(graph)

In [None]:
matched_actors = pd.DataFrame(perfect_matching)
matched_actors = matched_actors.rename(columns = {0:"Actor Name_2", 1: "Actor Name_1"})
matched_actors = pd.merge(matched_actors,comparator, on = ['Actor Name_1','Actor Name_2'],how ='inner')
print('The number of matched pairs is: {}'.format(len(matched_actors)))


The number of matched pairs is: 646


In [None]:
matched_actors.columns

Index(['Actor Name_2', 'Actor Name_1', 'max_occurences_character_1',
       'roles played', 'is_oscar_actor_1', 'max_occurences_character_2',
       'is_oscar_actor_2'],
      dtype='object')

In [None]:
(matched_actors['max_occurences_character_1']>matched_actors['max_occurences_character_2']).sum()/matched_actors.shape[0]*100

42.26006191950464

In [None]:
## split id1 and id2
matched_actors_1 = matched_actors[matched_actors.filter(regex='_1|roles played').columns]
matched_actors_2 = matched_actors[matched_actors.filter(regex='_2|roles played').columns]
# removing prefix for column names so we can have the same column names for matched_actors_1 and matched_actors_2
matched_actors_1.columns = matched_actors_1.columns.str.rstrip('_1')
matched_actors_2.columns = matched_actors_2.columns.str.rstrip('_2')
# concatenating the two dataframes into one
final_matched_actors = pd.concat([matched_actors_1,matched_actors_2])
final_matched_actors['bla'] = (final_matched_actors['max_occurences_character']-1)/(final_matched_actors['roles played']-1)

In [None]:
# linear regression formula
formula = "bla ~ is_oscar_actor"
df = final_matched_actors[['bla','is_oscar_actor']]
#train the linear regression with our final matched movies dataframe
mod = smf.ols(formula=formula, data= df).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                    bla   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     173.3
Date:                Mon, 19 Dec 2022   Prob (F-statistic):           3.09e-37
Time:                        12:21:51   Log-Likelihood:                 919.26
No. Observations:                1292   AIC:                            -1835.
Df Residuals:                    1290   BIC:                            -1824.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0565      0.005     12.

## Do actors who played a similar role make the movie better?

In [None]:
movies = pd.read_csv('movies_with_rating.csv')
movies

Unnamed: 0,ID,Movie name,Movie languages,Movie countries,Movie genres,Movie release year,averageRating,numVotes,winner,nominated,Movie release era
0,28463795,Brun bitter,Norwegian Language,Norway,"['Crime Fiction', 'Drama']",1988.0,5.7,40,0,0,200.0
1,10408933,Alexander's Ragtime Band,English Language,United States of America,"['Musical', 'Comedy', 'Black-and-white']",1938.0,6.9,2160,0,0,200.0
2,11250635,The Mechanical Monsters,English Language,United States of America,"['Science Fiction', 'Adventure', 'Animation', ...",,7.4,1441,0,0,198.0
3,77856,Mary Poppins,English Language,United States of America,"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",1964.0,7.8,173375,0,4,198.0
4,32456683,Die Fahne von Kriwoj Rog,German Language,German Democratic Republic,[],1967.0,7.7,22,0,0,198.0
...,...,...,...,...,...,...,...,...,...,...,...
10130,26219108,The Leopard Woman,Silent film,United States of America,"['Silent film', 'Romance Film']",,5.7,48,0,0,194.0
10131,54540,Coming to America,English Language,United States of America,"['Romantic comedy', 'Comedy of manners', 'Dram...",1988.0,7.1,208755,1,3,192.0
10132,1673588,The Brother from Another Planet,English Language,United States of America,"['Science Fiction', 'Indie', 'Cult', 'Drama', ...",1984.0,6.8,6181,0,0,192.0
10133,19525452,Pigen og vandpytten,Danish Language,Denmark,"['Family Film', 'Black-and-white']",1958.0,5.9,89,0,0,197.0


In [433]:
movies_with_character = pd.merge(movies,filtered_characters,right_on='movie_id',left_on = 'ID',how = 'inner')
movies_with_character[movies_with_character['ID']== 1881878]

Unnamed: 0,ID,Movie name,Movie languages,Movie countries,Movie genres,Movie release year,averageRating,numVotes,winner,nominated,Movie release era,movie_id,Actor Name,Character Name,name,classification
7540,1881878,Nick of Time,English Language,United States of America,"['Thriller', 'Crime Fiction', 'Psychological t...",1995.0,8.3,3585,1,2,200.0,1881878,Christopher Walken,Mr. Smith,Smith,3
7541,1881878,Nick of Time,English Language,United States of America,"['Thriller', 'Crime Fiction', 'Psychological t...",1995.0,8.3,3585,1,2,200.0,1881878,Johnny Depp,Gene Watson,Watson,3


In [354]:
first_date_classification = movies_with_character.groupby(['Actor Name','classification']).agg({'Movie release year': 'min'}).reset_index().rename(columns={"Movie release year": "first_date_classification"})
first_date_character = movies_with_character.groupby(['Actor Name','Character Name']).agg({'Movie release year': 'min'}).reset_index().rename(columns={"Movie release year": "first_date_character"})
first_dates = pd.merge(first_date_classification,first_date_character,on =['Actor'])
first_date_character


Unnamed: 0,Actor Name,Character Name,first_date_character
0,A. J. Bowen,Garrick Turrell,2010.0
1,A. J. Bowen,Lewis Denton,2007.0
2,A. J. Bowen,Victor Ulman,2009.0
3,AJ Diana,Amos Burroughs,2011.0
4,Aamir Bashir,Inspector Jai Pratap Singh,2008.0
...,...,...,...
8370,Édouard Montoute,Copain boîte de Ludo,2010.0
8371,Şener Şen,Ferman,1010.0
8372,Željko Ivanek,Joseph Cutler,2012.0
8373,Željko Ivanek,Mark Dolson,1984.0


In [371]:
movies_with_classification_first_date =pd.merge(movies_with_character,first_date_classification,on = ['Actor Name','classification'],how = 'left')
movies_with_character_first_date =pd.merge(movies_with_classification_first_date,first_date_character,on = ['Actor Name','Character Name'],how = 'left')
movies_with_character_first_date


Unnamed: 0,ID,Movie name,Movie languages,Movie countries,Movie genres,Movie release year,averageRating,numVotes,winner,nominated,Movie release era,movie_id,Actor Name,Character Name,name,classification,first_date_classification,first_date_character
0,77856,Mary Poppins,English Language,United States of America,"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",1964.0,7.8,173375,0,4,198.0,77856,Dick Van Dyke,Bert,Bert,3,1964.0,1964.0
1,77856,Mary Poppins,English Language,United States of America,"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",1964.0,7.8,173375,0,4,198.0,77856,Reta Shaw,Mrs. Brill,Brill,21,1964.0,1964.0
2,77856,Mary Poppins,English Language,United States of America,"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",1964.0,7.8,173375,0,4,198.0,77856,Ed Wynn,Uncle Albert,Albert,14,1964.0,1964.0
3,77856,Mary Poppins,English Language,United States of America,"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",1964.0,7.8,173375,0,4,198.0,77856,Hermione Baddeley,Ellen,Ellen,6,1964.0,1964.0
4,77856,Mary Poppins,English Language,United States of America,"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",1964.0,7.8,173375,0,4,198.0,77856,Glynis Johns,Winifred Banks,Banks,6,1964.0,1964.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8551,54540,Coming to America,English Language,United States of America,"['Romantic comedy', 'Comedy of manners', 'Dram...",1988.0,7.1,208755,1,3,192.0,54540,James Earl Jones,King Jaffe Joffer,Joffer,14,1988.0,1988.0
8552,54540,Coming to America,English Language,United States of America,"['Romantic comedy', 'Comedy of manners', 'Dram...",1988.0,7.1,208755,1,3,192.0,54540,Madge Sinclair,Queen Aoleon,Aoleon,3,1988.0,1988.0
8553,54540,Coming to America,English Language,United States of America,"['Romantic comedy', 'Comedy of manners', 'Dram...",1988.0,7.1,208755,1,3,192.0,54540,Shari Headley,Lisa McDowell,Lisa,21,1988.0,1988.0
8554,54540,Coming to America,English Language,United States of America,"['Romantic comedy', 'Comedy of manners', 'Dram...",1988.0,7.1,208755,1,3,192.0,54540,Calvin Lockhart,Colonel Izzi,Izzi,16,1988.0,1988.0


In [432]:
movies_with_character_first_date[movies_with_character_first_date['ID']== 1881878 ][['Actor Name','first_date_classification','first_date_character','Movie release year']]


Unnamed: 0,Actor Name,first_date_classification,first_date_character,Movie release year
7540,Christopher Walken,1995.0,1995.0,1995.0
7541,Johnny Depp,1990.0,1995.0,1995.0


In [422]:
played_before_classification = (movies_with_character_first_date['first_date_classification'] < movies_with_character_first_date['Movie release year'])
played_before_character = (movies_with_character_first_date['first_date_character'] < movies_with_character_first_date['Movie release year'])
movies_with_character_first_date['played_before'] = played_before_classification

In [423]:

movies_similar_roles_played = movies_with_character_first_date.groupby(['ID','averageRating','Movie release era','Movie languages','Movie countries','Movie genres']).agg({'played_before': 'sum'}).reset_index()

In [424]:
movies_played_before = movies_similar_roles_played[movies_similar_roles_played['played_before']>0]
movies_not_played_before = movies_similar_roles_played[movies_similar_roles_played['played_before'] == 0]

In [425]:
comparator_played_before = pd.merge(
    movies_played_before,
    movies_not_played_before,
    on=['Movie release era'],
    suffixes=("_1", "_2")
)
comparator_played_before

Unnamed: 0,ID_1,averageRating_1,Movie release era,Movie languages_1,Movie countries_1,Movie genres_1,played_before_1,ID_2,averageRating_2,Movie languages_2,Movie countries_2,Movie genres_2,played_before_2
0,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,77744,5.1,English Language,United States of America,"['Buddy film', 'Absurdism', 'Cult', 'Satire', ...",0
1,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,113549,2.9,English Language,United States of America,"['Crime Fiction', 'Mystery', 'Horror']",0
2,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,129585,8.0,Sioux language,United States of America,"['Adventure', 'Epic Western', 'Costume drama',...",0
3,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,142274,6.2,English Language,United States of America,"['Cult', 'Parody', 'Science Fiction', 'Adventu...",0
4,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,146236,8.6,Hindi Language,United States of America,"['Science Fiction', 'Drama', 'Adventure', 'New...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
134539,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,31734473,5.1,English Language,France,['Drama'],0
134540,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,31935934,7.0,Romanian Language,France,"['Buddy Picture', 'Drama', 'World cinema']",0
134541,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,32799797,8.0,English Language,United States of America,"['Drama', 'Comedy']",0
134542,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,33206310,6.3,English Language,United States of America,['Mystery'],0


In [426]:
## see how many genres are in common between two movies
intersection = lambda list1, list2 :len(list(set(list1).intersection(list2)))
comparator_played_before['genres_matches'] = comparator_played_before[['Movie genres_1','Movie genres_2']].apply(lambda x: intersection(*x) ,axis = 1)
# only keep movie pairs with at least one genre in common
comparator_with_matches = comparator_played_before[comparator_played_before['genres_matches']>0]


In [427]:
comparator_with_matches

Unnamed: 0,ID_1,averageRating_1,Movie release era,Movie languages_1,Movie countries_1,Movie genres_1,played_before_1,ID_2,averageRating_2,Movie languages_2,Movie countries_2,Movie genres_2,played_before_2,genres_matches
0,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,77744,5.1,English Language,United States of America,"['Buddy film', 'Absurdism', 'Cult', 'Satire', ...",0,13
1,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,113549,2.9,English Language,United States of America,"['Crime Fiction', 'Mystery', 'Horror']",0,11
2,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,129585,8.0,Sioux language,United States of America,"['Adventure', 'Epic Western', 'Costume drama',...",0,13
3,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,142274,6.2,English Language,United States of America,"['Cult', 'Parody', 'Science Fiction', 'Adventu...",0,13
4,8695,8.4,191.0,English Language,United Kingdom,"['Drama', 'Comedy']",2,146236,8.6,Hindi Language,United States of America,"['Science Fiction', 'Drama', 'Adventure', 'New...",0,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134539,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,31734473,5.1,English Language,France,['Drama'],0,7
134540,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,31935934,7.0,Romanian Language,France,"['Buddy Picture', 'Drama', 'World cinema']",0,21
134541,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,32799797,8.0,English Language,United States of America,"['Drama', 'Comedy']",0,14
134542,27367770,7.1,196.0,English Language,Ireland,"['Family Drama', 'Indie', 'World cinema', ""Chi...",1,33206310,6.3,English Language,United States of America,['Mystery'],0,8


In [428]:
## we use perfect matching here in order to have a maximum 1-on-1 matching between movies with oscar winners 
## and movies without any oscar nomination

# Initialise graph
graph = nx.Graph()
graph.add_nodes_from(comparator_with_matches['ID_1'], bipartite=0)
graph.add_nodes_from(comparator_with_matches['ID_2'], bipartite=1)
# add edges between movies with at least one oscar nominee and movies with no oscar nomination
# with weight equal the number of genres in common between the two movies
graph.add_weighted_edges_from(
    [(row['ID_1'], row['ID_2'], row['genres_matches']) for index,row in tqdm(comparator_with_matches.iterrows(),total =comparator_with_matches.shape[0])], weight='weight')
# maximize the weight in order to have the best matching between movies 
# matching them with most genres in common while trying to have maximum cardinality
perfect_matching = nx.max_weight_matching(graph)

100%|██████████| 134544/134544 [00:05<00:00, 24141.02it/s]


In [429]:
# we merge all movies pairs and the perfect_matching
matched_movies = pd.DataFrame(perfect_matching)
matched_movies = matched_movies.rename(columns = {0:"ID_2", 1: "ID_1"})
matched_movies = pd.merge(matched_movies,comparator_played_before, on = ['ID_1','ID_2'],how ='inner')
print('The number of matched pairs is: {}'.format(len(matched_movies)))

The number of matched pairs is: 355


In [430]:
## split id1 and id2
matched_movies_1 = matched_movies[matched_movies.filter(regex='_1|Movie release era').columns]
matched_movies_2 = matched_movies[matched_movies.filter(regex='_2|Movie release era').columns]
# removing prefix for column names so we can have the same column names for matched_movies_1 and matched_movies_2
matched_movies_1.columns = matched_movies_1.columns.str.rstrip('_1')
matched_movies_2.columns = matched_movies_2.columns.str.rstrip('_2')
# concatenating the two dataframes into one
final_matched_movies = pd.concat([matched_movies_1,matched_movies_2])
# adding is_after value
final_matched_movies['OscarWinner'] =(final_matched_movies['played_before']>0).astype(int)
final_matched_movies

Unnamed: 0,ID,averageRating,Movie release era,Movie languages,Movie countries,Movie genres,played_before,OscarWinner
0,4108752,5.3,195.0,English Language,United States of America,"['Thriller', 'Crime Fiction', 'Psychological t...",1,1
1,835620,6.7,197.0,English Language,United States of America,"['Romantic comedy', 'Romantic drama', 'Romance...",1,1
2,113464,7.9,192.0,English Language,United States of America,"['Romantic comedy', 'Ensemble Film', 'Indie', ...",2,1
3,1881878,8.3,200.0,English Language,United States of America,"['Thriller', 'Crime Fiction', 'Psychological t...",1,1
4,5646653,5.6,200.0,English Language,United States of America,"['Science Fiction', 'Action', 'Fantasy', 'Adve...",2,1
...,...,...,...,...,...,...,...,...
350,12507175,6.7,196.0,English Language,United States of America,"['Costume drama', 'Period piece', 'History', '...",0,0
351,31114108,5.6,195.0,Japanese Language,United States of America,"['Thriller', 'Glamorized Spy Film', 'Drama', '...",0,0
352,2654186,7.6,197.0,English Language,United States of America,"['Thriller', 'Crime Fiction', 'History', 'Docu...",0,0
353,1957162,7.0,197.0,Silent film,United States of America,"['Silent film', 'Indie', 'Comedy of manners', ...",0,0


In [431]:
# linear regression formula
formula = "averageRating ~ played_before"
df = final_matched_movies[['averageRating','played_before']]
#train the linear regression with our final matched movies dataframe
mod = smf.ols(formula=formula, data= df).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:          averageRating   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.3210
Date:                Mon, 19 Dec 2022   Prob (F-statistic):              0.571
Time:                        18:33:52   Log-Likelihood:                -1045.9
No. Observations:                 710   AIC:                             2096.
Df Residuals:                     708   BIC:                             2105.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         6.4578      0.053    121.266