# Validation Average Rating Comparison

## Data

In [None]:
import pandas as pd
# `rating` holds one row per review: the user id, the unique id of the movie reviewed, the rating and its timestamp
rating = pd.read_csv('rating.csv')

In [None]:
# Preview the first rows of the ratings table
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# `link` translates each movieId into its IMDb id (imdbId), the identifier used in IMDB_movie_details.csv
link = pd.read_csv('link.csv')

In [None]:
# Attach the IMDb id to every rating by joining on movieId, then keep only the columns used downstream
rating_columns = ['userId', 'imdbId', 'rating', 'timestamp']
rating_imdb = rating.merge(link, on='movieId')[rating_columns]

In [None]:
# Load the movie details table; each movie is identified by its IMDb id in the movie_id column
movie = pd.read_csv('IMDB_movie_details.csv')

In [None]:
# Preview the first rows of the movie details table
movie.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,title
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"['Action', 'Thriller']",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Giochi di potere
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,['Comedy'],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Last Vegas
2,tt0243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"['Comedy', 'Romance']",6.7,2002-04-11,,Wet Hot American Summer
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"['Adventure', 'Drama', 'Western']",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,Il tesoro della Sierra Madre
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"['Comedy', 'Drama', 'Romance']",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Election


In [None]:
# Preview the ratings now keyed by IMDb id
rating_imdb.head()

Unnamed: 0,userId,imdbId,rating,timestamp
0,1,113497,3.5,2005-04-02 23:53:47
1,5,113497,3.0,1996-12-25 15:26:09
2,13,113497,3.0,1996-11-27 08:19:02
3,29,113497,3.0,1996-06-23 20:36:14
4,34,113497,3.0,1996-10-28 13:29:44


In [None]:
# Keep only the ratings of movies that belong to our network, i.e. that appear in movie['movie_id'].
# The ids in rating_imdb have no "tt" prefix and no leading zeros, so the movie ids are normalised
# to the same form (digits only, leading zeros stripped) before matching.
movie['movie_id'] = (
    movie['movie_id']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is an invalid escape in a plain literal
    .astype(str)
    .str.lstrip('0')
)

# Filter the ratings down to movies present in the movie table.
# .copy() makes the result an independent frame, so later column assignments do not write into a slice.
filtered_ratings = rating_imdb[rating_imdb['imdbId'].astype(str).isin(movie['movie_id'])].copy()

In [None]:
# Preview the ratings that survived the filter
filtered_ratings.head()

Unnamed: 0,userId,imdbId,rating,timestamp
0,1,113497,3.5,2005-04-02 23:53:47
1,5,113497,3.0,1996-12-25 15:26:09
2,13,113497,3.0,1996-11-27 08:19:02
3,29,113497,3.0,1996-06-23 20:36:14
4,34,113497,3.0,1996-10-28 13:29:44


In [None]:
# Average number of retained ratings per user (count the rows of each user, then average over users)
ratings_per_user = filtered_ratings.groupby("userId").count()
ratings_per_user.mean()

imdbId       75.790412
rating       75.790412
timestamp    75.790412
dtype: float64

In [None]:
# Load the semantic network from its GraphML file
import networkx as nx
G = nx.read_graphml('synopsis_graph_final.graphml')

In [None]:
# Create communities
import community
from community import community_louvain

# Detect communities with the Louvain algorithm, using the 'weight' edge attribute and random_state=42
louvain_partition0 = community_louvain.best_partition(
    G,
    weight='weight',
    random_state=42,
)

# Store each node's community id on the graph under the 'community' node attribute
nx.set_node_attributes(G, values=louvain_partition0, name='community')

In [None]:
# Tabulate the partition: one row per graph node together with the community it was assigned to.
# NOTE: this rebinds the name `community`, hiding the `community` module imported earlier.
partition_rows = list(louvain_partition0.items())
community = pd.DataFrame(partition_rows, columns=['node_idx', 'community'])

In [None]:
# Preview the node -> community table
community.head()

Unnamed: 0,node_idx,community
0,0,0
1,56,1
2,90,3
3,94,3
4,151,1


Create the merged dataset containing, for every rating, the user, the rating, the movie id, the movie title and the movie's community. Only movies that are included in the network are kept.

In [None]:
# Helpers that read a node's attributes from the graph G; both return None for nodes that are not in G

def _node_attribute(node, key):
    """Return attribute `key` of `node` in G, or None when `node` is not a node of G."""
    if node not in G:
        return None
    return G.nodes[node][key]

def get_movie_id(imdbId):
    """Return the 'movie_id' attribute of graph node `imdbId`, or None if the node is absent."""
    return _node_attribute(imdbId, 'movie_id')

def get_titles(imdbId):
    """Return the 'title' attribute of graph node `imdbId`, or None if the node is absent."""
    return _node_attribute(imdbId, 'title')

# Add the title column, then the movie id column, looked up from each row's node index
community['title'] = community['node_idx'].apply(get_titles)
community['imdbId'] = community['node_idx'].apply(get_movie_id)

In [None]:
community.head()
# Each network node now carries its IMDb id and title next to its community

Unnamed: 0,node_idx,community,imdbId,title
0,0,0,tt0105112,Giochi di potere
1,56,1,tt0406375,Zathura - Un'avventura spaziale
2,90,3,tt0324216,Non aprite quella porta
3,94,3,tt0129290,Patch Adams
4,151,1,tt0398808,Un ponte per Terabithia


In [None]:
# Bring the IMDb ids to a common format on both sides of the upcoming merge:
# keep only the digits and strip leading zeros in `community`, and cast the ratings' ids to strings.
community['imdbId'] = (
    community['imdbId']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is an invalid escape in a plain literal
    .astype(str)
    .str.lstrip('0')
)
# assign() builds a new frame instead of writing into a filtered slice (no SettingWithCopyWarning)
filtered_ratings = filtered_ratings.assign(imdbId=filtered_ratings['imdbId'].astype(str))

In [None]:
# Join every retained rating with the community information of the movie it refers to.
# The join key is spelled out so the merge cannot silently pick up another shared column.
merged_data = pd.merge(filtered_ratings, community, on='imdbId')
# Parsed copy of the textual timestamp, stored next to the original column
merged_data['time_stamp'] = pd.to_datetime(merged_data['timestamp'])

In [None]:
# Preview the merged ratings / community table
merged_data.head()

Unnamed: 0,userId,imdbId,rating,timestamp,node_idx,community,title,time_stamp
0,1,113497,3.5,2005-04-02 23:53:47,606,3,Jumanji,2005-04-02 23:53:47
1,5,113497,3.0,1996-12-25 15:26:09,606,3,Jumanji,1996-12-25 15:26:09
2,13,113497,3.0,1996-11-27 08:19:02,606,3,Jumanji,1996-11-27 08:19:02
3,29,113497,3.0,1996-06-23 20:36:14,606,3,Jumanji,1996-06-23 20:36:14
4,34,113497,3.0,1996-10-28 13:29:44,606,3,Jumanji,1996-10-28 13:29:44


## Data Validation by checking averages outside and inside communities

The idea here is to compare the average rating assigned by users to movies inside vs outside a reference community

In [None]:
# One row per user. The averages start as float NaN ("not computed yet") rather than integer 0,
# so the float means stored later fit the column dtype and an unfilled row cannot pass for a real average.
df_users = pd.DataFrame(merged_data['userId'].unique(), columns=['userId'])
df_users["average_in"] = float("nan")
df_users["average_out"] = float("nan")

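For each user, take the first movie (in chronological order) among those the user rated highest and use its community as the reference community. Then compute the average of the user's ratings for the other movies inside that community (excluding the reference movie) and compare it with the average of the user's ratings for the movies in all other communities.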

In [None]:

def _in_out_community_averages(user_ratings):
    """Return (average_in, average_out) for the ratings of a single user.

    The reference movie is the first one, ordered by 'timestamp', among the movies that
    received the user's highest rating. average_in is the mean rating of the user's other
    movies in the reference movie's community; average_out is the mean rating of the user's
    other movies in any different community. Either value is NaN when its group is empty.
    """
    ordered = user_ratings.sort_values(by='timestamp')
    reference = ordered[ordered.rating == ordered.rating.max()].iloc[0]
    others = ordered[ordered.imdbId != reference["imdbId"]]
    in_community = others.community == reference["community"]
    return others.loc[in_community, 'rating'].mean(), others.loc[~in_community, 'rating'].mean()


# One pass over the ratings grouped by user, instead of re-filtering merged_data for every user
averages_by_user = pd.DataFrame.from_dict(
    {
        user_id: _in_out_community_averages(user_ratings)
        for user_id, user_ratings in merged_data.groupby('userId')
    },
    orient='index',
    columns=['average_in', 'average_out'],
)

# Whole-column assignment aligned on userId; this replaces the chained `df[col].loc[i] = ...`
# writes, which raise SettingWithCopyWarning and do nothing under pandas Copy-on-Write.
df_users["average_in"] = df_users["userId"].map(averages_by_user["average_in"])
df_users["average_out"] = df_users["userId"].map(averages_by_user["average_out"])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users["average_in"].loc[i] = df_id_average_in['rating'].mean()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
133486
133487
133488
133489
133490
133491
133492
133493
133494
133495
133496
133497
133498
133499
133500
133501
133502
133503
133504
133505
133506
133507
133508
133509
133510
133511
133512
133513
133514
133515
133516
133517
133518
133519
133520
133521
133522
133523
133524
133525
133526
133527
133528
133529
133530
133531
133532
133533
133534
133535
133536
133537
133538
133539
133540
133541
133542
133543
133544
133545
133546
133547
133548
133549
133550
133551
133552
133553
133554
133555
133556
133557
133558
133559
133560
133561
133562
133563
133564
133565
133566
133567
133568
133569
133570
133571
133572
133573
133574
133575
133576
133577
133578
133579
133580
133581
133582
133583
133584
133585
133586
133587
133588
133589
133590
133591
133592
133593
133594
133595
133596
133597
133598
133599
133600
133601
133602
133603
133604
133605
133606
133607
133608
133609
133610
133611
133612
133613
133614
133615
133616
133617
133618
1336

In [None]:
# Check the DataFrame: one row per user with the in-community and out-of-community averages
df_users.head()

Unnamed: 0,userId,average_in,average_out
0,1,3.90625,3.756944
1,5,4.6,4.272727
2,13,3.571429,3.725
3,29,3.709677,3.62069
4,34,4.181818,3.847826


In [None]:
# Flag the users for whom the assumption holds: a higher average rating inside the reference community
is_better_inside = df_users['average_in'].gt(df_users['average_out'])
df_users["better_community"] = is_better_inside
# Persist the per-user results
df_users.to_csv("df_users_synopsis.csv")

In [None]:
# Share of users flagged True (the mean of a boolean column is the fraction of True values).
# NOTE(review): users with a NaN average compare as False and stay in the denominator here, whereas
# the recommendation validation further down drops such users first; confirm this is intended.
df_users.better_community.mean()

0.5759860202475341

## Validation of algorithm recommendations

The idea here is to compare the average rating assigned by users to the recommended movies vs all other watched movies

In [None]:
# Import the Recommendation algorithm type 1: 
import ast
def recommendation1(G, data, watched_film, top_n=15):
    """Recommend films close to `watched_film` in the graph.

    Candidates are the neighbours of the watched film in `G` that belong to the same
    community. They are ranked by the number of genres shared with the watched film and,
    on ties, by the weight of the connecting edge, both in descending order.

    Parameters
    ----------
    G : graph
        Graph whose node ids are the string form of the index labels of `data` and whose
        edges carry a 'weight' attribute. Only `G.neighbors(node)` and `G[u][v]['weight']`
        are used.
    data : pandas.DataFrame
        Film table with 'title', 'community' and 'genre' columns; 'genre' holds the string
        representation of a list of genres.
    watched_film : str
        Title of the film to start from.
    top_n : int, default 15
        Maximum number of suggestions returned.

    Returns
    -------
    dict
        Maps each suggested title to its list of genres, best match first.

    Raises
    ------
    ValueError
        If `watched_film` is not among the titles in `data`.
    """
    film_rows = data.loc[data['title'] == watched_film]
    if film_rows.empty:
        raise ValueError("The watched film is not in the graph.")

    # When a title occurs more than once, the first occurrence is used
    idx_film = film_rows.index[0]
    idx_community = film_rows['community'].iloc[0]
    watched_film_genres = set(ast.literal_eval(film_rows['genre'].iloc[0]))
    film_node = f'{idx_film}'

    # Neighbours of the watched film that fall in the same community
    neighbors = [int(node) for node in G.neighbors(film_node)]
    same_community_neighbors = [
        neighbor for neighbor in neighbors
        if data.loc[neighbor, 'community'] == idx_community
    ]

    def ranking_criteria(film):
        """Sort key: (number of shared genres, edge weight to the watched film)."""
        film_genres = set(ast.literal_eval(data.loc[film, 'genre']))
        common_genres_count = len(watched_film_genres.intersection(film_genres))
        similarity = G[film_node][f'{film}']['weight']
        return common_genres_count, similarity

    ranked_neighbors = sorted(same_community_neighbors, key=ranking_criteria, reverse=True)

    # Rows are looked up by label throughout (node ids are index labels), so the function also
    # works when `data` does not have a default 0..n-1 index.
    return {
        data.loc[neighbor, 'title']: ast.literal_eval(data.loc[neighbor, 'genre'])
        for neighbor in ranked_neighbors[:top_n]
    }


### Create the dataframe that contains the recommendations for each movie

In [None]:
# Reload the graph from disk and report how many nodes and edges it has
G = nx.read_graphml('synopsis_graph_final.graphml')
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print("Nodes:")
print(node_count)
print("\nEdges:")
print(edge_count)

Nodes:
1572

Edges:
106054


In [None]:
# Recompute the Louvain partition on the reloaded graph (same weight attribute and random_state as before)
louvain_partition0 = community_louvain.best_partition(G, random_state=42, weight='weight')
# Write the community ids back onto the nodes
nx.set_node_attributes(G, values=louvain_partition0, name='community')
# View over the nodes together with their attribute dictionaries
nodes = G.nodes(data=True)

In [None]:
# Load the whole movie dataset again into `df` and preview its first rows
df = pd.read_csv('IMDB_movie_details.csv')
df.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,title
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"['Action', 'Thriller']",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Giochi di potere
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,['Comedy'],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Last Vegas
2,tt0243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"['Comedy', 'Romance']",6.7,2002-04-11,,Wet Hot American Summer
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"['Adventure', 'Drama', 'Western']",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,Il tesoro della Sierra Madre
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"['Comedy', 'Drama', 'Romance']",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Election


In [None]:
# Construct the community column: row number i of df corresponds to the graph node with id str(i)
communities = []
for row_position in range(len(df)):
    communities.append(nodes[str(row_position)]['community'])
df['community'] = communities
# Size of each community
df['community'].value_counts()

community
3    369
0    305
2    265
5    246
1    220
4    167
Name: count, dtype: int64

In [None]:
# Collect the recommended titles for every movie. Movies for which the recommender fails
# get the "NAN" placeholder so they can be filtered out afterwards.
titles = df.title.values
l = []
for film in titles:
    try:
        l.append(recommendation1(G, df, film).keys())  # titles recommended for this film
    except Exception:  # narrower than a bare `except:`, so Ctrl-C and SystemExit still propagate
        l.append("NAN")

Giochi di potere
Last Vegas
Wet Hot American Summer
Il tesoro della Sierra Madre
Election
Hulk
Aliens - Scontro finale
Il diario di Bridget Jones
Star Wars: Episodio II - L'attacco dei cloni
Borat: Studio culturale sull'America a beneficio della gloriosa nazione del Kazakistan
In fuga a quattro zampe
Piccoli campioni
Blood Diamond - Diamanti di sangue
Il mondo non basta
The Descent - Discesa nelle tenebre
Io, robot
Salvate il soldato Ryan
Star Trek - La nemesi
Hunger Games: La ragazza di fuoco
I bambini del cielo
Genitori in trappola
I soliti sospetti
Delicatessen
Mean Girls
A proposito di Henry
Quella casa nel bosco
The Imitation Game
Una vita al massimo
Metropolis
True Detective
Il nemico alle porte
U-Boot 96
L'ultima vacanza
Lezioni di piano
Blue Valentine
Aurora
Le Cronache Di Narnia - Il Principe Caspian
Planet of the Apes - Il pianeta delle scimmie
Ghostbusters
Spring Breakers - Una vacanza da sballo
Westworld - Dove tutto è concesso
Seven Sisters
The Net - Intrappolata nella ret

In [None]:
# Add the recommendations to the dataframe
df["recommended"] = l
# Drop the movies without recommendations; .copy() gives an independent frame so the
# assignment below does not write into a slice of the unfiltered table
df = df[df['recommended'] != "NAN"].copy()
# Store every recommendation set as a plain list of titles
df['recommended'] = df['recommended'].map(list)

In [None]:
# Inspect the recommendation lists
df['recommended'].values

array([list(['Four Brothers - Quattro fratelli', 'Dragon - La storia di Bruce Lee', 'Tra le nuvole', 'Assassini nati - Natural Born Killers', 'American History X', 'Prisoners', 'Mia moglie per finta', 'Shining', 'Fratellastri a 40 anni', 'East Is East', 'La vita è un sogno', 'The Originals']),
       list(['Un weekend da bamboccioni', 'Beautiful Girls', 'Giovani, pazzi e svitati', 'Un disastro di ragazza', 'Finalmente maggiorenni', 'Bad Moms: Mamme molto cattive', 'Proprio lui?', 'Drillbit Taylor', 'Mia moglie per finta', 'Un amore di testimone', 'Come ti rovino le vacanze', 'Gli stagisti', 'Lady Bird', 'Sognando Beckham', 'Prima o poi me lo sposo']),
       list(['50 volte il primo bacio', 'A casa per le vacanze', 'Bella da morire', 'Come farsi lasciare in 10 giorni', 'Una lunga pazza estate', 'Colpo grosso al drago rosso - Rush Hour 2', 'Magic Mike', "Spy Kids 2 - L'isola dei sogni perduti", 'Scooby-Doo', 'Cambia la tua vita con un click', "Zathura - Un'avventura spaziale", 'Il club 

In [None]:
# Readjust the movie id format so it matches the ids in the ratings:
# keep only the digits (drops the leading "tt") and strip leading zeros.
df['movie_id'] = (
    df['movie_id']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is an invalid escape in a plain literal
    .astype(str)
    .str.lstrip('0')
)

In [None]:
# Preview the movie table with its community and recommendation columns
df.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,title,community,recommended
0,105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"['Action', 'Thriller']",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Giochi di potere,0,"[Four Brothers - Quattro fratelli, Dragon - La..."
1,1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,['Comedy'],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Last Vegas,0,"[Un weekend da bamboccioni, Beautiful Girls, G..."
2,243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"['Comedy', 'Romance']",6.7,2002-04-11,,Wet Hot American Summer,1,"[50 volte il primo bacio, A casa per le vacanz..."
3,40897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"['Adventure', 'Drama', 'Western']",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,Il tesoro della Sierra Madre,3,"[Revenant - Redivivo, L'urlo dell'odio, Gli sp..."
4,126886,Tracy Flick is running unopposed for this year...,1h 43min,"['Comedy', 'Drama', 'Romance']",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Election,0,"[10 cose che odio di te, Beautiful Girls, Ti o..."


### Create the dataframe containing users average ratings

In [None]:
# Create the dataframe of users to carry out the validation. The averages start as float NaN
# ("not computed yet") rather than integer 0, so the float means stored later fit the column dtype.
df_users = pd.DataFrame(merged_data['userId'].unique(), columns=['userId'])
df_users["average_in"] = float("nan")
df_users["average_out"] = float("nan")

In [None]:
# Preview the freshly initialised per-user table
df_users.head()

Unnamed: 0,userId,average_in,average_out
0,1,0,0
1,5,0,0
2,13,0,0
3,29,0,0
4,34,0,0


For each user take the first movie that has the highest rating, compare its recommended movies to the remaining movies the user has watched.


In [None]:
# Recommendation list per movie id (first row wins on duplicates), built once instead of
# filtering `df` for every user
recommended_by_movie = (
    df.drop_duplicates(subset='movie_id')
    .set_index('movie_id')['recommended']
    .to_dict()
)


def _recommended_vs_other_averages(user_ratings):
    """Return (average_in, average_out) for the ratings of a single user.

    The reference movie is the first one, ordered by 'timestamp', among the movies that
    received the user's highest rating. average_in is the mean rating of the user's other
    movies whose title is in the reference movie's recommendation list; average_out is the
    mean rating of the user's remaining movies. Either value is NaN when its group is empty,
    and both are NaN when the reference movie has no recommendation list.
    """
    ordered = user_ratings.sort_values(by='timestamp')
    reference = ordered[ordered.rating == ordered.rating.max()].iloc[0]
    recommended = recommended_by_movie.get(reference["imdbId"])
    if recommended is None:
        # Nothing to compare against; report missing values instead of failing with IndexError
        return float("nan"), float("nan")
    others = ordered[ordered.imdbId != reference["imdbId"]]
    is_recommended = others.title.isin(recommended)
    return others.loc[is_recommended, 'rating'].mean(), others.loc[~is_recommended, 'rating'].mean()


# One pass over the ratings grouped by user, instead of re-filtering merged_data for every user
averages_by_user = pd.DataFrame.from_dict(
    {
        user_id: _recommended_vs_other_averages(user_ratings)
        for user_id, user_ratings in merged_data.groupby('userId')
    },
    orient='index',
    columns=['average_in', 'average_out'],
)

# Whole-column assignment aligned on userId; this replaces the chained `df[col].loc[i] = ...`
# writes, which raise SettingWithCopyWarning and do nothing under pandas Copy-on-Write.
df_users["average_in"] = df_users["userId"].map(averages_by_user["average_in"])
df_users["average_out"] = df_users["userId"].map(averages_by_user["average_out"])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users["average_in"].loc[i] = df_id_average_in['rating'].mean()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
133486
133487
133488
133489
133490
133491
133492
133493
133494
133495
133496
133497
133498
133499
133500
133501
133502
133503
133504
133505
133506
133507
133508
133509
133510
133511
133512
133513
133514
133515
133516
133517
133518
133519
133520
133521
133522
133523
133524
133525
133526
133527
133528
133529
133530
133531
133532
133533
133534
133535
133536
133537
133538
133539
133540
133541
133542
133543
133544
133545
133546
133547
133548
133549
133550
133551
133552
133553
133554
133555
133556
133557
133558
133559
133560
133561
133562
133563
133564
133565
133566
133567
133568
133569
133570
133571
133572
133573
133574
133575
133576
133577
133578
133579
133580
133581
133582
133583
133584
133585
133586
133587
133588
133589
133590
133591
133592
133593
133594
133595
133596
133597
133598
133599
133600
133601
133602
133603
133604
133605
133606
133607
133608
133609
133610
133611
133612
133613
133614
133615
133616
133617
133618
1336

In [None]:
# Preview the per-user averages for recommended vs other movies
df_users.head()

Unnamed: 0,userId,average_in,average_out
0,1,4.5,3.758824
1,5,4.5,4.341463
2,13,,3.702128
3,29,4.333333,3.627907
4,34,4.0,3.909091


In [None]:
# Manage missing data
import numpy as np
# True when the user's average rating of the recommended films is higher than that of the other films
df_users["better_recommended"] = (df_users['average_in'] > df_users['average_out'])
# Keep only the users for whom both averages exist (they rated at least one recommended film and
# at least one other film). .copy() makes df_final an independent frame, so columns can be added
# to it later without SettingWithCopyWarning.
df_final = df_users[df_users.average_in.notnull() & df_users.average_out.notnull()].copy()
df_final.head()

Unnamed: 0,userId,average_in,average_out,better_recommended
0,1,4.5,3.758824,True
1,5,4.5,4.341463,True
3,29,4.333333,3.627907,True
4,34,4.0,3.909091,True
6,88,3.0,2.294118,True


In [None]:
# Compare: average per-user gap between the mean rating of recommended films and of the other films.
# Work on a copy so the extra column is not added to df_final through an alias.
df_final_v2 = df_final.copy()
df_final_v2["diff"] = df_final_v2["average_in"] - df_final_v2["average_out"]
df_final_v2["diff"].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_v2["diff"] = df_final_v2["average_in"] - df_final_v2["average_out"]


0.25009685942161053

In [None]:
# Compare in percentage terms: share of users whose recommended films have the higher average rating.
# 0.5 would imply that the recommended films have the same average rating as the other films.
df_final.better_recommended.mean()

0.6797299086455713

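In this case, among the users who rated at least one recommended movie and at least one other movie, the average rating of the recommended movies is higher than the average rating of the other movies in almost 70% of the cases (about 68%).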