### Loading the data

In [None]:
import pandas as pd

# User reviews: one row per (userId, movieId) rating with its timestamp.
rating = pd.read_csv(filepath_or_buffer='rating.csv')

In [None]:
# Preview the first five ratings.
rating.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Lookup table that matches each movieId with its imdbId (and tmdbId).
link = pd.read_csv(filepath_or_buffer='link.csv')

In [None]:
# Preview the first five movieId/imdbId links.
link.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
# Movie details (plot, duration, genre, rating, release date, title) keyed by movie_id.
movie = pd.read_csv(filepath_or_buffer='IMDB_movie_details.csv')

In [None]:
# Preview the first five movie records.
movie.head(n=5)

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,title
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"['Action', 'Thriller']",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Giochi di potere
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,['Comedy'],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Last Vegas
2,tt0243655,"The setting is Camp Firewood, the year 1981. I...",1h 37min,"['Comedy', 'Romance']",6.7,2002-04-11,,Wet Hot American Summer
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"['Adventure', 'Drama', 'Western']",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,Il tesoro della Sierra Madre
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"['Comedy', 'Drama', 'Romance']",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Election


In [None]:
# Attach the imdbId to every review and keep only the columns needed downstream.
rating_imdb = rating.merge(link, on='movieId')[
    ['userId', 'imdbId', 'rating', 'timestamp']
]

In [None]:
# Preview the reviews now keyed by imdbId.
rating_imdb.head(n=5)

Unnamed: 0,userId,imdbId,rating,timestamp
0,1,113497,3.5,2005-04-02 23:53:47
1,5,113497,3.0,1996-12-25 15:26:09
2,13,113497,3.0,1996-11-27 08:19:02
3,29,113497,3.0,1996-06-23 20:36:14
4,34,113497,3.0,1996-10-28 13:29:44


In [None]:
# The movie table stores ids like 'tt0105112' while the ratings table stores plain
# integers (e.g. 105112), so reduce movie_id to its digits without leading zeros.
movie['movie_id'] = (
    movie['movie_id']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is not a valid Python escape
    .astype(str)
    .str.lstrip('0')
)

# Keep only the reviews of movies present in the movie table. The explicit .copy()
# makes filtered_ratings an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
filtered_ratings = rating_imdb[
    rating_imdb['imdbId'].astype(str).isin(movie['movie_id'])
].copy()

In [None]:
# Preview the reviews restricted to movies that appear in the movie dataset.
filtered_ratings.head(n=5)

Unnamed: 0,userId,imdbId,rating,timestamp
0,1,113497,3.5,2005-04-02 23:53:47
1,5,113497,3.0,1996-12-25 15:26:09
2,13,113497,3.0,1996-11-27 08:19:02
3,29,113497,3.0,1996-06-23 20:36:14
4,34,113497,3.0,1996-10-28 13:29:44


### Semantic community analysis

In [None]:
import networkx as nx

# Load the graph stored in synopsis_graph_final.graphml; later cells read a
# 'movie_id' attribute from its nodes and run community detection on it.
G = nx.read_graphml('synopsis_graph_final.graphml')

In [None]:
from community import community_louvain

# Louvain community detection on the synopsis graph, using the 'weight' edge
# attribute; random_state is fixed so repeated runs use the same seed.
louvain_partition0 = community_louvain.best_partition(
    graph=G,
    weight='weight',
    random_state=42,
)

# Record each node's community id on the graph as the 'community' node attribute.
nx.set_node_attributes(G, values=louvain_partition0, name='community')

In [None]:
# One row per graph node: its node index and the community Louvain assigned to it.
community = pd.DataFrame(
    list(louvain_partition0.items()),
    columns=['node_idx', 'community'],
)

In [None]:
# Preview the node-to-community assignments.
community.head(n=5)

Unnamed: 0,node_idx,community
0,0,0
1,56,1
2,90,3
3,94,3
4,151,1


In [None]:
def get_movie_id(imdbId, graph=None):
    """Return the ``movie_id`` attribute stored on a graph node.

    Parameters
    ----------
    imdbId : hashable
        Node identifier to look up. Despite its name, the notebook passes the
        graph's node index (the ``node_idx`` column) here, not an IMDb id.
    graph : graph-like, optional
        Any object supporting ``node in graph`` and ``graph.nodes[node]``.
        Defaults to the notebook-level graph ``G``.

    Returns
    -------
    The node's ``'movie_id'`` attribute, or ``None`` when the node is not in
    the graph.

    Raises
    ------
    KeyError
        If the node exists but carries no ``'movie_id'`` attribute.
    """
    if graph is None:
        graph = G  # fall back to the graph loaded earlier in the notebook
    if imdbId not in graph:
        return None
    return graph.nodes[imdbId]['movie_id']


# Look up the movie id stored on each node; at this point the values are still
# in the 'tt…' form and get normalised in a later cell.
community['imdbId'] = community['node_idx'].map(get_movie_id)

In [None]:
# Preview the communities together with the raw movie ids.
community.head(n=5)

Unnamed: 0,node_idx,community,imdbId
0,0,0,tt0105112
1,56,1,tt0406375
2,90,3,tt0324216
3,94,3,tt0129290
4,151,1,tt0398808


In [None]:
# Reduce ids such as 'tt0105112' to '105112' so they match the imdbId values
# used in the ratings table.
community['imdbId'] = (
    community['imdbId']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is not a valid Python escape
    .astype(str)
    .str.lstrip('0')
)

In [None]:
# Show the community table with the normalised ids (the rendered output is truncated).
community

Unnamed: 0,node_idx,community,imdbId
0,0,0,105112
1,56,1,406375
2,90,3,324216
3,94,3,129290
4,151,1,398808
...,...,...,...
1567,1277,5,1389072
1568,937,0,115798
1569,1186,4,1568346
1570,569,0,137523


In [None]:
# Show the reviews restricted to movies present in the movie table (the rendered output is truncated).
filtered_ratings

Unnamed: 0,userId,imdbId,rating,timestamp
0,1,113497,3.5,2005-04-02 23:53:47
1,5,113497,3.0,1996-12-25 15:26:09
2,13,113497,3.0,1996-11-27 08:19:02
3,29,113497,3.0,1996-06-23 20:36:14
4,34,113497,3.0,1996-10-28 13:29:44
...,...,...,...,...
19993640,64572,1280558,3.5,2014-05-09 20:55:11
19993641,68606,1280558,4.0,2015-02-27 16:52:47
19993642,71975,1280558,3.5,2015-03-26 02:34:20
19993643,115229,1280558,4.5,2014-06-18 01:56:52


In [None]:
# Cast imdbId to str so it can be merged with the string ids of the community tables.
# Rebinding to a new frame (instead of assigning a column on a filtered slice)
# avoids pandas' SettingWithCopyWarning.
filtered_ratings = filtered_ratings.astype({'imdbId': str})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ratings.imdbId = filtered_ratings.imdbId.astype(str)


In [None]:
# Attach each rated movie's semantic community to its ratings. The join key is
# spelled out so the merge cannot silently start using other shared column names.
merged_data = pd.merge(filtered_ratings, community, on='imdbId')

In [None]:
# For every (user, community) pair, summarise the ratings that user gave to the
# movies of that community: standard deviation, number of ratings and mean.
user_community_ratings = (
    merged_data
    .groupby(['userId', 'community'])['rating']
    .agg(['std', 'count', 'mean'])
    .reset_index()
)

In [None]:
# Overall standard deviation of all ratings, used as the baseline for comparison.
merged_data['rating'].std()

1.030182738816651

In [None]:
# Preview the ratings joined with their semantic community.
merged_data.head(n=5)

Unnamed: 0,userId,imdbId,rating,timestamp,node_idx,community
0,1,113497,3.5,2005-04-02 23:53:47,606,3
1,5,113497,3.0,1996-12-25 15:26:09,606,3
2,13,113497,3.0,1996-11-27 08:19:02,606,3
3,29,113497,3.0,1996-06-23 20:36:14,606,3
4,34,113497,3.0,1996-10-28 13:29:44,606,3


In [None]:
# Preview the per-user, per-community rating statistics.
user_community_ratings.head(n=5)

Unnamed: 0,userId,community,std,count,mean
0,1,0,0.273861,5,3.8
1,1,1,0.288675,3,3.666667
2,1,2,0.415349,19,3.815789
3,1,3,0.342628,28,3.696429
4,1,4,0.572019,17,3.970588


In [None]:
# Mean, over (user, community) pairs with more than 50 ratings, of the standard
# deviation of that user's ratings within the community (semantic communities).
user_community_ratings.loc[user_community_ratings['count'] > 50, 'std'].mean()

0.8890600350725967

Considering only users with more than 50 rated movies in a community, the mean standard deviation of a user's ratings for movies within the same community (our semantic communities) is 0.89, compared with an overall rating standard deviation of 1.03.

### Genre community analysis

In [None]:
# Load the genre graph and run the same Louvain community detection on it.
G_genre = nx.read_graphml('genre_graph.graphml')

louvain_partition1 = community_louvain.best_partition(G_genre, weight='weight', random_state=42)

# Record each node's community id on the graph as the 'community' node attribute.
nx.set_node_attributes(G_genre, louvain_partition1, 'community')

# Show only a handful of nodes as a sanity check; displaying the whole node view
# floods the notebook output.
list(G_genre.nodes(data=True))[:5]

NodeDataView({'tt0105112': {'genre': "['Thriller', 'Action']", 'community': 0}, 'tt1204975': {'genre': "['Comedy']", 'community': 1}, 'tt0243655': {'genre': "['Comedy', 'Romance']", 'community': 1}, 'tt0040897': {'genre': "['Adventure', 'Western', 'Drama']", 'community': 2}, 'tt0126886': {'genre': "['Drama', 'Comedy', 'Romance']", 'community': 1}, 'tt0286716': {'genre': "['Action', 'Sci-Fi']", 'community': 0}, 'tt0090605': {'genre': "['Adventure', 'Action', 'Sci-Fi']", 'community': 0}, 'tt0243155': {'genre': "['Drama', 'Comedy', 'Romance']", 'community': 1}, 'tt0121765': {'genre': "['Adventure', 'Action', 'Fantasy']", 'community': 0}, 'tt0443453': {'genre': "['Comedy']", 'community': 1}, 'tt0107131': {'genre': "['Adventure', 'Comedy', 'Drama']", 'community': 1}, 'tt0110364': {'genre': "['Sport', 'Comedy', 'Family']", 'community': 1}, 'tt0450259': {'genre': "['Adventure', 'Thriller', 'Drama']", 'community': 2}, 'tt0143145': {'genre': "['Adventure', 'Action', 'Thriller']", 'community': 0

In [None]:
# One row per genre-graph node: its node id and the community Louvain assigned to it.
c1 = pd.DataFrame(
    list(louvain_partition1.items()),
    columns=['node_idx', 'community'],
)


In [None]:
# The genre graph's node ids are of the form 'tt0105112'; reduce them to
# '105112' so they match the imdbId values used in the ratings table.
c1['node_idx'] = (
    c1['node_idx']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # raw string: '\d' is not a valid Python escape
    .astype(str)
    .str.lstrip('0')
)

In [None]:
# Attach each rated movie's genre-based community to its ratings.
merged_data_1 = filtered_ratings.merge(
    c1,
    left_on='imdbId',
    right_on='node_idx',
)

In [None]:
# Preview the ratings joined with their genre-based community.
merged_data_1.head(n=5)

Unnamed: 0,userId,imdbId,rating,timestamp,node_idx,community
0,1,113497,3.5,2005-04-02 23:53:47,113497,0
1,5,113497,3.0,1996-12-25 15:26:09,113497,0
2,13,113497,3.0,1996-11-27 08:19:02,113497,0
3,29,113497,3.0,1996-06-23 20:36:14,113497,0
4,34,113497,3.0,1996-10-28 13:29:44,113497,0


In [None]:
# Same per-user summary as for the semantic communities, now grouped by
# genre-based community: standard deviation, number of ratings and mean.
user_community_ratings_1 = (
    merged_data_1
    .groupby(['userId', 'community'])['rating']
    .agg(['std', 'count', 'mean'])
    .reset_index()
)

In [None]:
# Keep only the (user, community) pairs backed by more than 50 ratings.
user_community_ratings_1.loc[user_community_ratings_1['count'] > 50]

Unnamed: 0,userId,community,std,count,mean
30,11,0,0.773624,123,4.422764
31,11,1,0.762837,60,4.166667
32,11,2,0.896168,68,4.367647
40,14,1,1.002230,56,3.633929
62,21,2,0.684254,54,4.148148
...,...,...,...,...,...
410725,138474,2,0.924832,82,4.378049
410734,138477,2,1.042734,61,3.934426
410780,138493,0,0.828712,63,4.103175
410781,138493,1,0.754560,55,4.190909


In [None]:
# Mean, over (user, community) pairs with more than 50 ratings, of the standard
# deviation of that user's ratings within the community (genre-based communities).
user_community_ratings_1.loc[user_community_ratings_1['count'] > 50, 'std'].mean()

0.8756182941518473

Considering only users with more than 50 rated films in a community, the mean standard deviation of a user's ratings for films within the same community (genre-based communities) is 0.88.