# 0.Import Libraries

In [314]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import networkx as nx
! pip install nxviz
import nxviz as nv



# 1.Dataset Info Function

* This function will give info about the dataset.
* I will use it in the entire notebook

In [315]:
def create_analysis_dataframe(dataset):
    list=[]
    for column in dataset.columns:
        data_type = dataset.dtypes[column]
        unique_number = dataset[column].unique().size
        null_ratio = dataset[column].isnull().sum()/dataset.shape[0]
        most_frequent_value = dataset[column].value_counts().idxmax()
        least_frequent_value = dataset[column].value_counts().idxmin()
        min_value = dataset[column].min(skipna=True, numeric_only=False)
        max_value = dataset[column].max(skipna=True, numeric_only=False)
        list.append([column,data_type,unique_number,null_ratio,most_frequent_value,least_frequent_value,min_value,max_value])
    dataset_info=pd.DataFrame(list,columns=['Name',
                                            'Data_Type',
                                            'Unique_Size',
                                            'Null_Ratio',
                                            'Most_Frequent_Value',
                                            'Least_Frequent_Value',
                                           'Min_Value',
                                           'Max_Value']).sort_values(by=['Unique_Size','Name'],
                                                                                 ascending=True,
                                                                                ignore_index=True)
    return dataset_info

# 1.Load Dataset

In [330]:
movies = pd.read_csv('/kaggle/input/movie-recommendation-system/movies.csv')
ratings = pd.read_csv('/kaggle/input/movie-recommendation-system/ratings.csv')

In [317]:
create_analysis_dataframe(movies)

Unnamed: 0,Name,Data_Type,Unique_Size,Null_Ratio,Most_Frequent_Value,Least_Frequent_Value,Min_Value,Max_Value
0,genres,object,1639,0.0,Drama,Action|Children|Fantasy|Sci-Fi,(no genres listed),Western
1,title,object,62325,0.0,The Void (2016),Botany Bay (1953),"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",줄탁동시 (2012)
2,movieId,int64,62423,0.0,1,1,1,209171


In [318]:
create_analysis_dataframe(ratings)

Unnamed: 0,Name,Data_Type,Unique_Size,Null_Ratio,Most_Frequent_Value,Least_Frequent_Value,Min_Value,Max_Value
0,rating,float64,10,0.0,4.0,0.5,0.5,5.0
1,movieId,int64,59047,0.0,356.0,153318.0,1.0,209171.0
2,userId,int64,162541,0.0,72315.0,159762.0,1.0,162541.0
3,timestamp,int64,20115267,0.0,825638400.0,1567293000.0,789652009.0,1574328000.0


In [319]:
ratings.shape

(25000095, 4)

In [331]:
del movies

**For graph theory, I only need ratings dataframe, this is why I am deleting movies dataframe.**

**Looking at the results above, the dataframe has no null or infinite values, so no preprocessing is needed.**


# 2.Merging and Structuring Movie Ratings Data
* **The dataset is too big for this project, I don't need 25 million rows so I wanna drop some of them.**
* **I will chose the with least value counts movies and users and I will remove them.**

In [321]:
create_analysis_dataframe(ratings)

Unnamed: 0,Name,Data_Type,Unique_Size,Null_Ratio,Most_Frequent_Value,Least_Frequent_Value,Min_Value,Max_Value
0,rating,float64,10,0.0,4.0,0.5,0.5,5.0
1,movieId,int64,59047,0.0,356.0,153318.0,1.0,209171.0
2,userId,int64,162541,0.0,72315.0,159762.0,1.0,162541.0
3,timestamp,int64,20115267,0.0,825638400.0,1567293000.0,789652009.0,1574328000.0


* **The decreased dataset has almost 2.5 million rows.**

In [335]:
sorted(ratings['movieId'].value_counts())

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [322]:
final_data = ratings.sample(frac=0.03, weights='rating', random_state=1)
sorted(ratings['movieId'].unique())[:1000].index()
# del ratings

# converting timestamp to readible date column 
final_data['date'] = pd.to_datetime(final_data['timestamp'],unit='s')
final_data = final_data.drop('timestamp', axis=1)

# seperating date column 
final_data['year'] = final_data['date'].dt.year
final_data['month'] = final_data['date'].dt.month

# I'm adding a small character to each node type to distinguish between them. 
# This is because both the userId and movieId columns have overlapping numbers (e.g., a user and a movie can both have an ID of 10).
final_data['movieId'] = 'm' + final_data['movieId'].astype('str')
final_data['userId'] = 'u' + final_data['userId'].astype('str')

# first look to processed data
final_data.head()

Unnamed: 0,userId,movieId,rating,date,year,month
10435637,u67708,m2278,4.0,2000-10-27 04:43:46,2000,10
18013724,u116679,m2081,4.0,1999-12-31 17:29:54,1999,12
2847,u13,m81834,3.5,2011-02-17 02:07:49,2011,2
7560292,u49045,m2997,2.0,2005-05-05 19:45:34,2005,5
3664818,u24234,m365,3.0,1998-12-19 01:12:08,1998,12


In [323]:
create_analysis_dataframe(final_data)

Unnamed: 0,Name,Data_Type,Unique_Size,Null_Ratio,Most_Frequent_Value,Least_Frequent_Value,Min_Value,Max_Value
0,rating,float64,10,0.0,4.0,0.5,0.5,5.0
1,month,int32,12,0.0,11,9,1,12
2,year,int32,24,0.0,2000,1998,1996,2019
3,movieId,object,17503,0.0,m318,m206093,m1,m99996
4,userId,object,119904,0.0,u72315,u25276,u100,u99999
5,date,datetime64[ns],492150,0.0,1996-03-01 00:00:00,2009-10-02 05:18:22,1996-01-29 00:00:00,2019-11-21 08:55:21


# 3.Creating Graph

In [324]:
G = nx.Graph()

G.add_nodes_from(final_data['userId'].unique(), bipartite='users')
G.add_nodes_from(final_data['movieId'].unique(), bipartite='movies')

for index, row in final_data.iterrows():
    G.add_edge(row['userId'], row['movieId'],weight=row['rating'],month=row['month'],year=row["year"])

print(f"------------------------- GRAPH VALUES -------------------------------------")
print("Number of nodes :",len(G.nodes()))
print("Number of edges :",len(G.edges()))

------------------------- GRAPH VALUES -------------------------------------
Number of nodes : 137407
Number of edges : 500002


# 4.Bipartite Degree Centrality

In [325]:
user_list = [n for n in G.nodes() if G.nodes[n]['bipartite']=='users']
repo_list = [n for n in G.nodes() if G.nodes[n]['bipartite']=='movies']

In [328]:
sorted(nx.bipartite.degree_centrality(G, nodes=repo_list).items(), key=lambda x:x[1], reverse=True)[:10]

[('u72315', 0.03405130549048734),
 ('m318', 0.017230451027488658),
 ('m296', 0.016054510274886576),
 ('m356', 0.016021150253536162),
 ('m2571', 0.01437816920202829),
 ('m593', 0.014269749132639445),
 ('m260', 0.01340238857752869),
 ('m527', 0.01235154790499066),
 ('m2959', 0.012001267680811317),
 ('m50', 0.01145082732852949)]

In [327]:
sorted(nx.bipartite.degree_centrality(G, nodes=user_list).items(), key=lambda x:x[1], reverse=True)[:10]

[('u72315', 0.03405130549048734),
 ('m318', 0.017230451027488658),
 ('m296', 0.016054510274886576),
 ('m356', 0.016021150253536162),
 ('m2571', 0.01437816920202829),
 ('m593', 0.014269749132639445),
 ('m260', 0.01340238857752869),
 ('m527', 0.01235154790499066),
 ('m2959', 0.012001267680811317),
 ('m50', 0.01145082732852949)]