In [96]:
# https://keras.io/examples/graph/node2vec_movielens/
import findspark
# findspark.init() must run before any pyspark import so that pyspark is importable.
findspark.init()

import itertools
import math
import time
from collections import defaultdict

import community as community_louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pyarrow as pa
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StructType
from pyspark.sql.window import *

# Spark settings applied when the context is first created.
conf = SparkConf().setAll([
    ("spark.executor.memory", "24g"),
    ("spark.driver.memory", "8g"),
    ("spark.driver.cores", "8"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
])

# Reuse a running context/session if the kernel already has one.
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.getOrCreate()

In [2]:
# Parse each tab-separated line into a (user, item, rating) tuple.
movielens = (
    sc.textFile('../data/ml-100k/u.data')
    .map(lambda line: line.split('\t'))
    .map(lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
)
movielens.first()

(196, 242, 3.0)

In [66]:
# Keep only ratings whose score (third tuple field) is at least min_rating.
min_rating= 5
rated_movie = movielens.filter(lambda x: x[2] >= min_rating)
# Number of ratings that survive the filter.
rated_movie.count()

21201

In [90]:
# Map each movie id (second tuple field) to how many retained ratings it has.
# Despite the name, the keys are movie ids; the values count the ratings per movie.
user_cnt = (
    rated_movie
    .map(lambda rating: (rating[1], 1))
    .reduceByKey(lambda left, right: left + right)
    .collectAsMap()
)

In [138]:
# Parameters for the edge weighting below.
# min_weight: edges whose computed weight is below this value are filtered out.
min_weight = 5.5
# D: total number of retained ratings; compute_edge_weight adds log(D) to every weight.
D = rated_movie.count()

def compute_edge(x):
    """Emit one ``((id_a, id_b), 1)`` record per unordered pair of entries in ``x``.

    ``x`` is a sequence whose elements are indexable with the id at position 0.
    Each pair key is the two ids sorted ascending, so (a, b) and (b, a) collapse
    onto the same key when the records are later reduced.
    """
    return [
        (tuple(sorted((left[0], right[0]))), 1)
        for left, right in itertools.combinations(x, 2)
    ]

def compute_edge_weight(x, counts=None, total=None):
    """Turn a ``(pair, co_count)`` record into ``(pair, weight)``.

    The weight is ``log(co_count) - log(counts[a]) - log(counts[b]) + log(total)``
    where ``pair == (a, b)``.

    Parameters
    ----------
    x : tuple
        ``((a, b), co_count)`` with ``co_count > 0``.
    counts : mapping, optional
        Per-id counts; defaults to the notebook-level ``user_cnt``.
    total : number, optional
        Normalising total; defaults to the notebook-level ``D``.

    Returns
    -------
    tuple
        ``((a, b), weight)``.

    Raises
    ------
    KeyError
        If either id of the pair is missing from ``counts``.
    """
    # Fall back to the notebook globals so existing single-argument calls
    # (e.g. inside ``rdd.map``) behave exactly as before.
    counts = user_cnt if counts is None else counts
    total = D if total is None else total

    pair = x[0]
    log_joint = math.log(x[1])
    log_first = math.log(counts[pair[0]])
    log_second = math.log(counts[pair[1]])

    # Same evaluation order as the original expression.
    weight = log_joint - log_first - log_second + math.log(total)
    return (pair, weight)
    

# Stage 1: gather, per user, the list of (movie, rating) entries they rated.
movies_per_user = (
    rated_movie
    .map(lambda rating: (rating[0], [(rating[1], rating[2])]))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda user_movies: user_movies[1])
)

# Stage 2: count how many users co-rated each unordered movie pair.
pair_counts = (
    movies_per_user
    .flatMap(lambda entries: compute_edge(entries))
    .reduceByKey(lambda left, right: left + right)
)

# Stage 3: weight each pair, drop weak edges, flatten to (movie_a, movie_b, weight).
edges = (
    pair_counts
    .map(lambda record: compute_edge_weight(record))
    .filter(lambda edge: edge[1] >= min_weight)
    .map(lambda edge: (*edge[0], edge[1]))
    .collect()
)

len(edges)

39641

In [139]:
# Build the weighted movie graph from `edges` and detect communities.
# Everything this cell needs is defined here or in an earlier cell, so it
# works on a fresh kernel (Restart & Run All).

# Seed for the randomized Louvain algorithm so reruns give the same partition.
LOUVAIN_SEED = 42

# Every movie that has at least one retained rating becomes a node, even if
# all of its edges were filtered out.
nodes = rated_movie.map(lambda x: x[1]).distinct().collect()

g = nx.Graph()
g.add_nodes_from(nodes)
g.add_weighted_edges_from(edges)
print("Total number of graph nodes:", g.number_of_nodes())
print("Total number of graph edges:", g.number_of_edges())

degrees = [degree for _, degree in g.degree()]
# Guard against an empty graph to avoid a ZeroDivisionError.
average_degree = round(sum(degrees) / len(degrees), 2) if degrees else 0.0
print("Average node degree:", average_degree)

partitions = community_louvain.best_partition(g, random_state=LOUVAIN_SEED)
values = list(partitions.values())
print('Number of communities:', len(np.unique(values)))

Total number of graph nodes: 1172
Total number of graph edges: 39641
Average node degree: 67.65
Number of communities: 51


In [153]:
# Column layout of u.item: five metadata columns followed by one 0/1 flag
# column per genre, starting at "unknown".
column_arr = [
    "movie id", "movie title", "release date", "video release date", "IMDb URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# NOTE(review): the stock MovieLens 100K u.item is not UTF-8; if this read raises
# UnicodeDecodeError, pass encoding='latin-1' — confirm against the local file.
item_df = pd.read_csv('../data/ml-100k/u.item', delimiter = '|', names =column_arr)
item_data = item_df.to_numpy()

# Only the genre flag columns may contribute genre names. Scanning every column
# would label movie id 1 with "movie id", because its id value equals 1.
first_genre_idx = column_arr.index("unknown")
genre_names = column_arr[first_genre_idx:]

# movie id -> list of genre names whose flag is 1.
item_dict = defaultdict(list)
for row in item_data:
    item_dict[row[0]] = [
        genre
        for genre, flag in zip(genre_names, row[first_genre_idx:])
        if flag == 1
    ]
    

In [154]:
# Group the genre lists of the movies by the community each movie was assigned to.
res_dict = defaultdict(list)
for movie_id, community_id in partitions.items():
    res_dict[community_id].append(item_dict[movie_id])


In [161]:
# Show the genre lists of every movie assigned to community 5.
# NOTE(review): this displays the whole list; consider slicing if the output gets long.
res_dict[5]

[['Drama'],
 ['Comedy'],
 ['Comedy', 'Thriller'],
 ['Comedy', 'Drama'],
 ['Drama'],
 ['Drama', 'War'],
 ['Drama', 'Romance'],
 ['Adventure', 'War'],
 ['Film-Noir', 'Mystery'],
 ['Comedy'],
 ['Crime', 'Thriller'],
 ['Drama'],
 ['Drama'],
 ['Comedy', 'Romance'],
 ['Comedy', 'Crime'],
 ['Comedy', 'Romance'],
 ['Drama', 'War'],
 ['Drama'],
 ["Children's", 'Comedy', 'Drama'],
 ['Comedy', 'Sci-Fi'],
 ['Documentary'],
 ['Action', 'Drama', 'Thriller'],
 ['Film-Noir'],
 ['Horror', 'Sci-Fi', 'Thriller'],
 ['Comedy'],
 ['Comedy'],
 ['Comedy', 'Romance'],
 ['Crime', 'Film-Noir', 'Mystery', 'Thriller'],
 ['Comedy', 'Crime'],
 ['Action', 'Drama', 'Western'],
 ['Comedy'],
 ['Drama', 'Romance'],
 ['Animation'],
 ['Drama', 'War'],
 ['Crime', 'Thriller'],
 ['Drama', 'Romance'],
 ['Adventure'],
 ['Comedy', 'Romance', 'Thriller'],
 ['Action', 'Drama', 'War'],
 ['Drama'],
 ['Action', 'Adventure', 'Drama'],
 ['Drama'],
 ['Drama'],
 ['Drama', 'Romance'],
 ['Drama'],
 ['Musical'],
 ['Drama'],
 ['Drama'],
 ['D

In [106]:
def compute_edges(x):
    """Emit one ``((id_a, id_b), value_a + value_b)`` record per unordered pair in ``x``.

    ``x`` is a sequence whose elements hold an id at position 0 and a numeric
    value at position 1. The pair key is the two ids sorted ascending; the
    record value is the sum of the two entries' values.
    """
    return [
        (tuple(sorted((first[0], second[0]))), first[1] + second[1])
        for first, second in itertools.combinations(x, 2)
    ]

# Per user, collect the (movie, rating) entries they rated.
rated_per_user = (
    rated_movie
    .map(lambda rating: (rating[0], [(rating[1], rating[2])]))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda user_movies: user_movies[1])
)

# Sum the pairwise rating totals over all users, then flatten each record
# to (movie_a, movie_b, weight) for networkx.
edges = (
    rated_per_user
    .flatMap(lambda entries: compute_edges(entries))
    .reduceByKey(lambda left, right: (left + right))
    .map(lambda record: (record[0][0], record[0][1], record[1]))
    .collect()
)


In [78]:
# Distinct movie ids (second tuple field) among the retained ratings.
nodes = (
    rated_movie
    .map(lambda rating: rating[1])
    .distinct()
    .collect()
)


In [79]:
# Undirected graph: one node per movie id, one weighted edge per (a, b, weight) triple.
g= nx.Graph()
g.add_nodes_from(nodes)
g.add_weighted_edges_from(edges)
# Drawing is left disabled.
# nx.draw(g)

In [80]:
# Report the size of the graph built above.
print("Total number of graph nodes:", g.number_of_nodes())
print("Total number of graph edges:", g.number_of_edges())

Total number of graph nodes: 1172
Total number of graph edges: 174005


In [83]:
# Degree of every node, in node iteration order.
degrees = [g.degree[node] for node in g.nodes]

print("Average node degree:", round(sum(degrees) / len(degrees), 2))

Average node degree: 296.94


In [84]:
# Louvain community detection is randomized; pin the seed so reruns give the
# same partition. `community_louvain` is imported in the notebook's top import cell.
partitions = community_louvain.best_partition(g, random_state=42)

In [85]:
# Community id of every node, one entry per key in `partitions`.
values = list(partitions.values())

In [86]:
# Count the distinct community ids.
print('Number of communities:', len(np.unique(values)))

Number of communities: 5
