In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from py2neo import Graph, Node, Relationship
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
graph = Graph()

In [3]:
query = """
            MATCH (m:Movie)-[:HAS_GENRE]->(g:Genre)
            RETURN  m.movieID as movieID, COALESCE(g.name) as genres
            ORDER BY m.movieID ASC
            """
dffilmgen = pd.DataFrame(graph.run(query).data())

In [4]:
dffilmgen.groupby('genres').count()

Unnamed: 0_level_0,movieID
genres,Unnamed: 1_level_1
Action,1828
Adventure,1263
Animation,611
Children,664
Comedy,3756
Crime,1199
Documentary,440
Drama,4361
Fantasy,779
Film-Noir,87


In [5]:
dffilmgen = pd.get_dummies(dffilmgen, columns=['genres']).groupby('movieID').sum()

In [6]:
query = """
        MATCH (m:Movie)
        RETURN m.movieID as movieID,m.avgrating as avgrating, m.year as year, m.lang as lang
            """
dffilm = pd.DataFrame(graph.run(query).data())

In [7]:
dffilm = dffilm[["movieID","avgrating"]]

In [8]:
dffilm[dffilm.columns[dffilm.isnull().any()]].isnull().sum()


avgrating    18
dtype: int64

In [9]:
dffilm["avgrating"].fillna(0, inplace=True)

In [10]:
dffilm[dffilm.columns[dffilm.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [14]:
cos_genres = cosine_similarity(dffilmgen.values)*0.5
cos_rating = cosine_similarity(dffilm.values)*0.5
cos = cos_genres+cos_rating

In [15]:
movies_sim = pd.DataFrame(cos, columns=dffilm['movieID'], index=dffilm['movieID'])

In [16]:
movies_sim

movieID,147410,147657,147662,147936,148166,148172,148238,148424,148482,148592,...,9,10,11,12,13,14,15,16,17,18
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
147410,1.000000,0.887298,0.658114,0.629099,0.723607,0.500000,0.658114,0.816228,0.500000,0.629099,...,0.695949,0.630104,0.632399,0.713734,0.486155,0.817664,0.877592,0.485593,0.646221,0.713370
147657,0.887298,1.000000,0.500000,0.500000,0.500000,0.500000,0.500000,0.908248,0.500000,0.666667,...,0.472341,0.471989,0.474284,0.490127,0.486154,0.626590,0.656960,0.485592,0.488107,0.489763
147662,0.658114,0.500000,1.000000,0.908248,0.853553,0.500000,1.000000,0.500000,0.500000,0.500000,...,0.649117,0.471988,0.724283,0.490126,0.486154,0.659029,0.694416,0.485592,0.488106,0.843316
147936,0.629099,0.500000,0.908248,1.000000,0.788675,0.500000,0.908248,0.500000,0.500000,0.500000,...,0.616679,0.676114,0.882533,0.490127,0.486155,0.626591,0.656960,0.774268,0.488107,0.778438
148166,0.723607,0.500000,0.853553,0.788675,1.000000,0.500000,0.853553,0.500000,0.500000,0.500000,...,0.722341,0.471989,0.827837,0.490127,0.486154,0.732252,0.778968,0.485592,0.488106,0.989762
148172,0.500000,0.500000,0.500000,0.500000,0.500000,1.000000,0.500000,0.500000,0.788675,0.833333,...,0.616677,0.471988,0.474283,0.490126,0.486153,0.626589,0.490292,0.485591,0.692230,0.489762
148238,0.658114,0.500000,1.000000,0.908248,0.853553,0.500000,1.000000,0.500000,0.500000,0.500000,...,0.649116,0.471988,0.724283,0.490126,0.486153,0.659028,0.694416,0.485591,0.488105,0.843315
148424,0.816228,0.908248,0.500000,0.500000,0.500000,0.500000,0.500000,1.000000,0.500000,0.704124,...,0.472341,0.471989,0.474284,0.490127,0.486154,0.482253,0.490293,0.485592,0.488106,0.489763
148482,0.500000,0.500000,0.500000,0.500000,0.500000,0.788675,0.500000,0.500000,1.000000,0.788675,...,0.722340,0.471988,0.474283,0.490126,0.486154,0.732252,0.490292,0.485592,0.841659,0.489762
148592,0.629099,0.666667,0.500000,0.500000,0.500000,0.833333,0.500000,0.704124,0.788675,1.000000,...,0.616678,0.471988,0.474283,0.490126,0.486154,0.626589,0.490292,0.485592,0.692230,0.489762


In [17]:
def get_similar(movieID):
    df = movies_sim.loc[movies_sim.index == movieID].reset_index(). \
            melt(id_vars='movieID', var_name='sim_moveId', value_name='sim_score'). \
            sort_values('sim_score', axis=0, ascending=False)[1:11]
    return df
movies_similarity = pd.DataFrame(columns=['movieID','sim_moveId','sim_score'])

In [18]:
for x in movies_sim.index.tolist():
    movies_similarity = movies_similarity.append(get_similar(x))

In [19]:
movies_similarity

Unnamed: 0,movieID,sim_moveId,sim_score
2355,147410,93510,1.000000
1706,147410,72605,1.000000
2809,147410,46965,1.000000
3568,147410,68157,1.000000
3000,147410,51925,1.000000
6194,147410,4773,1.000000
6486,147410,5292,1.000000
6948,147410,2922,1.000000
8219,147410,1944,0.999999
7760,147410,1263,0.999998


In [43]:
movies_similarity.to_csv('../Data/movie_sim.csv', index=False)