In [18]:
from neo4j import GraphDatabase
from tqdm import tqdm
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow_text as text
import tensorflow_hub as hub
bert_preprocess=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

# init

In [3]:

class Neo4j:
    def __init__(self,uri,user,password):
        self.driver = GraphDatabase.driver(uri,auth=(user,password))

    def query(self,query):
        result=None
        with self.driver.session() as session:
            result = list(session.run(query))
            session.close()
        return result

    def get_movies_based_on_genre(self,genre_name):
        result=None
        with self.driver.session() as session:
            query = f"""
                    match (g:genre) where g.name=$genre_name with g
                    match (g)-[r:`genre-type`]-(m)
                    return m.id as id,m.Movie as movie,m.genre as genre,m.rating as rating,m.votings as votings
                    """
            result=list(session.run(query,genre_name=genre_name))
            session.close()
        return result

    def close(self):
        self.driver.close()

neo = Neo4j("bolt://localhost:7687","neo4j","1234")
def get_sentence_embeding(sentences):
    preprocessed_text=bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

# input movie id

In [74]:
input_movie_id="tt0044954"

# Collecting input movie genres

In [75]:

# input_movie_genres=pd.DataFrame([dict(_) for _ in neo.get_genre(input_movie)])
input_movie_genres=[ _["genre"] for _ in neo.query(f''' 
                    match (m:movie) where m.id="{input_movie_id}" with m
                    match (m)-[r:`genre-type`]-(g)
                    with m,g
                    return g.name as genre ''')]
input_movie_genres

['Thriller', 'Crime', 'Drama', 'Film-Noir']

# directors of the input movie

In [76]:
input_movie_directors= [ _["id"] for _ in neo.query(f''' 
            match(m: movie) 
            where m.id="{input_movie_id}"
            with m
            match (m)-[r:directed]-(d)
            with m,d
            return d.id as id
''')]
input_movie_directors

[281507, 580017]

# writers of input movie

In [77]:
input_movie_writers= [ _["id"] for _ in neo.query(f''' 
            match(m: movie) 
            where m.id="{input_movie_id}"
            with m
            match (m)-[r:written]-(w)
            with m,w
            return w.id as id
''')]
input_movie_writers

[326107, 271641, 502649]

# same personality directors

In [78]:
# including this directors
same_persona_directors=dict()
for each_director in tqdm(input_movie_directors):
      for g in input_movie_genres:
            if(g=="Film-Noir"):
                g="FilmNoir"
            elif(g=="Sci-Fi"):
                g="SciFi"
            elif(g=="Reality-TV"):
                g="RealityTV"
            g=g+"_trait"
            value=[_[g] for _ in neo.query(f'''
                            match (d:director) 
                            where d.id={each_director}
                            with d 
                            return d.{g} as {g}
                            ''')]
            # print(value)
            query_result=neo.query(f''' 
                        match (d:director)
                        where d.{g}= {value[0]}
                        with d
                        return d.id as id,d.{g} as {g}
            ''')
            # print(query_result)
            for _ in query_result:
                same_persona_directors[_["id"]]=_[g]

print("Length: "+str(len(same_persona_directors)))

100%|██████████| 2/2 [00:00<00:00,  5.44it/s]

Length: 2540





In [79]:
# sorting based on values
for i in same_persona_directors:
    print(i,same_persona_directors[i],sep=" ")
    break
same_persona_directors = sorted(same_persona_directors.items(), key=lambda x: x[1], reverse=True)
for i in same_persona_directors:
    print(i[0],i[1])
    break
print("Length: "+str(len(same_persona_directors)))

1027519 8
682309 20
Length: 2540


# same personality writers

In [80]:
# including this writers
same_persona_writers=dict()
for each_writer in tqdm(input_movie_writers):
      for g in input_movie_genres:
            if(g=="Film-Noir"):
                g="FilmNoir"
            elif(g=="Sci-Fi"):
                g="SciFi"
            elif(g=="Reality-TV"):
                g="RealityTV"
            g=g+"_trait"
            value=[_[g] for _ in neo.query(f'''
                            match (w:writer) 
                            where w.id={each_writer}
                            with w 
                            return w.{g} as {g}
                            ''')]
            # print(value)
            query_result=neo.query(f''' 
                        match (w:writer) 
                        where w.{g}={value[0]}
                        with w
                        return w.id as id,w.{g} as {g}
            ''')
            # print(query_result)
            for _ in query_result:
                same_persona_writers[_["id"]]=_[g]

print("Length: "+str(len(same_persona_writers)))

100%|██████████| 3/3 [00:01<00:00,  2.77it/s]

Length: 5104





In [81]:
# sorting based on values
for i in same_persona_writers:
    print(i,same_persona_writers[i],sep=" ")
    break
same_persona_writers = sorted(same_persona_writers.items(), key=lambda x: x[1], reverse=True)
for i in same_persona_writers:
    print(i[0],i[1])
    break
print("Length: "+str(len(same_persona_writers)))

104335 13
1847868 20
Length: 5104


# input movie rating

In [82]:
input_movie_rating=[_["rating"] for _ in neo.query(
f'''
    match(m:movie)  
    where m.id ="{input_movie_id}"
    with m
    return m.rating as rating
''')]

input_movie_rating[0]

7

# movies from these content creators along with rating comparison

In [83]:
final_movies_list=[]
# test=[91515]
# for each_creator in test:
directors_len=len(same_persona_directors)
if(directors_len>500):
    directors_len=500
for each_creator in tqdm(same_persona_directors[:directors_len]):
    query_result=neo.query(f'''
        match (d:director)
        where d.id ={each_creator[0]}
        with d
        match (d)-[r:directed]-(m:movie)
        with d,m
        where m.rating>{input_movie_rating[0]}
        and any(x in split(m.genre,"|") where x in {input_movie_genres})
        return m.id as id
    ''')
    # print(query_result)
    for each_movie in query_result:
        final_movies_list.append(each_movie["id"])
print("Length: "+str(len(final_movies_list)))

100%|██████████| 500/500 [00:07<00:00, 66.85it/s]

Length: 32





In [84]:
writers_len=len(same_persona_writers)
if(writers_len>500):
    writers_len=500
for each_creator in tqdm(same_persona_writers[:writers_len]):
    query_result=neo.query(f'''
        match (w:writer)
        where w.id ={each_creator[0]}
        with w
        match (w)-[r:written]-(m:movie)
        with w,m
        where m.rating>{input_movie_rating[0]}
        and any(x in split(m.genre,"|") where x in {input_movie_genres})
        return m.id as id
    ''')
    # print(query_result)
    for each_movie in query_result:
        final_movies_list.append(each_movie["id"])
print("Length: "+str(len(final_movies_list)))

100%|██████████| 500/500 [00:06<00:00, 71.90it/s]

Length: 65





In [85]:
l=len(final_movies_list)
if(l>1600):
    final_movies_list=final_movies_list[:1600]
l=len(final_movies_list)
l

65

# input movie plot

In [86]:
input_movie_plot=[ _["plot"] for _ in neo.query(f''' 
            match (m:movie) where m.id="{input_movie_id}"
            with m
            return m.plot as plot
''')]
input_movie_plot[0]

'A woman planning to testify against the mob must be protected against their assassins on the train trip from Chicago to Los Angeles.'

# context embedings 

In [87]:
# input movies vector
input_movie_vector=get_sentence_embeding([input_movie_plot[0]])

# get final movie's plots with their embeddings

In [88]:
final_movies=dict()
if(len(final_movies_list)>150):
        final_movies_list=final_movies_list[:150]
for each_final_movie_id in tqdm(final_movies_list):
    query_result=[ _["plot"] for _ in neo.query(f''' 
            match(m:movie)
            where m.id="{each_final_movie_id}"
            with m
            return m.plot as plot 
    ''')]
    final_movies[each_final_movie_id]=get_sentence_embeding([query_result[0]])
print("Length: "+str(len(final_movies_list)))

100%|██████████| 65/65 [00:11<00:00,  5.54it/s]

Length: 65





# cosine sim of movies with their id

In [89]:
for each_final_movie_id in tqdm(final_movies):
    result=cosine_similarity([input_movie_vector[0]],[final_movies[each_final_movie_id][0]])
    final_movies[each_final_movie_id]=result[0][0]
print("Length: "+str(len(final_movies)))

100%|██████████| 57/57 [00:00<00:00, 735.37it/s]

Length: 57





# sorting based on sim

In [90]:
for i in final_movies:
    print(final_movies[i])
    break
final_movies = sorted(final_movies.items(), key=lambda x: x[1], reverse=True)

0.9807118


In [91]:
for i in final_movies:
    print(i[0],i[1])
    break

tt0044837 0.98946136
