



# **Movie Talkie: Predicting the movie choice that the majority of friends would agree on, given n movie choices**


---




In [None]:
import sys
import csv
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Report the interpreter and TensorFlow versions this notebook was run with.
for label, value in (("System Info : ", sys.version),
                     ("Tensorflow Version : ", tf.__version__)):
  print(label, value)

System Info :  3.6.9 (default, Apr 18 2020, 01:56:04) 
[GCC 8.4.0]
Tensorflow Version :  2.2.0


In [None]:
# Load the IMDb movie table from the mounted Drive folder and report its (rows, columns) shape.
data = pd.read_csv("/content/drive/My Drive/imdb_movies.csv")
print(data.shape)
# Emit every column name followed by a comma, all on a single line.
print("".join("{},".format(col) for col in data.columns), end="")

(81273, 22)
imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics,

In [None]:
# Display the first two rows of the loaded frame.
data.head(2)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0


In [None]:
#Installing and Importing Required Libraries 
!pip install rake_nltk
from rake_nltk import Rake

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
#Using Rake to get keywords from the movie description and sklearn for measuring similarity



In [None]:
#Data Preprocessing and Cleaning
# Keep only the text columns used for similarity and drop rows with any missing value.
df = data[['title','genre','description']].dropna(axis=0)
# Work on a reproducible 20% sample of the remaining rows.
df = df.sample(frac=0.2,random_state=1)

# Lower-case every entry so that later title lookups and token matching are case-insensitive.
for col in df.columns:
  df[col] = df[col].str.lower()
df = df.set_index(['title'])                                        #Changing the index to title


#Approach : Combining the genre and description into one column and then calculating the cosine similarity
#           between the different movies.

# One extractor is enough: every extract_keywords_from_text call rebuilds its word statistics.
rake = Rake()
keyword_strings = []
for description in df['description']:
  rake.extract_keywords_from_text(description)
  # get_word_degrees() maps each keyword to its degree; only the words (the keys) are kept.
  keyword_strings.append(','.join(rake.get_word_degrees().keys()))

# Build the combined column explicitly. Assigning into the rows yielded by iterrows()
# is not guaranteed to write back to the frame, so the result must not depend on it.
genre_tokens = df['genre'].str.replace(', ', ',', regex=False) + ','
keyword_tokens = pd.Series(keyword_strings, index=df.index)
df['desc'] = (genre_tokens + keyword_tokens).str.replace(',', ' ', regex=False)

# Only the combined text is needed from here on.
df = df.drop(columns=['genre','description'])
df.head()

Unnamed: 0_level_0,desc
title,Unnamed: 1_level_1
max dugan returns,comedy drama life disrupted father child comes...
naajayaz,action drama sweetheart police inspector jay b...
passengers,drama friday night los angeles naturally get m...
samrajyam,action crime thriller story underworld fall up...
g,drama romance gatsbyesque love story set hop g...


In [None]:
# Turn each movie's combined text into token counts, then compare every movie with every other.
count = CountVectorizer()
count_matrix = count.fit_transform(df['desc'])
# With a single argument the matrix is compared against itself (pairwise over all rows).
sim_matrix = cosine_similarity(count_matrix)
print(sim_matrix.shape)

(15769, 15769)


In [None]:
# Positional lookup tables: key_list[i] is the row number and val_list[i] the title at that row.
indices = pd.Series(df.index)
key_list = indices.index.tolist()
val_list = indices.tolist()
indices

0                       max dugan returns
1                                naajayaz
2                              passengers
3                               samrajyam
4                                       g
                       ...               
15764                     tenemos 18 años
15765            andy hardy's double life
15766                         liar's edge
15767                island of doomed men
15768    little rascals: best of our gang
Name: title, Length: 15769, dtype: object

In [None]:
# Write the current contents of df to test.csv (relative to the working directory).
df.to_csv('test.csv')                                               #Save as CSV file for debugging

In [None]:
#Approach for Subtask 1:

#For every input movie we take its cosine similarity with each of the other input movies
#and average those values to get a single score for that movie.

#The movie with the highest mean score, i.e. the one most similar to all the other
#input movies, is the recommended choice.

def recommender(n,movies):
  """Pick the movie that is, on average, most similar to the other choices.

  Relies on the notebook-level `key_list`, `val_list` and `sim_matrix`.

  Args:
    n: Number of movie choices to compare.
    movies: Titles of the choices, spelled as they appear in `val_list`.

  Returns:
    The title from `movies` with the highest mean similarity to the other
    choices. With a single choice that choice is returned as is.

  Raises:
    ValueError: If there are no choices or a title is not in `val_list`.
  """
  if n < 1 or not movies:
    raise ValueError("At least one movie choice is required.")

  # Translate titles to row numbers of sim_matrix; the first matching title wins.
  imovies = []
  for item in movies:
    try:
      imovies.append(key_list[val_list.index(item)])
    except ValueError:
      raise ValueError("Movie {!r} was not found in the dataset.".format(item)) from None

  # A lone choice has nothing to be compared with (and n-1 would be zero below).
  if n == 1:
    return movies[0]

  scores = []
  for i1 in range(n):
    # Choices that resolve to the same row are skipped; the divisor stays n-1.
    simscore = sum(sim_matrix[imovies[i1]][imovies[i2]]
                   for i2 in range(n) if imovies[i1] != imovies[i2])
    scores.append(simscore / (n - 1))
  return movies[int(np.argmax(scores))]

def sort_tuplelist(tup_list):
  """Sort a list of tuples in place by their second element, largest first.

  The same list object is returned for convenience.
  """
  def second_item(pair):
    return pair[1]

  tup_list.sort(reverse=True, key=second_item)
  return tup_list

def recommend():
  """Interactively recommend one movie for a group, plus its five closest matches.

  Reads the number of choices and then one title per line from standard input,
  prints the title chosen by `recommender`, and finally prints up to five other
  movies ranked by similarity to that title. Relies on the notebook-level
  `key_list`, `val_list`, `sim_matrix` and `recommender`.

  Invalid input (a non-integer count, a count below one, or a title that is
  not in `val_list`) is reported with a message instead of a traceback.
  """
  print("Hello Welcome to the Movie Recommender V1.0 :) ")
  try:
    n = int(input('Please Enter the number of movie choices : \n'))
  except ValueError:
    print("The number of movie choices must be a whole number.")
    return
  if n < 1:
    print("Please enter at least one movie choice.")
    return

  print("Please Enter the movie choices : \n")
  # Titles are compared in lower case; surrounding whitespace is typing noise.
  movies = [str(input()).strip().lower() for _ in range(n)]

  unknown = [movie for movie in movies if movie not in val_list]
  if unknown:
    print("Sorry, these movies are not in the dataset : {}".format(", ".join(unknown)))
    return

  recommended = recommender(n,movies)
  print('The movie recommended for you guys is {}!'.format(recommended))

  #Further predicting 5 more similar movies based on the recommended one and rank them on the basis of relevance

  movie_id = key_list[val_list.index(recommended)]
  row = np.asarray(sim_matrix[movie_id])
  # Stable descending order: equally similar movies keep their original relative order.
  ranked = np.argsort(-row, kind="stable")
  # Drop the recommended movie by its row number rather than assuming it sorts first;
  # another movie with the same similarity could otherwise push it into its own list.
  matches = ranked[ranked != movie_id][:5]
  print("The 5 best similar matches ranked on basis of relevance are  : \n")
  for rank, match in enumerate(matches, start=1):
    print(rank,' : ', val_list[int(match)],end="\n")


In [None]:
recommend()

Hello Welcome to the Movie Recommender V1.0 :) 
Please Enter the number of movie choices : 
4
Please Enter the movie choices : 

La La Land
Avengers: Age of Ultron
Captain America: The First Avenger
Hulk
The movie recommended for you guys is avengers: age of ultron!
The 5 best similar matches ranked on basis of relevance are  : 

1  :  spider-man: far from home
2  :  toraberâzu: jigen keisatsu
3  :  plan 9 from outer space
4  :  captain marvel
5  :  iron man
