In [1]:
import os
import time
import gc
import argparse

# data science imports
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# utils import
from fuzzywuzzy import fuzz




In [2]:

from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the movie table from data.world (requires network access).
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

# Title/rating pairs, ordered from the highest to the lowest IMDb rating.
t = df[['Title', 'imdbRating']].sort_values(by="imdbRating", ascending=False)

# Keep only the columns used to build the text features below.
df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [3]:

# Show the first five rows of t, which was sorted by imdbRating in descending order.
print(t.head())

                      Title  imdbRating
0  The Shawshank Redemption         9.3
1             The Godfather         9.2
2    The Godfather: Part II         9.0
3           The Dark Knight         9.0
4              12 Angry Men         8.9


In [4]:
# Keep only the first three listed actors and merge each full name into a
# single lowercase token (e.g. "Tim Robbins" -> "timrobbins"), so it is treated
# as one word and people sharing a first name are not mixed up.
# NOTE: this is done column-wise with .map instead of assigning to `row[...]`
# inside df.iterrows(): iterrows may yield copies, in which case such writes
# never reach the DataFrame.
df['Actors'] = df['Actors'].map(
    lambda x: [name.lower().replace(' ', '') for name in x.split(',')[:3]]
)

# putting the genres in a list of lowercase words (whitespace around each
# comma-separated genre is stripped)
df['Genre'] = df['Genre'].map(
    lambda x: [genre.strip() for genre in x.lower().split(',')]
)

# merging the director's names into one lowercase token as well
df['Director'] = df['Director'].map(lambda x: x.replace(' ', '').lower())

In [5]:
def _plot_key_words(plot):
    """Return the list of key words RAKE extracts from one plot summary."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # get_word_degrees() maps each key word to its score; only the words are kept
    return list(r.get_word_degrees().keys())


# assigning the key words to a new column for the corresponding movie.
# This uses .map rather than writing to `row[...]` inside df.iterrows():
# iterrows may yield copies, so such writes can be silently discarded.
df['Key_words'] = df['Plot'].map(_plot_key_words)

# dropping the Plot column
df.drop(columns = ['Plot'], inplace = True)


In [6]:
# Inspect the first rows (up to 20) after the Genre/Director/Actors/Key_words processing.
df.head(20)

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[years, finding, solace, number, two, imprison..."
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[clandestine, empire, organized, crime, dynast..."
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[family, crime, syndicate, early, life, 1920s,..."
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[wreaks, havoc, dark, knight, must, accept, on..."
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[miscarriage, forcing, justice, colleagues, re..."
5,Schindler's List,"[biography, drama, history]",stevenspielberg,"[liamneeson, benkingsley, ralphfiennes]","[nazi, germans, german, world, war, ii, occupi..."
6,The Lord of the Rings: The Return of the King,"[adventure, drama, fantasy]",peterjackson,"[noelappleby, aliastin, seanastin]","[gaze, aragorn, lead, men, sam, gandalf, army,..."
7,Pulp Fiction,"[crime, drama]",quentintarantino,"[timroth, amandaplummer, lauralovelace]","[diner, bandits, intertwine, boxer, wife, gang..."
8,Fight Club,[drama],davidfincher,"[edwardnorton, bradpitt, meatloaf]","[evolves, change, way, something, much, care, ..."
9,The Lord of the Rings: The Fellowship of the Ring,"[adventure, drama, fantasy]",peterjackson,"[alanhoward, noelappleby, seanastin]","[eight, companions, set, destroy, shire, journ..."


In [7]:
# Use the movie title as the index; df is modified in place, so 'Title' is no
# longer a regular column after this cell runs.
df.set_index('Title', inplace = True)

In [8]:
# Feature columns, captured BEFORE the output column is added so that
# 'bag_of_words' itself is never folded into the text.
columns = df.columns


def _row_to_bag_of_words(row):
    """Concatenate one movie's feature columns into a space-separated string.

    'Director' already holds a single string; every other column holds a list
    of tokens that is joined with spaces first.
    """
    words = ''
    for col in columns:
        if col != 'Director':
            words = words + ' '.join(row[col]) + ' '
        else:
            words = words + row[col] + ' '
    return words


# Built with apply(axis=1) instead of assigning to `row[...]` inside
# df.iterrows(): iterrows may yield copies, so such writes can be lost.
df['bag_of_words'] = df.apply(_row_to_bag_of_words, axis=1)

# keep only the combined text column
df.drop(columns = [col for col in df.columns if col != 'bag_of_words'], inplace = True)

In [9]:
# Preview the frame; the preceding cell dropped every column except bag_of_words.
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfr...
The Godfather,crime drama francisfordcoppola marlonbrando a...
The Godfather: Part II,crime drama francisfordcoppola alpacino rober...
The Dark Knight,action crime drama christophernolan christia...
12 Angry Men,crime drama sidneylumet martinbalsam johnfied...


In [10]:
# Turn each movie's bag of words into a row of token counts.
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# Series of movie titles keyed by an ordered numerical index; used later to
# translate between a title and its row number.
indices = pd.Series(df.index)
indices.head()

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object

In [11]:
# generating the cosine similarity matrix from the count matrix built in the
# previous cell (re-fitting an identical CountVectorizer here was redundant)
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [12]:
# Print the sparse count matrix: each entry is "(row, column)  count", where
# the row is a movie and the column a vocabulary token.
print(count_matrix)

  (0, 59)	1
  (0, 306)	1
  (0, 311)	1
  (0, 519)	1
  (0, 584)	1
  (0, 655)	1
  (0, 768)	1
  (0, 888)	1
  (0, 969)	1
  (0, 1011)	1
  (0, 1269)	1
  (0, 1733)	1
  (0, 1810)	1
  (0, 1899)	1
  (0, 2174)	1
  (0, 2481)	1
  (0, 2678)	1
  (0, 2765)	1
  (0, 2950)	1
  (1, 82)	1
  (1, 118)	1
  (1, 475)	1
  (1, 553)	1
  (1, 584)	2
  (1, 768)	1
  :	:
  (248, 1645)	1
  (248, 1648)	1
  (248, 1735)	1
  (248, 1823)	1
  (248, 1920)	1
  (248, 2188)	1
  (248, 2459)	1
  (248, 2721)	1
  (249, 50)	1
  (249, 146)	1
  (249, 443)	1
  (249, 621)	1
  (249, 705)	1
  (249, 768)	1
  (249, 1278)	1
  (249, 1615)	1
  (249, 1779)	1
  (249, 1828)	1
  (249, 2176)	1
  (249, 2340)	1
  (249, 2472)	1
  (249, 2626)	1
  (249, 2801)	1
  (249, 2825)	1
  (249, 2869)	1


In [13]:
# Pairwise similarity matrix: entry [i, j] is the cosine similarity between the
# count vectors of movies i and j.
cosine_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

In [14]:
def recommendations(title, cosine_sim = cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in ``indices``.
    cosine_sim : array-like, optional
        Similarity matrix in which ``cosine_sim[i]`` holds the scores of movie
        ``i`` against every movie. Defaults to the matrix that was bound to
        ``cosine_sim`` when this function was defined.

    Returns
    -------
    list
        Up to 10 titles, ordered from most to least similar; the queried movie
        itself is never included.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``indices``.
    """
    # getting the index of the movie that matches the title
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError(f"movie title not found: {title!r}")
    idx = matches[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # Remove the queried movie by label instead of assuming it is sorted first:
    # another movie tied at the top score could otherwise take its place.
    top_10_indexes = score_series.drop(idx).iloc[:10].index

    # translating row numbers back to titles
    return [indices[i] for i in top_10_indexes]

In [15]:
# Example query: titles most similar to 'Forrest Gump'.
recommendations('Forrest Gump')

['The Apartment',
 'City Lights',
 'Roman Holiday',
 'Annie Hall',
 'His Girl Friday',
 'Gone with the Wind',
 'Groundhog Day',
 'Brief Encounter',
 'La La Land',
 'The Graduate']

<p style="font-family: Arial; font-size:2.75em;color:purple; font-weight:bold"><br>

<br> Updated to support multiple searches: recommendations are refined after each of the user's searches.

<br><br></p>

In [16]:
# Similarity of every movie to the movie in row 0 only (a single column).
# NOTE: this rebinds the global `cosine_sim`; recommendations() is unaffected
# because its default argument was bound when the function was defined.
cosine_sim = cosine_similarity(count_matrix, count_matrix[0])

In [17]:
# Display the single-column similarity array computed in the previous cell.
cosine_sim

array([[1.        ],
       [0.15789474],
       [0.13764944],
       [0.08377078],
       [0.11128298],
       [0.04682929],
       [0.09567297],
       [0.24455799],
       [0.04499213],
       [0.04682929],
       [0.05129892],
       [0.        ],
       [0.        ],
       [0.04588315],
       [0.05263158],
       [0.17040573],
       [0.        ],
       [0.03993615],
       [0.21052632],
       [0.05129892],
       [0.0978232 ],
       [0.09176629],
       [0.08998425],
       [0.05006262],
       [0.04682929],
       [0.05564149],
       [0.10814761],
       [0.05263158],
       [0.05407381],
       [0.14350946],
       [0.        ],
       [0.        ],
       [0.05407381],
       [0.        ],
       [0.10814761],
       [0.04588315],
       [0.        ],
       [0.04055536],
       [0.05129892],
       [0.04682929],
       [0.10526316],
       [0.04682929],
       [0.        ],
       [0.05006262],
       [0.05006262],
       [0.        ],
       [0.        ],
       [0.054

In [18]:
# (number of movies, number of vocabulary tokens)
count_matrix.shape

(250, 2961)

In [19]:
# Adding two rows of the count matrix sums their token counts element-wise,
# giving one combined row for both movies.
temp=count_matrix[0]+count_matrix[1]
temp.shape

(1, 2961)

In [20]:
# Print the combined counts of rows 0 and 1; tokens present in both rows show summed counts.
print(count_matrix[0]+count_matrix[1])

  (0, 59)	1
  (0, 82)	1
  (0, 118)	1
  (0, 306)	1
  (0, 311)	1
  (0, 475)	1
  (0, 519)	1
  (0, 553)	1
  (0, 584)	3
  (0, 655)	1
  (0, 768)	2
  (0, 793)	1
  (0, 846)	1
  (0, 888)	1
  (0, 969)	1
  (0, 1007)	1
  (0, 1011)	1
  (0, 1269)	1
  (0, 1355)	1
  (0, 1671)	1
  (0, 1733)	1
  (0, 1810)	1
  (0, 1899)	1
  (0, 1932)	1
  (0, 1972)	1
  (0, 2174)	1
  (0, 2193)	1
  (0, 2481)	1
  (0, 2492)	1
  (0, 2678)	1
  (0, 2730)	1
  (0, 2765)	1
  (0, 2950)	1


In [21]:
# Titles the user searched for.
queue = ["Goodfellas", "Psycho", "Rope", "The Terminator"]

# Row number of each of the last four queued titles (idx4 is the oldest of the
# four, idx1 the most recent).
idx4, idx3, idx2, idx1 = (
    indices[indices == searched].index[0] for searched in queue[-4:]
)

# Combined token counts of the four movies.
temp = count_matrix[idx1] + count_matrix[idx2] + count_matrix[idx3] + count_matrix[idx4]

In [22]:
# Print the combined count vector of the queued movies.
print(temp)

  (0, 0)	1
  (0, 21)	1
  (0, 24)	1
  (0, 28)	1
  (0, 56)	1
  (0, 104)	2
  (0, 127)	1
  (0, 160)	1
  (0, 169)	1
  (0, 181)	1
  (0, 196)	1
  (0, 308)	1
  (0, 424)	1
  (0, 444)	1
  (0, 477)	1
  (0, 481)	1
  (0, 559)	1
  (0, 567)	1
  (0, 574)	1
  (0, 584)	4
  (0, 605)	1
  (0, 703)	1
  (0, 717)	1
  (0, 751)	1
  (0, 768)	2
  :	:
  (0, 2188)	1
  (0, 2199)	1
  (0, 2262)	1
  (0, 2306)	1
  (0, 2357)	1
  (0, 2381)	1
  (0, 2387)	1
  (0, 2398)	1
  (0, 2483)	1
  (0, 2492)	1
  (0, 2551)	1
  (0, 2559)	1
  (0, 2608)	1
  (0, 2626)	1
  (0, 2666)	2
  (0, 2694)	1
  (0, 2765)	1
  (0, 2776)	1
  (0, 2822)	1
  (0, 2860)	1
  (0, 2870)	1
  (0, 2894)	1
  (0, 2896)	1
  (0, 2950)	1
  (0, 2954)	2


In [23]:
# Similarity of every movie to the combined vector `temp`: one row per movie,
# one column. This rebinds the global `cosine_sim`.
cosine_sim = cosine_similarity(count_matrix, temp)
cosine_sim.shape

(250, 1)

In [24]:
# Flatten the single-column similarity array into a plain list; reshape(-1)
# infers the length instead of hard-coding the number of movies (250).
print(list(cosine_sim.reshape(-1)))

[0.18770378225773235, 0.22941573387056177, 0.2545454545454546, 0.11618357280412613, 0.1322921591107271, 0.055670221426890425, 0.07582342329348174, 0.19381883305055495, 0.0534862186740502, 0.03711348095126028, 0.040655781409087086, 0.03319530651546461, 0.055670221426890425, 0.03636363636363637, 0.04171195161282941, 0.5739674089108874, 0.08131156281817417, 0.031650482901035976, 0.16684780645131764, 0.10163945352271772, 0.19381883305055495, 0.14545454545454548, 0.17828739558016732, 0.09918995010726928, 0.03711348095126028, 0.1102434659256059, 0.128564869306645, 0.10427987903207352, 0.08570991287109668, 0.13269099076359303, 0.49285225140763156, 0.022727272727272728, 0.0642824346533225, 0.128564869306645, 0.19284730395996752, 0.07272727272727274, 0.2085144140570748, 0.06428243465332249, 0.10163945352271772, 0.03711348095126028, 0.12513585483848824, 0.11134044285378085, 0.10713739108887085, 0.05951397006436157, 0.05951397006436157, 0.06639061303092922, 0.020327890704543543, 0.064282434653322

In [25]:
title=queue
def recommendation(title, cosine_sim = cosine_sim):
    """Recommend up to 10 movies similar to a list of searched titles.

    The count vectors of all titles in ``title`` are summed into one combined
    vector, every movie is scored against it with cosine similarity, and the
    best-scoring movies that are not themselves in ``title`` are kept.

    Parameters
    ----------
    title : list of str
        Searched movie titles, each exactly as it appears in ``indices``.
    cosine_sim : array-like, optional
        Unused; the scores are recomputed from ``title``. Retained only so
        existing calls keep working.

    Returns
    -------
    list
        Up to 10 recommended titles, best match first. The combined vector and
        the resulting list are also printed.

    Raises
    ------
    IndexError
        If ``title`` is empty or contains a title missing from ``indices``.
    """
    # Sum the count vectors of every searched title.
    temp = count_matrix[indices[indices == title[0]].index[0]]
    for searched in title[1:]:
        temp = temp + count_matrix[indices[indices == searched].index[0]]

    # One score per movie; reshape(-1) avoids hard-coding the number of movies.
    scores = cosine_similarity(temp, count_matrix).reshape(-1)
    score_series = pd.Series(scores).sort_values(ascending = False)
    print(temp)

    # The searched titles can occupy at most len(title) of the top slots, so
    # the first 10 + len(title) candidates always contain the 10 best others.
    # Slicing (rather than indexing) is safe when fewer movies exist.
    candidates = [indices[i] for i in score_series.index[:10 + len(title)]]

    # Exclude the titles passed in (not the global `queue`) and cap at 10.
    recommended_movies = [movie for movie in candidates if movie not in title][:10]
    print(list(recommended_movies),end="\n")
    return recommended_movies
# trailing semicolon: the list is already printed, so suppress the duplicate Out[] display
recommendation(queue);

  (0, 0)	1
  (0, 21)	1
  (0, 24)	1
  (0, 28)	1
  (0, 56)	1
  (0, 104)	2
  (0, 127)	1
  (0, 160)	1
  (0, 169)	1
  (0, 181)	1
  (0, 196)	1
  (0, 308)	1
  (0, 424)	1
  (0, 444)	1
  (0, 477)	1
  (0, 481)	1
  (0, 559)	1
  (0, 567)	1
  (0, 574)	1
  (0, 584)	4
  (0, 605)	1
  (0, 703)	1
  (0, 717)	1
  (0, 751)	1
  (0, 768)	2
  :	:
  (0, 2188)	1
  (0, 2199)	1
  (0, 2262)	1
  (0, 2306)	1
  (0, 2357)	1
  (0, 2381)	1
  (0, 2387)	1
  (0, 2398)	1
  (0, 2483)	1
  (0, 2492)	1
  (0, 2551)	1
  (0, 2559)	1
  (0, 2608)	1
  (0, 2626)	1
  (0, 2666)	2
  (0, 2694)	1
  (0, 2765)	1
  (0, 2776)	1
  (0, 2822)	1
  (0, 2860)	1
  (0, 2870)	1
  (0, 2894)	1
  (0, 2896)	1
  (0, 2950)	1
  (0, 2954)	2
['The Godfather: Part II', 'Casino', 'The Godfather', 'Baby Driver', 'Cool Hand Luke', 'Fargo', 'Terminator 2: Judgment Day', 'The Silence of the Lambs', 'Pulp Fiction', 'The Departed']
