In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import textblob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
url = '../data/imdb_clean.csv'
df = pd.read_csv(url)
df.head()

In [None]:
df['data'] = df['title'] + ' ' + df['director'] + ' ' + df['genre']
df['data']

In [9]:
# remove punctuation - anything that is not a word or a space
df['data'] = df['data'].str.replace('[^\w\s]','') 
# lower case
df['data'] = df['data'].str.lower()

In [10]:
def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word not in stopwords.words('english')])

df['data'] = df['data'].apply(remove_stopwords)
df['data']

0       shawshank redemption frank darabont drama
1            godfather francis ford coppola crime
2            godfather francis ford coppola drama
3            dark knight christopher nolan action
4             dark knight christopher nolan crime
                          ...                    
2527             invisible man james whale horror
2528             invisible man james whale sci-fi
2529                cell 211 daniel monzón action
2530                 cell 211 daniel monzón crime
2531                 cell 211 daniel monzón drama
Name: data, Length: 2532, dtype: object

In [17]:
# vectorize the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['data']).toarray()
X.shape

(2532, 2267)

In [21]:
# similarity matrix
similarity = cosine_similarity(X,X)
similarity

array([[1. , 0. , 0.2, ..., 0. , 0. , 0.2],
       [0. , 1. , 0.8, ..., 0. , 0.2, 0. ],
       [0.2, 0.8, 1. , ..., 0. , 0. , 0.2],
       ...,
       [0. , 0. , 0. , ..., 1. , 0.8, 0.8],
       [0. , 0.2, 0. , ..., 0.8, 1. , 0.8],
       [0.2, 0. , 0.2, ..., 0.8, 0.8, 1. ]])

In [22]:
def get_index_from_title(title):
    try:
        return df[df.title == title].index[0]
    except:
        return None

In [None]:

def recommend_movie(title, limit = 10):
    index = get_index_from_title(title)
    if index is None:
        return []
    else:
        movie_scores = []
        for i in range(similarity.shape[0]):
            movie_scores.append((df['title'][i], similarity[index][i]))
        movie_scores.sort(key = lambda x: x[1], reverse = True)
        return movie_scores[1:limit+1]

In [None]:
def recommend_moviee(title, limit = 10):
    index = get_index_from_title(title)
    if index is None:
        return []
    else:
        movie_scores = []
        for i in range(similarity.shape[0]):
            movie_scores.append((df['title'][i], similarity[index][i]))
        movie_scores.sort(key = lambda x: x[1], reverse = True)
        return movie_scores[1:limit+1]

In [26]:
recommend_movie('Harry Potter and the Goblet of Fire', 5)

[('Harry Potter and the Goblet of Fire', 0.8571428571428569),
 ('Harry Potter and the Goblet of Fire', 0.8571428571428569),
 ('Harry Potter and the Prisoner of Azkaban', 0.4285714285714285),
 ("Harry Potter and the Sorcerer's Stone", 0.4285714285714285),
 ('Harry Potter and the Deathly Hallows: Part 2', 0.40089186286863654)]