In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import textblob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Location of the cleaned IMDB dataset, relative to this notebook.
url = '../data/imdb_clean.csv'

# Read the CSV into the working frame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head()

In [None]:
# Build one text field per movie by joining title, director and genre with
# single spaces. A missing value in any of the three yields a missing result,
# exactly as with `+` concatenation.
df['data'] = df['title'].str.cat([df['director'], df['genre']], sep=' ')
df['data']

In [None]:
# Normalise the combined text column:
#   1. strip punctuation - every character that is neither a word character
#      nor whitespace;
#   2. lower-case the result.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string avoids invalid-escape warnings for `\w` and `\s`.
df['data'] = (
    df['data']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

In [None]:
# Load the English stop-word list once. Calling `stopwords.words('english')`
# inside the filter would rebuild the list for every single word, and a set
# gives constant-time membership tests instead of a linear scan of a list.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    Parameters
    ----------
    text : str
        Whitespace-separated text. The comparison against the stop-word list
        is case-sensitive, so callers should lower-case the text first.

    Returns
    -------
    str
        The remaining words, in their original order, joined by single spaces.
    """
    return " ".join(word for word in text.split() if word not in ENGLISH_STOPWORDS)


df['data'] = df['data'].apply(remove_stopwords)
df['data']

In [None]:
# vectorize the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['data']).toarray()
X.shape

In [None]:
# Pairwise cosine similarity between every pair of rows of X; with no second
# argument the rows of X are compared against themselves.
similarity = cosine_similarity(X)
similarity

In [None]:
def get_index_from_title(title):
    """Return the index label of the first row of ``df`` whose title is ``title``.

    Parameters
    ----------
    title : str
        Title to look up; compared for exact equality with ``df['title']``.

    Returns
    -------
    The index label of the first matching row, or ``None`` when no row matches.

    Notes
    -----
    Only the "no match" case yields ``None``. Genuine problems (for example a
    missing ``title`` column) are raised rather than silently swallowed.
    """
    matches = df.index[df['title'] == title]
    if len(matches) == 0:
        return None
    return matches[0]

In [None]:
def recommend_movie(title, limit = 10):
    """Recommend the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title of the movie to base the recommendations on.
    limit : int, default 10
        Maximum number of recommendations to return. Values <= 0 give ``[]``.

    Returns
    -------
    list of (title, score) tuples
        Sorted by descending similarity score; ties keep their original row
        order. Returns an empty list when ``title`` is not found.

    Notes
    -----
    The value from ``get_index_from_title`` is used as a row position into
    ``similarity`` - this assumes ``df`` has the default 0..n-1 index
    (TODO confirm if the frame is ever re-indexed or filtered).
    """
    index = get_index_from_title(title)
    if index is None:
        return []

    scores = similarity[index]
    # Exclude the queried movie by position rather than dropping whatever
    # happens to sort first: another movie with an equal top score could
    # otherwise push the query itself into the results.
    movie_scores = [
        (df['title'][i], scores[i])
        for i in range(similarity.shape[0])
        if i != index
    ]
    movie_scores.sort(key = lambda pair: pair[1], reverse = True)
    return movie_scores[:max(limit, 0)]

In [None]:
# Example: the five closest matches for one title.
recommend_movie('Harry Potter and the Goblet of Fire', limit=5)