# Content Based Recommendation

> [Yalim Demirkesen](github.com/demirkeseny)



In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.corpus import wordnet
from ast import literal_eval

In [2]:
links = pd.read_csv('C:/Users/demir/Desktop/movie-recommender/data/links_small.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
# checking for any missing values
links.isnull().sum()

movieId     0
imdbId      0
tmdbId     13
dtype: int64

In [5]:
movies = pd.read_csv('./data/movies.csv')

In [6]:
# only including the movies that have a link in the link.csv file
# important to have a data frame without any na's after the using inner join
submovies = movies[movies['id'].isin(links['tmdbId'])]

In [7]:
# dropping na's in the submovies
submovies.dropna(subset=['id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
submovies.shape

(9078, 15)

In [9]:
# dropping an unnecessary column
submovies.drop(["Unnamed: 0"], axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [10]:
submovies.head()

Unnamed: 0,id,original_title,popularity,title,vote_average,vote_count,0,1,2,3,4,5,6,7
0,862.0,Toy Story,21.946943,Toy Story,7.7,5415.0,Animation,Comedy,Family,,,,,
1,8844.0,Jumanji,17.015539,Jumanji,6.9,2413.0,Adventure,Fantasy,Family,,,,,
2,15602.0,Grumpier Old Men,11.7129,Grumpier Old Men,6.5,92.0,Romance,Comedy,,,,,,
3,31357.0,Waiting to Exhale,3.859495,Waiting to Exhale,6.1,34.0,Comedy,Drama,Romance,,,,,
4,11862.0,Father of the Bride Part II,8.387519,Father of the Bride Part II,5.7,173.0,Comedy,,,,,,,


In [11]:
# creating a tfidf on the titles
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(submovies['title'])

In [12]:
tfidf_matrix.shape

(9078, 14802)

In [13]:
# since recommender systems rely on the cosine similarity, I benefited from that one
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [14]:
titles= submovies['title']
indices = pd.Series(submovies.index, index = titles)

In [15]:
# created a function to create a recommendation
def recommender(title_of_movie):
    scores = sorted(list(enumerate(cosine_sim[indices[title_of_movie]])), key=lambda x:x[1], reverse=True)[1:31]
    movie_no = [cell[0]for cell in scores]
    return titles.iloc[movie_no]

In [17]:
# running the recommender engine for the movie jumanji
recommender('Toy Story')

2997                        Toy Story 2
15348                       Toy Story 3
21928              Toy Story of Terror!
4799                            The Toy
1999                         L.A. Story
2845                    The Story of Us
5846                     The Story of O
5705                       Toy Soldiers
4168                         Love Story
27833                        True Story
8871                       Police Story
2689                  A Christmas Story
1838                    West Side Story
2736                  A Soldier's Story
2016     Ever After: A Cinderella Story
7916                 A Cinderella Story
6956                        Crime Story
2052              The NeverEnding Story
2850                 The Straight Story
6494                        Tokyo Story
871              The Philadelphia Story
3281     We're Back! A Dinosaur's Story
26813             The Story on Page One
820                The Story of Xinghua
3693                 The Official Story
