# Importing libraries and collecting data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Read imdb_1000.csv (path is relative to the notebook's working directory)
# and preview the first rows.
data = pd.read_csv('imdb_1000.csv')
data.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 6 columns):
star_rating       979 non-null float64
title             979 non-null object
content_rating    976 non-null object
genre             979 non-null object
duration          979 non-null int64
actors_list       979 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 46.0+ KB


In [4]:
data.shape

(979, 6)

In [5]:
data.isnull().sum()

star_rating       0
title             0
content_rating    3
genre             0
duration          0
actors_list       0
dtype: int64

In [6]:
# Drop rows with missing values (3 rows lack content_rating) and renumber the
# index from 0. Without the reset, index labels keep gaps and no longer equal
# row positions, while the KNN `indices` computed later are row positions.
data = data.dropna().reset_index(drop=True)

In [7]:
data.isnull().sum()

star_rating       0
title             0
content_rating    0
genre             0
duration          0
actors_list       0
dtype: int64

In [8]:
data.columns

Index(['star_rating', 'title', 'content_rating', 'genre', 'duration',
       'actors_list'],
      dtype='object')

# Creating dummy variables for genre and content_rating

In [9]:
pd.get_dummies(data[["genre"]]).head()

Unnamed: 0,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,genre_Family,genre_Fantasy,genre_Film-Noir,genre_History,genre_Horror,genre_Mystery,genre_Sci-Fi,genre_Thriller,genre_Western
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [10]:
pd.get_dummies(data[["content_rating"]]).head()

Unnamed: 0,content_rating_APPROVED,content_rating_G,content_rating_GP,content_rating_NC-17,content_rating_NOT RATED,content_rating_PASSED,content_rating_PG,content_rating_PG-13,content_rating_R,content_rating_TV-MA,content_rating_UNRATED,content_rating_X
0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0


# Concatenating into a new dataframe named 'data_feature'

In [11]:
# Feature table: one-hot columns for genre and content_rating, followed by the
# two numeric columns, aligned column-wise on the row index.
data_feature = pd.concat(
    [
        pd.get_dummies(data[["genre"]]),
        pd.get_dummies(data[["content_rating"]]),
        data[["star_rating", "duration"]],
    ],
    axis=1,
)
data_feature.head()

Unnamed: 0,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Drama,genre_Family,genre_Fantasy,genre_Film-Noir,...,content_rating_NOT RATED,content_rating_PASSED,content_rating_PG,content_rating_PG-13,content_rating_R,content_rating_TV-MA,content_rating_UNRATED,content_rating_X,star_rating,duration
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,9.3,142
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,9.2,175
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,9.1,200
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,9.0,152
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,8.9,154


Ratings in the dataset lie on a 0–10 scale, while durations are often more than 100 minutes. This can bias the distance metric in KNN, because features with larger numeric ranges dominate the distance while the other features are effectively discounted. I therefore used MinMaxScaler from scikit-learn, which rescales each feature to the range 0–1.

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Rescale every feature column to the 0-1 range so that duration (100+ minutes)
# does not dominate star_rating and the 0/1 dummy columns in the distances.
min_max_scaler = MinMaxScaler()
# Note: data_feature is rebound here from the DataFrame built above to the
# NumPy array returned by the scaler; the cells below use the array.
data_feature = min_max_scaler.fit_transform(data_feature)

In [14]:
np.round(data_feature,2)

array([[0.  , 0.  , 0.  , ..., 0.  , 1.  , 0.44],
       [0.  , 0.  , 0.  , ..., 0.  , 0.95, 0.62],
       [0.  , 0.  , 0.  , ..., 0.  , 0.89, 0.76],
       ...,
       [1.  , 0.  , 0.  , ..., 0.  , 0.  , 0.42],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.28],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.35]])

# Using KNN algorithm to find our prediction

In [15]:
from sklearn.neighbors import NearestNeighbors

In [16]:
# Ask for 6 neighbours: the model is later queried with its own training rows,
# so each row's closest match is itself (distance 0), leaving 5 other movies.
nbrs = NearestNeighbors(n_neighbors=6, algorithm='auto')
nbrs = nbrs.fit(data_feature)

In [17]:
# Query the fitted model with the same rows it was fitted on: row i of
# `distances` / `indices` describes the neighbours of row i of data_feature.
# The values in `indices` are row positions in data_feature, not DataFrame
# index labels.
distances, indices = nbrs.kneighbors(data_feature)

In [18]:
distances

array([[0.        , 0.19271934, 0.22105703, 0.32290536, 0.34242342,
        0.37500256],
       [0.        , 0.14998709, 0.19271934, 0.19710263, 0.36491706,
        0.37672292],
       [0.        , 0.14998709, 0.2790427 , 0.32177936, 0.34242342,
        0.40283674],
       ...,
       [0.        , 0.05722823, 0.07298341, 0.1193168 , 0.12500085,
        0.12500085],
       [0.        , 1.4142582 , 1.41439209, 1.4151926 , 1.4151926 ,
        1.4152372 ],
       [0.        , 0.05293057, 0.05722823, 0.05722823, 0.0596584 ,
        0.07865169]])

In [19]:
indices

array([[  0,   1,   4,  21,   2,  23],
       [  1,   2,   0,   4,  21,  40],
       [  2,   1,   4,  40,   0,  78],
       ...,
       [973, 869, 856, 724, 799, 697],
       [974, 971, 972, 823, 906, 845],
       [975, 841, 831, 891, 844, 929]], dtype=int64)

Creating a function which returns the index position of the entered title

In [20]:
def get_index_from_name(name, frame=None):
    """Return the row position of the first movie whose title equals ``name``.

    The result is a 0-based *position* (suitable for ``.iloc`` and for
    selecting a row of the ``indices`` array returned by ``kneighbors``),
    not an index label. The two differ whenever the frame's index has gaps,
    e.g. after ``dropna()`` without ``reset_index()``.

    Parameters
    ----------
    name : str
        Exact title to look up (the comparison is case-sensitive).
    frame : pandas.DataFrame, optional
        Frame with a ``"title"`` column to search. Defaults to the
        notebook's global ``data``.

    Returns
    -------
    int
        Position of the first matching row.

    Raises
    ------
    IndexError
        If no row has that title.
    """
    if frame is None:
        frame = data
    # Positions (not labels) of every row whose title matches exactly.
    positions = np.flatnonzero((frame["title"] == name).values)
    if positions.size == 0:
        raise IndexError("no movie titled {!r} in the dataset".format(name))
    return int(positions[0])

In [21]:
get_index_from_name('Pulp Fiction')

4

In [45]:
indices[4][1:]

array([21,  1, 49,  0, 34], dtype=int64)

In [46]:
# Titles of the neighbours of row 4 ('Pulp Fiction'), skipping the first entry
# of the neighbour list; selection is by row position.
data['title'].iloc[indices[4][1:]]

21                 City of God
1                The Godfather
49                The Departed
0     The Shawshank Redemption
34          American History X
Name: title, dtype: object

The function below takes a film title, looks up the positional indices of its 5 nearest neighbours found by the KNN algorithm, and prints their titles, which are our recommendations.

In [22]:
def print_similar_animes(query=None):
    """Print the titles of the nearest neighbours of the movie titled ``query``.

    Despite the function's name, the rows in ``data`` are movies.

    Parameters
    ----------
    query : str, optional
        Exact title of the movie to find neighbours for. If it is ``None``
        or empty, nothing is printed.

    Notes
    -----
    Reads the notebook globals ``data`` and ``indices`` and calls
    ``get_index_from_name``. ``found_id`` is used as a row position into
    ``indices``.
    NOTE(review): this relies on ``get_index_from_name`` returning a row
    position; if ``data`` has gaps in its index, labels and positions
    differ -- confirm.
    """
    if not query:
        return
    found_id = get_index_from_name(query)
    titles = data["title"]
    # `indices` holds row positions, so neighbours are fetched with .iloc
    # (the removed .ix indexer treated integers as labels on an integer index).
    # The first entry is skipped: it is normally the query row itself (distance 0).
    for neighbour_pos in indices[found_id][1:]:
        print(titles.iloc[neighbour_pos])

In [23]:
print_similar_animes(query="Pulp Fiction")

City of God
The Godfather
The Departed
The Shawshank Redemption
American History X


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
