## **Daniel Enciso**
## **CA06-kNN based Recommender Engine**

In [21]:
#Importing the needed packages
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors


# Data Source and Contents

In [22]:
# Load the movie dataset we will be working with from the GitHub-hosted CSV.
# Data file name: movies_recommendation_data.csv
# NOTE: use the EXACT URL below as the data file location.
DATA_URL = 'https://github.com/ArinB/CA05-kNN/raw/master/movies_recommendation_data.csv'

data = pd.read_csv(DATA_URL)


In [23]:
# Confirm the dataset dimensions; .shape is reported as (rows, columns).
data.shape

(30, 11)

In [24]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [25]:
# Inspect the dataset: summary statistics (count, mean, std, min, quartiles, max)
# for each numeric column.
data.describe()

Unnamed: 0,Movie ID,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,48.133333,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1,0.0
std,29.288969,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129,0.0
min,1.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.75,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.5,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.25,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98.0,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [26]:
# Check that the dataset is complete: list each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB


In [27]:
# Count the null values in each column (isna is the method isnull aliases).
data.isna().sum()

Movie ID       0
Movie Name     0
IMDB Rating    0
Biography      0
Drama          0
Thriller       0
Comedy         0
Crime          0
Mystery        0
History        0
Label          0
dtype: int64

# Cleaning the data

In [28]:
# Drop columns that are not required.
# The 'Label' column values are all zeroes because we aren't using this data set
# for classification or regression.
# errors='ignore' keeps this cell safe to re-run: once 'Label' has been removed,
# running the cell again is a no-op instead of raising KeyError.
data = data.drop(columns='Label', errors='ignore')

In [29]:
# Display the final set of column names that remain in the dataset.
data.columns

Index(['Movie ID', 'Movie Name', 'IMDB Rating', 'Biography', 'Drama',
       'Thriller', 'Comedy', 'Crime', 'Mystery', 'History'],
      dtype='object')

In [30]:
# Preview the first five records of the cleaned dataset.
data.head(n=5)

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0


# Building the Recommender System

In [31]:
# Input variables for fitting: the IMDB rating plus one 0/1 flag per genre.
feature_columns = [
    'IMDB Rating',
    'Biography',
    'Drama',
    'Thriller',
    'Comedy',
    'Crime',
    'Mystery',
    'History',
]
x = data[feature_columns]

# Movie titles, kept alongside the feature matrix.
y = data['Movie Name']

In [32]:
# Build the neighbour search model and fit it on the feature matrix.
# NearestNeighbors is an unsupervised learner: its fit() ignores any target
# argument, so only the features are passed.
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(x)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [33]:
# Implement the recommendation based on the movie 'The_Post'

# Following is the genre information about the movie “The Post”
# IMDB Rating = 7.2, Biography = Yes, Drama = Yes, Thriller = No, Comedy = No, Crime = No, Mystery = No, History = Yes

rec_the_post = [7.2,1,1,0,0,0,0,1]

# Wrap the query in a one-row DataFrame that carries the same column names the
# model was fitted with. This ties each value to its feature by name and avoids
# the feature-name mismatch warning newer scikit-learn versions emit when a
# model fitted on a DataFrame is queried with a bare list.
query = pd.DataFrame([rec_the_post], columns=x.columns)

# kneighbors returns a pair: the distances to the nearest neighbours and their
# row positions in the fitted data.
return_distance, rec = neigh.kneighbors(query)
print(rec)

[[28 27 29 16  2]]


# Given a movie data set, what are the 5 movies most similar to a query movie?

In [34]:
# Look up and print the titles of the recommended movies.
# rec holds one array of row positions per query, so the loop body runs once
# per query and prints the matching slice of the 'Movie Name' column.

print('Based on your viewing of "The Post" we recommend you watch: ')
movie_names = data['Movie Name']
for neighbor_rows in rec:
    print(movie_names.iloc[neighbor_rows])

Based on your viewing of "The Post" we recommend you watch: 
28    12 Years a Slave
27       Hacksaw Ridge
29      Queen of Katwe
16      The Wind Rises
2     A Beautiful Mind
Name: Movie Name, dtype: object
