1. Importing Packages and Data Quality Analysis

In [None]:
#here we are simply importing the necessary packages to run our ensemble methods

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors


In [None]:
#reading in the mrd data so that we can begin our data quality analysis
mrd_1 = pd.read_csv('https://github.com/ArinB/CA05-kNN/raw/master/movies_recommendation_data.csv')

In [None]:
#The goal os this project is to build a movie recommendation system from our 
#dataset. We will build a recommendation model and then input a movie and see 
#what movies are most like this. The purpose of this is for a website where 
#someone would search movies similar to one that they like to find other movies 
#that they also like.

#We will start with a data quality analysis

mrd_1.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [None]:
#now we are going to perform a data quality analysis on our data to make sure
#it is ready for modelling

#looking at null values
print(mrd_1.isnull().sum())

Movie ID       0
Movie Name     0
IMDB Rating    0
Biography      0
Drama          0
Thriller       0
Comedy         0
Crime          0
Mystery        0
History        0
Label          0
dtype: int64


In [None]:
#looking at data type for columns
print(mrd_1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB
None


In [None]:
mrd_1.describe()
#Only 30 rows but data looks ready

Unnamed: 0,Movie ID,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,48.133333,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1,0.0
std,29.288969,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129,0.0
min,1.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.75,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.5,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.25,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98.0,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


2. Preparing Our Dataset

In [None]:
#We do not need the "Label" column as we aren't using this dataset for 
#classification or regression

mrd = mrd_1.drop(columns='Label')
mrd.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0


3. Creating Our Model

In [None]:
#Now that our model is prepared, we can create our independent and dependent 
#columns

x = mrd[['IMDB Rating', 'Biography', 'Drama', 'Thriller', 'Comedy', 'Crime', 
         'Mystery', 'History']]
y = mrd_1['Movie Name']

In [None]:
#With our x and y datasets made, we can apply our NearestNeighbors function to 
#them.

mrd_fitted = NearestNeighbors(n_neighbors=5) 
mrd_fitted.fit(x,y)

#This is all we have to do to prepare our model so that we can test it with our
#movie "The Post." Hopefully, our test will return five movies similar to 
#"The Post" based on our KNN model

NearestNeighbors()

4. Testing Our Final Model

In [None]:
#Our first step is to actually create our post object. The information and 
#creation is done below:

# IMDB Rating = 7.2, Biography = Yes, Drama = Yes, Thriller = No, Comedy = No, 
#Crime = No, Mystery = No, History = Yes

the_post = [7.2,1,1,0,0,0,0,1]

In [None]:
#Now we will run the mrd_fitted.kneighbors function on our movie to determine the 
#distance from each film and their place in our mrd dataset

distance, movie_row = mrd_fitted.kneighbors([the_post])

  "X does not have valid feature names, but"


In [None]:
#Our distance is the distance from each film in our mrd dataset to "The Post"
#You can see it is ordered by distance already for us!

distance

array([[0.9       , 1.        , 1.0198039 , 1.16619038, 1.41421356]])

In [104]:
#Our movie_row shows us the row number of each movie in our mrd dataset. 
#I wanted to make it a list so it is easier to go through the loop and simply 
#just return the name of the movie. I'm sure there was an easier way to do this 
#but this was the first thing that came to my brain lol.

movie_row
movie_list = []
for x in (0,1,2,3,4):
  movie_list.append(movie_row[0,x])

movie_list

[28, 27, 29, 16, 2]

In [103]:
# Creating for loop to find 5 movies recommendations

print('Here are some titles similar to "The Post" that you might like: \n')
for hi in movie_list:
    print((mrd.iloc[hi]['Movie Name']))

Here are some titles similar to "The Post" that you might like: 

12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind


In [None]:
#Looks like our model worked great! WE can try this with other movies if we 
#simply input other data!