# **Nearest Neighbor item-based Collaborative Filtering**

> [**Movie Lens Small Latest Dataset - Kaggle**](https://www.kaggle.com/shubhammehta21/movie-lens-small-latest-dataset)

In [None]:
# Install the Kaggle command-line client used by the download cell below.
# --upgrade / --force-reinstall: reinstall the package even if a copy is already present.
# --no-deps: install only the kaggle package itself, leaving its dependencies untouched.
!pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# Upload kaggle.json from the local machine through Colab's file picker
# (a later cell copies it into ~/.kaggle).
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns the uploaded files' contents, and a bare call would
# echo them (here, the contents of kaggle.json) into the saved cell output.
uploaded = files.upload()

In [3]:
# Create a Kaggle Folder.
!mkdir ~/.kaggle

# Copy the kaggle.json to the folder created.
!cp kaggle.json ~/.kaggle/

# Permission for the json file to act.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the MovieLens "small latest" dataset archive with the Kaggle CLI.
# Relies on the kaggle.json placed in ~/.kaggle by the previous cell.
!kaggle datasets download -d shubhammehta21/movie-lens-small-latest-dataset

In [None]:
# Unzip Dataset.
!unzip movie-lens-small-latest-dataset.zip

In [6]:
# Import Library.
import pandas as pd
import numpy as np

# Read only the columns the recommender needs, using compact dtypes.
movie_dtypes = {"movieId": "int32", "title": "str"}
rating_dtypes = {"userId": "int32", "movieId": "int32", "rating": "float32"}

# movies: one row per movie (id + title).
movies = pd.read_csv("movies.csv", usecols=list(movie_dtypes), dtype=movie_dtypes)

# rating: one row per (user, movie) rating.
rating = pd.read_csv("ratings.csv", usecols=list(rating_dtypes), dtype=rating_dtypes)

In [7]:
# Preview the first rows of the movies table (movieId, title).
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
# Preview the first rows of the ratings table (userId, movieId, rating).
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
# Attach each movie's title to its ratings by joining on the shared movieId key
# (default inner join: only ratings whose movieId exists in `movies` are kept).
data = rating.merge(movies, on="movieId")
data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [10]:
# Discard rows that have no title, then tally how many ratings each title received.
data = data.dropna(subset=["title"])

# count() ignores missing ratings; the result is a Series indexed by title.
ratings_per_title = data.groupby("title")["rating"].count()

# Turn it into a two-column frame: title, totalRatingCount.
movie_ratingCount = ratings_per_title.rename("totalRatingCount").reset_index()

movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [11]:
# Attach the per-title rating count to every rating row.
# Any count column left over from a previous run of this cell is dropped first;
# otherwise a re-run would merge again and produce totalRatingCount_x / _y
# columns, breaking the later filter on "totalRatingCount".
# validate="many_to_one" asserts that each title appears once in movie_ratingCount,
# so the left join cannot duplicate rating rows.
data = data.drop(columns=["totalRatingCount"], errors="ignore").merge(
    movie_ratingCount, on="title", how="left", validate="many_to_one"
)
data.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [12]:
# Summary statistics of the number of ratings per title.
# Floats are rendered with three decimals from here on (global pandas display option).
pd.set_option("display.float_format", "{:.3f}".format)
rating_count_summary = movie_ratingCount["totalRatingCount"].describe()
print(rating_count_summary)

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [13]:
# Keep only the ratings of titles that reached the minimum number of ratings.
popularity_threshold = 50
is_popular = data["totalRatingCount"] >= popularity_threshold
popular_movie = data[is_popular]

In [14]:
# Build the title-by-user rating table: one row per title, one column per userId.
# pivot_table aggregates with its default (mean) when a cell has several ratings.
ratings_by_title = popular_movie.pivot_table(
    values="rating", index="title", columns="userId"
)

# Titles a user never rated become 0 so every row is a complete numeric vector.
movie_features = ratings_by_title.fillna(0)
movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [15]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the title-by-user rating table in compressed sparse row form.
movie_features_matrix = csr_matrix(movie_features.to_numpy())

# Brute-force nearest-neighbour search using cosine distance between title rows.
model_knn = NearestNeighbors(algorithm="brute", metric="cosine")
model_knn.fit(movie_features_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [16]:
# Pick a random movie (a row of the title-by-user table) to get recommendations for.
# The rows of movie_features are titles, not users, so this is a movie index.
# A fixed seed makes the pick reproducible across "Restart & Run All".
rng = np.random.default_rng(42)
query_index = int(rng.integers(movie_features.shape[0]))
print(query_index)

# Request one neighbour more than the number of recommendations wanted:
# the query movie is part of the fitted data and normally comes back as its
# own nearest neighbour, which the printing cell leaves out.
n_recommendations = 5
distances, indices = model_knn.kneighbors(
    movie_features.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=n_recommendations + 1,
)

279


In [17]:
# Print the neighbours of the query movie as a ranked recommendation list.
# kneighbors returns arrays of shape (1, n_neighbors); flatten them once up front.
neighbor_ids = indices.flatten()
neighbor_dists = distances.flatten()

print("Recommendations for {0}:\n".format(movie_features.index[query_index]))

# Leave the query movie out by its index rather than by assuming it is the
# first neighbour: when another title has an identical rating vector the two
# tie at distance 0 and the query may not be listed first.
# The slice keeps the list at n_neighbors - 1 entries even if the query is
# missing from the result.
recommendations = [
    (movie_idx, dist)
    for movie_idx, dist in zip(neighbor_ids, neighbor_dists)
    if movie_idx != query_index
][: len(neighbor_ids) - 1]

for rank, (movie_idx, dist) in enumerate(recommendations, start=1):
    print(
        "{0}: {1}, with distance of {2}:".format(
            rank, movie_features.index[movie_idx], dist
        )
    )

Recommendations for Mr. & Mrs. Smith (2005):

1: Ocean's Eleven (2001), with distance of 0.45355701446533203:
2: I, Robot (2004), with distance of 0.49264323711395264:
3: Wedding Crashers (2005), with distance of 0.49928081035614014:
4: Matrix Revolutions, The (2003), with distance of 0.5010195970535278:
5: Pirates of the Caribbean: Dead Man's Chest (2006), with distance of 0.5045480132102966:
