# Exercises: Collaborative Filtering

## Problems

1. Work with a new Dataset
2. Interpreting Latent Features with Cosine Distance Recommendations



In [None]:
from fastai.collab import *
from fastai.tabular.all import *

## 1. Another Dataset

Using a new dataset, fit the best recommender system you can, using the techniques from class. Here are some recommendations (haha) for datasets hosted on Kaggle. Pick one you feel you know enough about, so that you'll be able to comment when you get to Problem 2.

* [Goodreads book ratings](https://www.kaggle.com/datasets/zygmunt/goodbooks-10k)
* [Anime ratings from MyAnimeList](https://www.kaggle.com/datasets/hernan4444/anime-recommendation-database-2020)
* [Board game recommendations from BoardGameGeek](https://www.kaggle.com/datasets/nfedorov/top-2000-board-games-ratings)
* [Steam video game interaction](https://www.kaggle.com/datasets/tamber/steam-video-games)
* [Amazon Music Reviews](https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html)

Hints to get things running more smoothly:
1. [Rename your columns](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html) to be `"user"`, `"title"`, and `"rating"`. The code for Problem 2 assumes those names, so you'll have to either rename your columns here or edit the code below.
2. You may need to use a GPU! Some of these datasets are big.
3. Scale up your batch size as large as you can while still fitting into GPU RAM.
4. Scale down your dataset if necessary, using the [sample method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html).
5. Make sure your epochs run relatively quickly! We don't have all day, people.

In [None]:
%env KAGGLE_USERNAME="donnydutch"
%env KAGGLE_KEY="e000856dbf910d714ae70609d63d00aa"

!kaggle datasets download -d hernan4444/anime-recommendation-database-2020
!unzip anime-recommendation-database-2020

env: KAGGLE_USERNAME=<redacted>
env: KAGGLE_KEY=<redacted>
Dataset URL: https://www.kaggle.com/datasets/hernan4444/anime-recommendation-database-2020
License(s): CC0-1.0
Downloading anime-recommendation-database-2020.zip to /content
100% 661M/661M [00:05<00:00, 118MB/s]
100% 661M/661M [00:05<00:00, 135MB/s]
Archive:  anime-recommendation-database-2020.zip
  inflating: anime.csv               
  inflating: anime_with_synopsis.csv  
  inflating: animelist.csv           
  inflating: html folder/html/1/details.html  
  inflating: html folder/html/1/pictures.html  
  inflating: html folder/html/1/recomendations.html  
  inflating: html folder/html/1/reviews_1.html  
  inflating: html folder/html/1/reviews_10.html  
  inflating: html folder/html/1/reviews_11.html  
  inflating: html folder/html/1/reviews_12.html  
  inflating: html folder/html/1/reviews_13.html  
  inflating: html folder/html/1/reviews_14.html  
  inflating: html folder/html/1/reviews_15.html  
  i

In [None]:
import pandas as pd

# Load the user/anime ratings, keep a reproducible 100,000-row subsample so
# training stays fast, and rename the id columns to the names used further down.
rating = (
    pd.read_csv("rating_complete.csv")
    .sample(n=100000, random_state=5)
    .rename(columns={"user_id": "user", "anime_id": "ID"})
)
rating

Unnamed: 0,user,ID,rating
31734292,194955,22535,8
45352844,278258,31798,10
52399347,321373,32937,8
14468404,88677,25157,10
24348151,149766,39490,6
...,...,...,...
25125020,154789,7593,6
22753425,139714,1689,7
37916211,232846,2151,10
26764850,164705,2001,8


In [None]:
# Lookup table from MyAnimeList ID to title.
# * The path is relative (like the ratings file) so the cell does not depend on
#   the Colab-specific /content directory.
# * usecols reads only the two columns that are actually needed.
# * Columns are renamed so "ID" matches the key column of the ratings frame.
anime = (
    pd.read_csv("anime.csv", usecols=["MAL_ID", "Name"])
    .rename(columns={"MAL_ID": "ID", "Name": "title"})
)

In [None]:
# Preview the ID -> title lookup table.
anime

Unnamed: 0,ID,title
0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira
2,6,Trigun
3,7,Witch Hunter Robin
4,8,Bouken Ou Beet
...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu
17558,48483,Mieruko-chan
17559,48488,Higurashi no Naku Koro ni Sotsu
17560,48491,Yama no Susume: Next Summit


In [None]:
# Attach a human-readable title to every rating row. No key is given, so
# pandas joins on the column(s) the two frames have in common.
rating = pd.merge(rating, anime)

In [None]:
# Build the train/validation DataLoaders for collaborative filtering.
# * Column roles are named explicitly so they do not depend on column order.
# * A fixed seed makes the random validation split reproducible across runs.
# * A batch size well above the default of 64 keeps epochs short; raise it
#   further if it still fits in GPU memory.
dls = CollabDataLoaders.from_df(
    rating,
    user_name="user",
    item_name="title",
    rating_name="rating",
    seed=42,
    bs=1024,
)
dls.show_batch()

Unnamed: 0,user,title,rating
0,114154,Yowamushi Pedal: Grande Road,8
1,196357,Kill la Kill,10
2,102594,Nyan Koi!,8
3,39922,Ginyuu Mokushiroku Meine Liebe,6
4,63774,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.,8
5,244402,Ikkitousen,5
6,343465,Kono Subarashii Sekai ni Shukufuku wo!: Kono Subarashii Choker ni Shukufuku wo!,8
7,204683,Mobile Suit Gundam 00,8
8,63930,Magikano,5
9,70855,Piano no Mori (TV),5


In [None]:
# Pull a single mini-batch: x holds the model inputs, y the target ratings.
x,y = dls.one_batch()

In [None]:
# First ten rows of the batch: each x row is a (user index, title index)
# pair and y holds the matching rating.
print(x[:10])
print(y[:10])

tensor([[182339,   9896],
        [139565,    701],
        [155498,   2699],
        [150233,   2409],
        [206752,    855],
        [ 98980,   4706],
        [171403,   8474],
        [135836,  10451],
        [183381,   2273],
        [193327,   2265]])
tensor([[ 8],
        [ 8],
        [ 7],
        [ 8],
        [ 7],
        [ 7],
        [10],
        [ 7],
        [ 9],
        [10]], dtype=torch.int8)


In [None]:
# Vocabulary size of each categorical column; these are the number of rows
# an embedding table for that column needs.
for column in ("user", "title"):
    print(len(dls.classes[column]))

291492
16241


In [None]:
# Stand-alone embedding tables, used only to look at what an embedding lookup
# returns. The number of rows is taken from the DataLoaders' vocabularies
# rather than hard-coded: a table smaller than the vocabulary makes lookups of
# the higher indices fail with an out-of-range error.
user_embedding = Embedding(len(dls.classes["user"]), 5)
movie_embedding = Embedding(len(dls.classes["title"]), 75)

In [None]:
# Embedding vectors for the user index (column 0) of every row in the batch.
user_features = user_embedding(x[:,0])

In [None]:
# Embedding vectors for the title index (column 1) of every row in the batch.
movie_features = movie_embedding(x[:,1])

In [None]:
class RecommenderNN(Module):
    """Neural-network collaborative-filtering model.

    The user and item embeddings are concatenated and fed through a small
    fully connected network; the single output is squashed into the rating
    range with a scaled sigmoid.

    Args:
        user_sz: Arguments for the user ``Embedding``; index 1 is its width.
        item_sz: Arguments for the item ``Embedding``; index 1 is its width.
        range: ``[low, high]`` bounds of the predicted rating.
        n_act: Number of activations in the hidden layer.
    """

    def __init__(self, user_sz, item_sz, range=[0.5,5.5], n_act=100):
        self.user_embedding = Embedding(*user_sz)
        self.movie_embedding = Embedding(*item_sz)

        # The embeddings are concatenated instead of dot-multiplied, so they
        # act as plain inputs to the first layer and may differ in width.
        n_inputs = user_sz[1] + item_sz[1]
        hidden = nn.Linear(n_inputs, n_act)
        nonlinearity = nn.ReLU()
        head = nn.Linear(n_act, 1)
        self.layers = nn.Sequential(hidden, nonlinearity, head)

        self.min, self.max = range[0], range[1]

    def forward(self, x):
        """Predict a rating for each ``(user index, item index)`` row of ``x``."""
        user_idx, item_idx = x[:, 0], x[:, 1]
        # One input vector per row: user embedding followed by item embedding.
        features = torch.cat(
            [self.user_embedding(user_idx), self.movie_embedding(item_idx)],
            dim=1,
        )
        squashed = torch.sigmoid(self.layers(features))
        return squashed * (self.max - self.min) + self.min

In [None]:
# One embedding-size entry per categorical column (user, title), unpacked
# into the model's user_sz and item_sz arguments.
embs = get_emb_sz(dls)

# The model's output is a sigmoid scaled into `range`, so the range must cover
# every rating in the data. The sampled ratings go up to 10 (the scale appears
# to be 1-10), which a [0.5, 5.5] range could never predict.
model = RecommenderNN(*embs, range=[0.5, 10.5])
learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=rmse)

learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,_rmse,time


## 2. Cosine Similarity

We can use the embeddings for each title to determine which items are closest to each other (alternatively, which other user has the most similar taste). Imagine each embedding as a vector in space. Two items are similar to each other if the angle between their vectors, $\theta$, is small. This is usually reported as $\cos\theta$, because it's fast to calculate. But it's also useful for humans: $\cos0^\circ=1$, and $\cos90^\circ=0$, so it will always be higher for similar vectors.

Below is code [adapted from the textbook](https://github.com/fastai/fastbook/blob/master/08_collab.ipynb) which will find some similar titles based on one submitted. You almost certainly will need to modify your code based on the exact format of your dataset.

1. Get the code to work.
2. Produce some recommendations for your dataset. Does your recommender seem accurate?

In [None]:
class DotProduct(Module):
    """Matrix-factorisation model: the prediction is the dot product of a
    user embedding and an item embedding of equal width.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the item embedding table.
        n_features: Width shared by both embedding tables.
    """

    def __init__(self, n_users, n_movies, n_features):
        self.user_embedding = Embedding(n_users, n_features)
        self.movie_embedding = Embedding(n_movies, n_features)

    def forward(self, x):
        """Return one score per ``(user index, item index)`` row of ``x``."""
        user_vecs = self.user_embedding(x[:, 0])
        movie_vecs = self.movie_embedding(x[:, 1])
        # Element-wise product summed over the feature axis == row-wise dot product.
        return torch.sum(user_vecs * movie_vecs, dim=1)

In [None]:
# Embedding table sizes come from the DataLoaders' vocabularies. The item
# column of `dls` is "title" (there is no "ID" entry in dls.classes), so the
# item count must be read from that key.
n_users = len(dls.classes["user"])
n_movies = len(dls.classes["title"])
n_features = 50

model = DotProduct(n_users, n_movies, n_features)
learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=rmse)

# If an epoch takes too long, subsample the ratings further or raise the
# batch size instead of interrupting the run.
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,_rmse,time


KeyboardInterrupt: 

In [None]:
# Look at the raw, un-sampled ratings file. It is loaded under its own name so
# the merged `rating` frame (the one carrying the "title" column) is not
# overwritten, and only the first rows are displayed.
rating_raw = pd.read_csv("rating_complete.csv")
rating_raw.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9
...,...,...,...
57633273,353404,502,8
57633274,353404,987,4
57633275,353404,225,8
57633276,353404,243,7


In [None]:
# Getting a list of some of the unique titles in the dataset.
# The titles are read from the DataLoaders' vocabulary: these are exactly the
# titles the model has an embedding for, and the lookup does not depend on
# which columns the `rating` frame currently has.
# NOTE(review): index 0 is presumably fastai's "#na#" placeholder, which the
# slice skips - confirm.
dls.classes["title"][1:10]

In [None]:
# This is the item's name that I'm looking up. It has to be a title that
# exists in this (anime) dataset; pick any entry from the title vocabulary.
itemname = "Cowboy Bebop"

In [None]:
# Grab the item (title) embeddings of the trained model. The models defined in
# this notebook store them in `movie_embedding`; they have no `embeds` list.
weights = learn.model.movie_embedding.weight

# Determine where this item is in the title vocabulary. Fail loudly on an
# unknown title instead of silently comparing against some other row.
title_vocab = dls.classes['title']
if itemname not in title_vocab.o2i:
    raise KeyError(f"{itemname!r} is not a title in this dataset")
idx = title_vocab.o2i[itemname]

# Cosine similarity between the chosen item's vector and every item vector.
distances = nn.CosineSimilarity(dim=1)(weights, weights[idx][None])

# Sort from most to least similar. Position 0 is the item itself (similarity
# 1), so skip it and keep the next ten.
top10 = distances.argsort(descending=True)[1:11]
for title_idx in top10.tolist():
    print(title_vocab[title_idx])