In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries. The download
# loop below URL-decodes each entry and extracts the archive into KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires query parameters, so it looks
# like a time-limited signed link — presumably expired by now; confirm before relying on it.
DATA_SOURCE_MAPPING = 'netflix-prize-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1636%2F792972%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240829%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240829T141321Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D306c4ec68c9b4358ae4b02deb48e48cab0a3cc121784de97d17492d3640c897293acb8ebc9d7765bc9c44d58904b3a900289905c0313ed5462e1b41bc8ded42f25b74b7c29712f7c407bacc988972c41e09e43c63aaf32e53215b915f246b10aae426b2ea7ce13addf04c67d26148e8cc7d11646d96b5cf7de91c9c93acb4f1b20902d5397d1a37a2c60a85c0876938fbf8e8dc8cb71c1766b37083908ba54b6df89935c4764640d12339746c24005a2b35ec49a4936f3a6dac0a37c21e20cd5aa8c0c9852175b798d43a23fb23229689ca5e674c678b9c3f9add153276e0f8e0083dfdc1bcbb6e9900f589609280d37a0c3a30919e7e0ccb8a7fa2fcdece4eb'

# Root directory under which every downloaded data source is extracted.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory; created during setup and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<URL-encoded download URL>". Split on the first
    # colon only, so a stray colon in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be missing or malformed; in that case report
            # the running byte count without a bar instead of crashing on int().
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so a body longer than advertised cannot overrun the bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the imported module for the rest of the session.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede the OSError handler: HTTPError is an OSError subclass.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the input directory,
# then print one path per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/netflix-prize-data/combined_data_3.txt
/kaggle/input/netflix-prize-data/movie_titles.csv
/kaggle/input/netflix-prize-data/combined_data_4.txt
/kaggle/input/netflix-prize-data/combined_data_1.txt
/kaggle/input/netflix-prize-data/README
/kaggle/input/netflix-prize-data/probe.txt
/kaggle/input/netflix-prize-data/combined_data_2.txt
/kaggle/input/netflix-prize-data/qualifying.txt


## Part 1: Data

We iterate through the text files and prepare the matrix M.

In [None]:
import re
import tqdm

# A movie header line is the numeric movie id followed by a colon.
movie_expression = re.compile(r"(\d+):")


def parse(lines: list) -> list:
    """Convert raw data lines into ``(movie_id, user_id)`` pairs.

    The input mixes header lines of the form ``<movie_id>:`` with data lines
    made of three comma-separated fields, of which only the first (the user
    id) is used. Every data line is attributed to the most recent header.

    Args:
        lines: Text lines to parse; surrounding whitespace and trailing
            newlines are ignored, and blank lines are skipped.

    Returns:
        A list of ``(movie_id, user_id)`` integer tuples in input order.

    Raises:
        ValueError: If a data line appears before any movie header, or a data
            line does not consist of exactly three comma-separated fields.
    """
    movie_id = None
    movies_and_users = []
    for line in tqdm.tqdm(lines):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no data.
            continue
        # Require the WHOLE line to be "<digits>:" so that a data line which
        # merely contains a colon is not mistaken for a header.
        is_movie = movie_expression.fullmatch(line)
        if is_movie:
            # Convert once per header rather than once per data line.
            movie_id = int(is_movie.group(1))
            continue
        if movie_id is None:
            raise ValueError(f"data line {line!r} appears before any movie header")
        user_id, _, _ = line.split(',')
        movies_and_users.append((movie_id, int(user_id)))
    return movies_and_users

In [None]:
from itertools import islice

# Upper bound on the number of leading lines parsed from each file.
# NOTE(review): 260578 is carried over unchanged; the reason for this exact
# value is not visible in the notebook.
MAX_LINES_PER_FILE = 260578

files = [
    "/kaggle/input/netflix-prize-data/combined_data_3.txt",
#     "/kaggle/input/netflix-prize-data/combined_data_4.txt",
#     "/kaggle/input/netflix-prize-data/combined_data_1.txt",
#     "/kaggle/input/netflix-prize-data/combined_data_2.txt"
]
movies_and_users = []
for f in files:
    print(f'processing file {f}')
    with open(f, "r") as raw_text:
        # Stop reading once the cap is reached instead of loading the whole
        # file into memory and discarding most of it.
        lines = list(islice(raw_text, MAX_LINES_PER_FILE))
    movies_and_users.extend(parse(lines))
    print(f'completed processing file {f}')


processing file /kaggle/input/netflix-prize-data/combined_data_3.txt


100%|██████████| 260578/260578 [00:01<00:00, 144890.65it/s]

completed processing file /kaggle/input/netflix-prize-data/combined_data_3.txt





In [None]:
# Transpose the list of (movie_id, user_id) pairs into two parallel tuples.
movies, users = tuple(zip(*movies_and_users))

[(9211, 1277134), (9211, 2435457), (9211, 2338545), (9211, 2218269), (9211, 441153), (9211, 1921624), (9211, 2096652), (9211, 818736), (9211, 284560), (9211, 1211224)]


In [None]:
# Show the head of the pair list and of each unzipped column, one per line.
for preview in (movies_and_users[:10], movies[:5], users[:5]):
    print(preview)

[(9211, 1277134), (9211, 2435457), (9211, 2338545), (9211, 2218269), (9211, 441153), (9211, 1921624), (9211, 2096652), (9211, 818736), (9211, 284560), (9211, 1211224)]
(9211, 9211, 9211, 9211, 9211)
(1277134, 2435457, 2338545, 2218269, 441153)


In [None]:
# Distinct ids in ascending order; an id's position in the list is its matrix index.
unique_movies = sorted(set(movies))
unique_users = sorted(set(users))

# Lookup tables from raw id to contiguous 0-based index.
movie_to_idx = dict(zip(unique_movies, range(len(unique_movies))))
user_to_idx = dict(zip(unique_users, range(len(unique_users))))

# Re-express every observed pair in index space.
normalised_movies = list(map(movie_to_idx.__getitem__, movies))
normalised_users = list(map(user_to_idx.__getitem__, users))
normalised_movies_and_users = list(zip(normalised_movies, normalised_users))

# Dense movies x users matrix, all zeros at this point.
M = torch.zeros(len(unique_movies), len(unique_users))

In [None]:
# Shape check: M was built as (number of distinct movies, number of distinct users).
M.shape

torch.Size([55, 142475])

In [None]:
# Mark every observed (movie, user) pair with a single vectorised assignment
# instead of issuing one tensor write per pair from a Python loop.
# The index tensors are created on M's device so the cell also works if M has
# already been moved to the GPU.
movie_rows = torch.tensor(normalised_movies, dtype=torch.long, device=M.device)
user_cols = torch.tensor(normalised_users, dtype=torch.long, device=M.device)
M[movie_rows, user_cols] = 1

NameError: name 'estimation' is not defined

# Recommendation

This is hacking the Netflix prize dataset to do a slightly different task: predicting what movies users will be interested in. This is **not** ratings based.

There are two methods I'll explore in this notebook. The first is matrix factorization. The second is a two-tower DNN approach to recommendation.

## Matrix Factorization

This involves generating a matrix $M$ of all films and all viewers, with the value at each index *i,j* indicating whether film *i* was watched by user *j* (1 if watched, 0 otherwise). This matrix is factorised into two smaller matrices $F$ and $U$, which hold embeddings for films and users respectively, so that $M \approx F U^\top$. One classic way to fit them is to iteratively hold each matrix constant while we tune the other to better match the results in $M$; the implementation below instead updates both embeddings jointly with a gradient-based optimiser.


In [None]:
embedding_dimension = 200

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fix the seed so the random initialisation (and therefore the loss curve) is
# reproducible across kernel restarts.
torch.manual_seed(0)

# One embedding row per user / per movie, initialised uniformly in [0, 1).
user_embedding = torch.rand(len(unique_users), embedding_dimension, requires_grad=True, device=device)
movie_embedding = torch.rand(len(unique_movies), embedding_dimension, requires_grad=True, device=device)

# A single optimiser updates both embedding tables.
optim = torch.optim.AdamW([user_embedding, movie_embedding], lr=0.01)

In [None]:
# Print the movie embedding tensor for inspection.
print(movie_embedding)

tensor([[0.0244, 0.4842, 0.5832,  ..., 0.1868, 0.6940, 0.9109],
        [0.9169, 0.1311, 0.5093,  ..., 0.3601, 0.9317, 0.1824],
        [0.6454, 0.2390, 0.4221,  ..., 0.5914, 0.7822, 0.6516],
        ...,
        [0.6624, 0.2916, 0.2972,  ..., 0.7648, 0.3581, 0.3114],
        [0.2482, 0.5220, 0.0044,  ..., 0.9718, 0.8334, 0.6170],
        [0.2406, 0.1570, 0.2425,  ..., 0.6509, 0.1521, 0.5417]],
       device='cuda:0', requires_grad=True)


In [None]:
# Keep M on the same device as the embeddings it is compared against, rather
# than hard-coding a device name.
M = M.to(movie_embedding.device)

In [None]:
def training_loop(display, unobserved_weight=0.005):
    """Run one optimisation step of the matrix factorisation.

    Relies on the notebook-level globals ``movie_embedding``,
    ``user_embedding``, ``M`` and ``optim``.

    The loss is the mean over all entries of the squared reconstruction error,
    where entries selected by ``M`` count fully and entries selected by
    ``1 - M`` are scaled by ``unobserved_weight``.

    Args:
        display: When truthy, print the loss of this step.
        unobserved_weight: Scale applied to the squared error on the entries
            selected by ``1 - M``. Defaults to 0.005.

    Returns:
        The loss of this step as a detached 0-dim tensor. It is returned as a
        tensor so that ``.item()`` is only called when ``display`` is set.
    """
    estimation = movie_embedding @ user_embedding.T
    difference = M - estimation
    # Error restricted to the entries selected by M.
    loss_1 = (difference * M) ** 2
    # Error restricted to the complementary entries.
    non_observation_mask = 1 - M
    loss_2 = (difference * non_observation_mask) ** 2
    loss = (loss_1 + (unobserved_weight * loss_2)).mean()
    loss.backward()
    optim.step()
    # Clear gradients so they do not accumulate into the next step.
    optim.zero_grad()
    if display:
        print(loss.item())
    return loss.detach()

In [None]:
# Run 500 optimisation steps, printing the loss on every 100th step.
# The flag is passed inline: a notebook-level variable named `display` would
# shadow IPython's built-in `display` helper for every later cell.
for step in range(500):
    training_loop(step % 100 == 0)


64010020.0
1240736.5
365609.1875
132669.453125
54906.296875


In [None]:
# Column 0 of M: one entry per movie for the user mapped to index 0.
M[:,0]

tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.], device='cuda:0')

In [None]:
# Predicted scores of every movie for the user at index 0. Multiplying by that
# single user's embedding avoids building the full movies x users product, and
# detach() keeps this inspection out of the autograd graph.
(movie_embedding @ user_embedding[0]).detach()

tensor([ 4.2348e-02, -2.0404e-01,  8.2759e-02, -6.9402e-02,  2.2081e-02,
        -6.3062e-02,  9.9884e-01, -8.6629e-02,  8.5866e-02,  4.3168e-02,
         5.0588e-02, -2.7901e-02,  1.5058e-02,  1.0723e-02,  1.2094e-01,
        -5.1137e-02,  4.0411e-03, -1.6790e-01,  1.0391e-02,  3.9309e-02,
        -7.4312e-02,  5.5644e-03,  1.5956e-01,  6.8946e-02,  3.0017e-03,
         5.7406e-02,  2.7226e-02, -1.4134e-02,  1.6059e-02, -2.6143e-02,
         3.5589e-02, -2.7440e-02, -1.1342e-01,  1.9635e-02,  3.6988e-03,
         1.0292e-01, -1.0080e-02, -5.2544e-02, -1.3273e-02,  3.9138e-02,
        -7.5212e-02, -6.7361e-02,  4.6747e-02,  1.0001e+00, -1.0015e-01,
         1.5583e-01, -7.0365e-02, -1.2481e-01, -1.6555e-02,  3.5837e-02,
         6.2184e-02,  8.5819e-04,  1.8572e-03, -1.1998e-02,  3.9017e-02],
       device='cuda:0', grad_fn=<SelectBackward0>)

### DNNs