In [2]:
# import our dependencies
import torch
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tqdm

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.cluster import KMeans

import sqlite3
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

from flask import Flask, jsonify, render_template, request

import warnings
warnings.filterwarnings('ignore')

In [3]:
# sqlite3 df interact
con = sqlite3.connect("movie_ratings_db.sqlite")
links_df = pd.read_sql_query("SELECT * FROM Links", con)
movies_df = pd.read_sql_query("SELECT * FROM Movies", con)
ratings_df = pd.read_sql_query("SELECT * FROM Ratings", con)
tags_df = pd.read_sql_query("SELECT * FROM Tags", con)

# map movie titles to Ids using to_dict
movie_titles = movies_df.set_index('movieId')['title'].to_dict()

In [4]:
# confirm length of ratings_df, need det for users and movies
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print(f'', {n_users})
print(f'', {n_items})

 {610}
 {9724}


In [5]:
# weight matrix, matrix1 will use n_users, matrix2 will use n_items, 
# overlap will be n_users to movies that they have rated
# class inherits torch.nn, further integration into native pytorch
class MatrixFactorization(torch.nn.Module):
    # factors default to 20, pair process of users and movies
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # unique_user embedding, initialized as lookup tables
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # unique_movie embedding, ""
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        
        # lookup tables, embeddings, initialized small random values 
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # data[] for movie indicies, takes user/movie pairs rep as index
        users, items = data[:,0], data[:,1].long()
        # dot product : user/movie position mult then sum into factor with axis=1
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    def predict(self, user, item):
            return self.forward(user, item)

In [6]:
# data loader creation, important part of pytorch?
class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()
        
        # Extract all user IDs and movie IDs
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()
        
        # unique values dict, pair to index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}
        
        # append continuous ID for users and movies as .items in dict
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}
        
        # baseball bat
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])
        
        # remove timestamp and rating, we got what we need
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        # convert x and y to tensor for model application
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [7]:
# matrix model, set epoch to 100
model = MatrixFactorization(n_users, n_items, n_factors=8)
model = model.to(torch.float32)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Mean Squared Error loss function
loss_fn = torch.nn.MSELoss()

# adam optimize
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# train data load and set
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=100, shuffle=True)

MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0270, 0.0275, 0.0194,  ..., 0.0058, 0.0134, 0.0433],
        [0.0206, 0.0354, 0.0006,  ..., 0.0368, 0.0102, 0.0129],
        [0.0487, 0.0023, 0.0120,  ..., 0.0005, 0.0307, 0.0201],
        ...,
        [0.0396, 0.0146, 0.0344,  ..., 0.0173, 0.0277, 0.0314],
        [0.0362, 0.0068, 0.0135,  ..., 0.0139, 0.0390, 0.0479],
        [0.0481, 0.0361, 0.0400,  ..., 0.0399, 0.0219, 0.0306]])
item_factors.weight tensor([[2.5792e-02, 9.3603e-03, 4.2609e-03,  ..., 4.8535e-02, 2.5572e-02,
         1.5382e-02],
        [3.3251e-02, 1.0445e-02, 1.2220e-02,  ..., 2.3063e-02, 1.9611e-02,
         2.9625e-02],
        [1.2812e-02, 1.3770e-02, 7.5345e-03,  ..., 6.0316e-03, 4.2613e-02,
         1.5097e-02],
        ...,
        [4.2221e-02, 1.2776e-02, 2.9391e-02,  ..., 2.9169e-03, 3.1951e-02,
         1.1925e-02],
        [3.6145e-02, 9.7863e-03, 1.6592e-02,  ..., 2.9972e-02, 3.

In [11]:
# Training loop
epochs = 20

for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
                # Zero the gradients
                optimizer.zero_grad()
                # Forward pass
                data = data.to(torch.int)
                output = model(data)
                # Compute loss
                target = target.to(torch.float)
                loss = loss_fn(output, target)
                # Backpropagation
                loss.backward()
                optimizer.step()
        # Track total loss
        total_loss += loss.item()

        # Print average loss per epoch
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(train_loader)}")


Epoch [1/20], Loss: 0.00033244032807818016
Epoch [2/20], Loss: 0.00046850819181285365
Epoch [3/20], Loss: 0.0005380677278970467
Epoch [4/20], Loss: 0.0002825755193991987
Epoch [5/20], Loss: 0.0004807516297452868
Epoch [6/20], Loss: 0.00027337555133906535
Epoch [7/20], Loss: 0.00037403010400482875
Epoch [8/20], Loss: 0.00034668846692745937
Epoch [9/20], Loss: 0.0003704183519890807
Epoch [10/20], Loss: 0.00027101026890417277
Epoch [11/20], Loss: 0.00037352349406073184
Epoch [12/20], Loss: 0.00018202540777601501
Epoch [13/20], Loss: 0.00023443711879113976
Epoch [14/20], Loss: 0.0002986920187092867
Epoch [15/20], Loss: 0.0003527723939496769
Epoch [16/20], Loss: 0.00045731025715649304
Epoch [17/20], Loss: 0.00036901398031161017
Epoch [18/20], Loss: 0.00033267133299730225
Epoch [19/20], Loss: 0.00019754175506569112
Epoch [20/20], Loss: 0.0003787728674466121


In [12]:
# By training the model, we will have tuned latent factors for movies and users.
c = 0
uw = 0
iw = 0 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
            uw = param.data
            c +=1
    else:
        iw = param.data

user_factors.weight tensor([[ 1.1092,  1.4594,  1.7066,  ...,  1.1963,  1.5360,  0.8710],
        [ 0.1186,  0.3997,  0.9254,  ...,  1.1983,  1.0499,  2.4176],
        [ 1.7261,  0.8074,  1.2782,  ..., -0.9178, -2.2086,  3.0694],
        ...,
        [ 0.4656,  0.5721,  2.1031,  ..., -0.4403,  0.5699,  0.8622],
        [ 1.2982,  0.4542,  0.7090,  ...,  0.5769,  1.4522,  1.3640],
        [ 2.0519,  0.8338,  0.9757,  ...,  1.1608,  0.6761,  0.8222]])
item_factors.weight tensor([[0.4844, 0.4523, 0.1260,  ..., 0.6972, 0.6792, 0.5366],
        [0.5464, 0.6324, 0.2943,  ..., 0.0963, 0.4942, 0.9471],
        [0.7219, 0.2649, 0.2737,  ..., 0.2143, 0.4357, 0.6900],
        ...,
        [0.3822, 0.3513, 0.3700,  ..., 0.3422, 0.3734, 0.3508],
        [0.4042, 0.3767, 0.3844,  ..., 0.3981, 0.3658, 0.4175],
        [0.3966, 0.4464, 0.4257,  ..., 0.4104, 0.4269, 0.4391]])


In [13]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()
len(trained_movie_embeddings)

9724

In [21]:
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=12, random_state=42).fit(trained_movie_embeddings)

group_dict = {}

for cluster in range(10):
    print("User_Group #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        movid = train_set.idx2movieid[movidx]
        rate_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
        movs.append((movie_titles[movid], rate_count))
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

User_Group #0
	 Independence Day (a.k.a. ID4) (1996)
	 Stargate (1994)
	 Star Wars: Episode I - The Phantom Menace (1999)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 X-Men (2000)
	 Twister (1996)
	 Spider-Man (2002)
	 Rock, The (1996)
	 V for Vendetta (2006)
	 Matrix Reloaded, The (2003)
User_Group #1
	 Godzilla (1998)
	 I Still Know What You Did Last Summer (1998)
	 Superman IV: The Quest for Peace (1987)
	 Nutty Professor II: The Klumps (2000)
	 Karate Kid, Part III, The (1989)
	 House on Haunted Hill (1999)
	 Kazaam (1996)
	 Ghost Rider (2007)
	 Flintstones in Viva Rock Vegas, The (2000)
	 Stop! Or My Mom Will Shoot (1992)
User_Group #2
	 Terminator 2: Judgment Day (1991)
	 Mask, The (1994)
	 Terminator, The (1984)
	 Babe (1995)
	 Aliens (1986)
	 Blade Runner (1982)
	 Outbreak (1995)
	 Ace Ventura: When Nature Calls (1995)
	 Starship Troopers (1997)
	 RoboCop (1987)
User_Group #3
	 Star Wars: Episode IV - A New Hope (1977)
	 Toy Story (1995)
	 Shrek (2001)
	 Princess Bride, The (1987