In [1]:
import pandas as pd
import collections
import numpy as np
from tqdm.notebook import tqdm, trange
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cuda:0")
# device = torch.device("cpu")

# A simpler version of the Two-tower Model

[Sampling-Bias-Corrected Neural Modeling for Large Corpus
Item Recommendations](https://research.google/pubs/pub48840/)


# Goal

**Given user information and movie information, predict how would the user rate the moive (1-5).**

# Feature description
There are categorical features (C) and dense features (D). For each categorical feature, I create an embedding layer. Alternatively, for genres, each movie can have more than one genre. So it a bag of categorical feature. I create an create an embeddingbag layer for it. 

For each table, avaiable information includes:

user table:  
- userID (C)
- gender (C)
- age (D)
- occupation (C)
- zip (not used)

moive table:  
- movieID (C)
- title (not used)
- genres (bag of C)

rating table:  
- userID (C)  foreign key to user.userID
- movieID (C)  foreign key to moive.movieID
- rating (D)  target variable
- time (used to generate history)
- history (generated) according to the timestamp, record user's watch history. Only keeping the last 20 movieids for effcient purpose.

# Data loading and transformations

In [3]:
df_user = pd.read_csv('./ml-1m/users.dat', sep='::', engine = 'python', names=['userid', 'gender', 'age', 'occupation', 'zip'], header=None)
df_movie = pd.read_csv('./ml-1m/movies.dat', sep='::', engine = 'python', names=['movieid', 'title', 'genres'], header=None, encoding = "ISO-8859-1")
df_rating = pd.read_csv('./ml-1m/ratings.dat', sep='::', engine = 'python', names=['userid', 'movieid', 'rating', 'time'], header=None)


In [4]:
df_user.head(2)

Unnamed: 0,userid,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072


In [5]:
df_movie.head(2)

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [6]:
df_rating.head(2)

Unnamed: 0,userid,movieid,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109


In [7]:
# transform df_user['age'] between 0 and 1
age = df_user['age'].values.reshape(-1,1)
scaler = preprocessing.MinMaxScaler() 
scaled_age = scaler.fit_transform(age)
df_user['age'] = scaled_age

In [8]:
def toCategoricalId(df, column):
    le = preprocessing.LabelEncoder()
    df[column+'id'] = le.fit_transform(df[column])
    return le

In [9]:
# process other columns in df_user
toCategoricalId(df_user, 'userid')
toCategoricalId(df_user, 'gender')
toCategoricalId(df_user, 'occupation')


LabelEncoder()

In [10]:
# process df_movie['genres'] column
# the genres column is special. 
# Each movie has at least 1 genre and at most 6 (in this particular dataset).
# Either pick the first genre or make it a list of genres and take average genre embedding.
le = preprocessing.LabelEncoder()
le.fit([item for sublist in df_movie['genres'].str.split('|') for item in sublist])
genresids = []
for genres in df_movie['genres'].str.split('|'):
    genresids.append(le.transform(genres).tolist())
df_movie['genresids'] = genresids

In [11]:
num_genres=len(le.classes_)
num_genres

18

In [12]:
# process df_movie['title'] column
# I notice the title always comes with the release year, which is also helpful information
name_and_year = df_movie['title'].str.findall(r'(.*)\((.*)\)').map(lambda x: list(x[0]))
name_and_year = np.array(name_and_year.values.tolist())
df_movie['puretitle'] = name_and_year[:,0]
df_movie['year'] = name_and_year[:,1]
df_movie['year'] = df_movie['year'].map(int)

year = df_movie['year'].values.reshape(-1,1)
scaler = preprocessing.MinMaxScaler() 
scaled_year = scaler.fit_transform(year)
df_movie['year'] = scaled_year

In [13]:
le = toCategoricalId(df_movie, 'movieid') # use the same le to transform movieid
df_rating['movieid'+'id'] = le.transform(df_rating['movieid'])

In [14]:
# sort by time
df_rating = df_rating.sort_values(by=['userid','time'])
df_rating = df_rating.reset_index(drop=True)

In [15]:
histories = []
curr_user = -1
curr_history = []
for i in trange(len(df_rating)):
    row = df_rating.iloc[i]
    histories.append(curr_history)
    if curr_user != row['userid']:
        curr_user = row['userid']
        curr_history = []
    curr_history = curr_history.copy()
    curr_history.append(row['movieidid'])

df_rating['history'] = histories

  0%|          | 0/1000209 [00:00<?, ?it/s]

In [16]:
df_rating

Unnamed: 0,userid,movieid,rating,time,movieidid,history
0,1,3186,4,978300019,3117,[]
1,1,1270,5,978300055,1250,[3117]
2,1,1721,4,978300055,1672,"[3117, 1250]"
3,1,1022,5,978300055,1009,"[3117, 1250, 1672]"
4,1,2340,3,978300103,2271,"[3117, 1250, 1672, 1009]"
...,...,...,...,...,...,...
1000204,6040,2917,4,997454429,2848,"[847, 589, 2315, 1892, 1950, 569, 1395, 211, 3..."
1000205,6040,1921,4,997454464,1852,"[847, 589, 2315, 1892, 1950, 569, 1395, 211, 3..."
1000206,6040,1784,3,997454464,1726,"[847, 589, 2315, 1892, 1950, 569, 1395, 211, 3..."
1000207,6040,161,3,997454486,159,"[847, 589, 2315, 1892, 1950, 569, 1395, 211, 3..."


In [17]:
# merge together
# transform ratings into the range of [0,1]
df = pd.merge(pd.merge(df_rating, df_user, on=['userid']), df_movie, on=['movieid'])
df.drop(columns=['userid','movieid', 'gender', 'occupation', 'zip', 'genres', 'time', 'title'], inplace=True)
df['rating'] = df['rating'].map(lambda x: (x-1.)/4.)

In [18]:
df = df.drop(columns=['movieidid_y'])
df = df.rename(columns={"movieidid_x": "movieidid"})

In [19]:
df.head(3)

Unnamed: 0,rating,movieidid,history,age,useridid,genderid,occupationid,genresids,puretitle,year
0,0.75,3117,[],0.0,0,0,10,[7],"Girl, Interrupted",0.987654
1,0.75,3117,"[1192, 109, 896, 3412, 1533, 476, 585, 389, 19...",0.436364,7,1,12,[7],"Girl, Interrupted",0.987654
2,0.5,3117,"[3032, 523, 1668, 3327, 2748, 1192, 1178, 1207...",0.8,32,1,3,[7],"Girl, Interrupted",0.987654


In [92]:
df['rating'] = df['rating']*4+1

In [28]:
# df.to_csv('combined.csv')

In [93]:
num_gender = df['genderid'].max() + 1
num_user = df['useridid'].max() + 1
num_occupation= df['occupationid'].max() + 1
num_movie = df['movieidid'].max() + 1
num_ratings = len(df)

In [94]:
num_gender, num_user, num_occupation, num_movie, num_genres, num_ratings

(2, 6040, 21, 3883, 18, 1000209)

# Train/test split, customize dataset and dataloader

In [95]:
# 0.8, 0.2 split
train_df=df.sample(frac=0.8) 
test_df=df.drop(train_df.index)

In [96]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        data = self.X.iloc[idx].to_dict()
        label = self.Y.iloc[idx]
        return data, label

In [97]:
# a helper function for collate_fn used in DataLoader
# this function convert list into torch.Tensor and also move it to target device
# it also pads 'num_genres' to make genresids a valid tensor. 'num_genres' will be skipped in the EmbeddingBag layer
def df2pt(dic):
    for k in dic:
        if k == 'genresids':
            batch = dic[k]
            max_cols = max([len(row[-20:]) for row in batch])
            padded = []
            for i in range(len(batch)):
                padded.append( batch[i][-20:] + [num_genres]*(max_cols-len(batch[i][-20:])))
            dic[k] = torch.LongTensor(padded).to(device)
        elif  k =='history':
            batch = dic[k]
            max_cols = max([len(row[-20:]) for row in batch])
            padded = []
            for i in range(len(batch)):
                padded.append( batch[i][-20:] + [num_movie]*(max_cols-len(batch[i][-20:])))
            dic[k] = torch.LongTensor(padded).to(device)
        elif 'id' in k:
            dic[k] = torch.LongTensor(dic[k]).to(device)
        else:
            dic[k] = torch.Tensor([dic[k]]).T.to(device)
    return dic

In [98]:
# self define how to load each batch
def collate_fn(batch):
    batch_size = len(batch)
    batch_X = collections.defaultdict(list)
    batch_Y = []
    
    for i in range(batch_size):
        data, label = batch[i]
        for key in data:
            batch_X[key].append(data[key])
        batch_Y.append(label)
    batch_X = df2pt(batch_X)
    batch_Y = torch.Tensor(batch_Y).to(device)
    return batch_X, batch_Y


In [99]:
# train_dataset and train_dataloader
# X = train_df.drop(columns=['rating']) # title can be put into bert and get a feature vector
X = train_df.drop(columns=['rating','puretitle'])
Y = train_df['rating']
train_dataset = CustomDataset(X,Y)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)


In [100]:
# test_dataset and test_dataloader
# test_X = test_df.drop(columns=['rating']) # title can be put into bert and get a feature vector
test_X = test_df.drop(columns=['rating','puretitle'])
test_Y = test_df['rating']
test_dataset = CustomDataset(test_X,test_Y)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)


# model 

In [125]:
class TwoTower(nn.Module):
    def __init__(self, fclayers = 2, activation = 'relu'):
        super().__init__()
        assert fclayers in [1,2], 'fclayers not implemented'
        self.fclayers = fclayers
        assert activation in ['relu','tanh', 'sigmoid'], 'activation not implemented'
        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        else:
            self.activation = nn.Sigmoid()
            
        # user tower
        self.userid_emb_layer = nn.Embedding(num_user,100)
        self.gender_emb_layer = nn.Embedding(num_gender,2)        
        self.age_layer = nn.Linear(1,1)        
        self.occupation_emb_layer = nn.Embedding(num_occupation,4) 
        self.user_fc1 = nn.Linear(207, 64) # 100 + 2 + 1 + 4 + 100
        self.user_fc2 = nn.Linear(64, 32)
        
        # movie tower
        self.movieid_emb_layer = nn.Embedding(num_movie+1,100, padding_idx=num_movie)
        self.genre_emb_layer = nn.EmbeddingBag(num_genres+1, 4, mode = 'mean', padding_idx=num_genres)
        self.year_layer = nn.Linear(1,1)
        self.movie_fc1 = nn.Linear(105, 64)
        self.movie_fc2 = nn.Linear(64, 32)
        
    def forward(self, useridid, genderid, age, history, occupationid, genresids, movieidid, year):
        # user infos
        userid_emb = self.userid_emb_layer(useridid)
        gender_emb = self.gender_emb_layer(genderid)
        age_emb = self.age_layer(age)        
        occupation_emb = self.occupation_emb_layer(occupationid)  
        
        # self made embeddingbag for history
        history_embs = self.movieid_emb_layer(history)
        batch_history_count = torch.clip((history != num_movie).sum(axis = 1), min=1.).repeat(100, 1).T
        history_emb = history_embs.sum(axis = 1)/batch_history_count
        
        user_emb = torch.cat((userid_emb, gender_emb, age_emb, occupation_emb, history_emb), -1)
        
        # movie infos
        movieid_emb = self.movieid_emb_layer(movieidid)
        genre_emb = self.genre_emb_layer(genresids)
        year_emb = self.year_layer(year)
        
        movie_emb = torch.cat((movieid_emb, genre_emb, year_emb), -1)
        
        if self.fclayers == 2:
            user_emb = self.activation(self.user_fc1(user_emb))
            user_emb = self.activation(self.user_fc2(user_emb))
            movie_emb = self.activation(self.movie_fc1(movie_emb))
            movie_emb = self.activation(self.movie_fc2(movie_emb))
        elif self.fclayers == 1:
            user_emb = self.activation(self.user_fc1(user_emb))
            movie_emb = self.activation(self.movie_fc1(movie_emb))
        else:
            user_emb = self.activation(user_emb)
            movie_emb = self.activation(movie_emb)
        # l2 norm, dot product, and output
#         user_emb = (user_emb.T/torch.norm(user_emb, dim=1)).T
#         movie_emb = (movie_emb.T/torch.norm(movie_emb, dim=1)).T
        
        out = torch.sum(user_emb*movie_emb, -1)
#         out = self.activation(out)
#         out = torch.sigmoid(out)
        out = torch.relu(out)
        
        return out

In [126]:
twotower = TwoTower(fclayers = 2, activation = 'relu')
twotower = twotower.to(device)

In [127]:
loss_fn = nn.MSELoss()
optimizer = optim.SGD(twotower.parameters(), lr=0.002, momentum=0.05)
# optimizer = optim.Adam(twotower.parameters(), lr=0.005)
# optimizer = optim.RMSprop(twotower.parameters(), lr=0.003)

In [128]:
train_losses = []
test_losses = []
for i_batch, sample_batched in enumerate(train_dataloader):
    data, label = sample_batched
    y_pred = twotower(**data)
    loss = loss_fn(y_pred, label)
    train_losses.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if i_batch%500 == 0:
        print(f'iter: {i_batch}/{len(train_dataloader)}, tr_loss: {loss}')    

iter: 0/12503, tr_loss: 13.728534698486328
iter: 500/12503, tr_loss: 1.2019962072372437
iter: 1000/12503, tr_loss: 1.4604352712631226
iter: 1500/12503, tr_loss: 1.0942646265029907
iter: 2000/12503, tr_loss: 1.2681233882904053
iter: 2500/12503, tr_loss: 0.8448672294616699
iter: 3000/12503, tr_loss: 1.1146981716156006
iter: 3500/12503, tr_loss: 0.9369009733200073
iter: 4000/12503, tr_loss: 0.9921917915344238
iter: 4500/12503, tr_loss: 0.8940247297286987
iter: 5000/12503, tr_loss: 0.8739302754402161
iter: 5500/12503, tr_loss: 1.012089729309082
iter: 6000/12503, tr_loss: 1.056225299835205
iter: 6500/12503, tr_loss: 0.6917968392372131
iter: 7000/12503, tr_loss: 1.1450015306472778
iter: 7500/12503, tr_loss: 0.9388798475265503
iter: 8000/12503, tr_loss: 0.9819841384887695
iter: 8500/12503, tr_loss: 1.507707118988037
iter: 9000/12503, tr_loss: 0.7535656690597534
iter: 9500/12503, tr_loss: 0.869304895401001
iter: 10000/12503, tr_loss: 0.805438756942749
iter: 10500/12503, tr_loss: 0.953908324241

In [130]:
twotower.eval()
test_loss = 0
test_size = 0
for j_batch, test_batch in enumerate(test_dataloader):
    data, label = test_batch
    y_pred = twotower(**data)
    loss = loss_fn(y_pred, label)
    if not torch.isnan(loss):
        test_loss += loss * len(label)
        test_size += len(label)
avg_test_loss = test_loss / test_size
print('test_loss',avg_test_loss.item())

test_loss 1.0008960962295532


In [131]:
label, y_pred

(tensor([4., 4., 2., 4., 3., 4., 4., 5., 4., 3., 3., 4., 3., 1., 2., 4., 4., 2.,
         4., 5., 1., 4., 5., 5., 4., 3., 4., 5., 3., 3., 3., 1., 4., 4., 5., 3.,
         3., 4., 2., 4., 3., 4.], device='cuda:0'),
 tensor([3.7039, 3.1175, 3.3328, 3.3899, 2.7761, 4.1400, 2.9915, 4.3202, 4.1007,
         3.6889, 3.0198, 4.0283, 3.7854, 3.0865, 3.9929, 3.5580, 3.7590, 2.8136,
         2.9202, 3.6059, 2.9749, 3.3672, 4.2273, 3.5313, 4.3427, 3.0899, 4.0757,
         3.9177, 3.0930, 3.0356, 3.0852, 4.1807, 3.6799, 3.0795, 4.1536, 3.1146,
         3.3371, 3.1619, 3.9560, 3.2734, 3.7010, 3.4955], device='cuda:0',
        grad_fn=<ReluBackward0>))

In [132]:
loss_fn(y_pred,label)

tensor(1.0501, device='cuda:0', grad_fn=<MseLossBackward0>)

# Discussion and improvements/thinking

It achieves 1.0501 MSE error on testset. The best model on [paperwithcode](https://paperswithcode.com/sota/collaborative-filtering-on-movielens-1m) reports 0.8227 RMSE. So it seems that this model is not bad for my first try!

There are many potential improvements.

1. The user's watch-history embedding is calculated by taking the mean of most recent movies' embeddings. There are some issues:   
- User's rating for previously watched movies is not taken into account. A negative rating may have negative effect.
- Highly similar movies should have stronger effect than others. In reality, one may like a movie just because he/she watched a very similir movie. The attension mechanism can be applied.
- More recent watch may matters more, too. It's possible that one would like to change a flavor after watching an action movie.

2. Movie,
- Movie titles are ignored. Titles can be feed into transformer and provides additional information.
- Movie average rating
- Movie actors and directors 

3. Users,  
- User rating scale (some people never give 1, even if one is not satisfied with it.)
- User connections, such as friends, may influence.

In practice,  
- A cold start problem. New users and new movies do not have accurate embeddings associated with. Data augmentation or Contrastive Learning may helps. If the history is empty, a zero tensor is not providing any information. It's even harder.
