In [137]:
import numpy as np
import pandas as pd
import gzip 
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#http://jmcauley.ucsd.edu/data/amazon/
#CDs and Vinyl 5-core

In [3]:
def parse(path):
  """Yield one record per non-blank line of a gzip-compressed file.

  Each line is parsed as a Python literal (the records are later used as
  dicts by ``getDF``). ``ast.literal_eval`` is used instead of ``eval`` so
  that file content can never execute code; a line that is not a plain
  literal raises ``ValueError``/``SyntaxError``.

  Parameters
  ----------
  path : path of the gzip file to read.

  Yields
  ------
  The literal value parsed from each line, in file order.
  """
  import ast  # local import: not part of the notebook's import cell

  # The context manager closes the file even if the consumer stops early.
  with gzip.open(path, 'rb') as g:
    for l in g:
      text = l.decode('utf-8').strip()
      if not text:
        continue  # tolerate blank lines instead of failing to parse them
      yield ast.literal_eval(text)
def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file; each record
  becomes one row (``orient='index'``).
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the product metadata into a DataFrame; the gzip file is read from the
# current working directory.
metadata = getDF('meta_CDs_and_Vinyl.json.gz')

In [6]:
def _first_sales_rank_key(rank):
    """Return the first element of `rank` as a string, or '' for short values.

    Values whose string form has 4 characters or fewer (e.g. 'nan', '{}')
    are treated as empty. Presumably `rank` is a dict keyed by product
    type - TODO confirm against the metadata schema.
    """
    if len(str(rank)) > 4:
        return [str(key) for key in rank][0]
    return ''


def _second_category_token(categories):
    """Return the second comma-separated token of `categories`' string form.

    Closing brackets are stripped before splitting on ', ' and single
    quotes are removed from the chosen token; '' is returned when there is
    no second token.
    """
    tokens = str(categories).replace(']', '').split(', ')
    if len(tokens) > 1:
        return tokens[1].replace("'", '')
    return ''


metadata['productType'] = metadata['salesRank'].apply(_first_sales_rank_key)
metadata['genre'] = metadata['categories'].apply(_second_category_token)

In [138]:
# The file holds a pickled DataFrame. pd.read_pickle is the matching reader;
# np.load refuses pickled objects on NumPy versions where allow_pickle
# defaults to False.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
a = pd.read_pickle('merged_album_data.pkl')
#null album titles are no use for us
a = a[~(a['title'].isnull())]
# one row per (reviewer, title), then attach the genre derived from metadata
a = a[['reviewerID', 'asin', 'title']].drop_duplicates(subset=['reviewerID', 'title'])
a = a.merge(metadata[['title', 'genre']], on='title', how='inner')

In [139]:
# Count how many distinct (title, genre) pairs exist for each title, then
# keep the titles that map to exactly one genre.
title_genre_pairs = a[['title', 'genre']].drop_duplicates()
prod_gen_counts = (
    title_genre_pairs
    .groupby('title')['genre']
    .count()
    .rename('n')
    .reset_index()
)
prod_gen_onetoone = prod_gen_counts[prod_gen_counts['n'] == 1]
# (titles with one genre, all titles)
len(prod_gen_onetoone), len(prod_gen_counts)

(36162, 42417)

### drop products with more than 1 genre

In [140]:
# keep only the rows whose title is associated with exactly one genre
has_single_genre = a['title'].isin(prod_gen_onetoone['title'])
a = a[has_single_genre]

In [141]:
# Work on an explicit copy: `a` is a filtered selection at this point, and
# assigning new columns onto it directly triggers SettingWithCopyWarning.
a = a[['reviewerID', 'asin', 'title', 'genre']].copy()

# Dense 0-based ids for users, products (titles) and genres, numbered in
# order of first appearance in `a`.
users_list = a['reviewerID'].unique()
users_index = {user: idx for idx, user in enumerate(users_list)}
#transform each prod into an index
prod_list = a['title'].unique()
prod_index = {prod: idx for idx, prod in enumerate(prod_list)}
#genre index
genre_list = a['genre'].unique()
genre_index = {genre: idx for idx, genre in enumerate(genre_list)}

# Series.map with a dict performs the same lookup as apply(lambda x: d[x]).
a['reviewerID'] = a['reviewerID'].map(users_index).astype('int')
a['prodID'] = a['title'].map(prod_index).astype('int')
# title -> prodID lookup used later to label the similarity matrix
music_index = dict(zip(a['title'], a['prodID']))
a['genreID'] = a['genre'].map(genre_index).astype('int')

# Keep only the id columns, one row per (user, product, genre); every
# remaining row is an observed interaction, hence flag 1.
a = a[['reviewerID', 'prodID', 'genreID']]
a = a.drop_duplicates(subset=['reviewerID', 'prodID', 'genreID'])
a['purchase_flag'] = 1
# product -> genre table, re-attached after negative sampling
prod_genre = a[['prodID', 'genreID']].drop_duplicates()

## Generate negative samples

In [142]:
#function used from https://medium.com/@2j/negative-sampling-in-numpy-18a9ad810385
def negsamp_vectorized_bsearch(pos_inds, n_items, n_samp=32):
    """Sample item ids uniformly from [0, n_items) and drop the positives.

    Parameters
    ----------
    pos_inds : array-like of item ids to exclude; may be unsorted, contain
        duplicates, or be empty.
    n_items : exclusive upper bound of the ids that can be drawn.
    n_samp : number of raw candidates drawn before filtering.

    Returns
    -------
    1-D array of the drawn ids that are not in ``pos_inds``. It can hold
    fewer than ``n_samp`` entries (rejected candidates are not redrawn) and
    may contain repeated ids.
    """
    raw_samps = np.random.randint(0, n_items, size=n_samp)
    # The binary search below is only valid on sorted input; np.unique both
    # sorts and de-duplicates.
    pos_sorted = np.unique(np.asarray(pos_inds))
    if pos_sorted.size == 0:
        # Nothing to exclude (np.take cannot clip into an empty array).
        return raw_samps
    ss = np.searchsorted(pos_sorted, raw_samps)
    # mode='clip' keeps insertion points past the end inside the array.
    pos_mask = raw_samps == np.take(pos_sorted, ss, mode='clip')
    return raw_samps[~pos_mask]

In [143]:
#shrink dataframe so that we have one row per reviewer, with products as a sorted array
#(sorted because the sampler filters positives with a binary search)
#n_samp is another hyperparameter that would be good to optimise. Perhaps we could just look at popular products?
a_shrink = (
    a.groupby('reviewerID')['prodID']
    .apply(lambda ids: np.sort(ids.values))
    .rename('prodIDs')
    .reset_index()
)
# prod_index maps titles to 0..n-1 and the sampler's upper bound is
# exclusive, so the bound must be the number of products; max(id) would
# make the last product impossible to draw.
n_products = len(prod_index)
a_shrink['negative_prodIDs'] = a_shrink['prodIDs'].apply(
    lambda x: negsamp_vectorized_bsearch(x, n_products, n_samp=2*len(x)))

In [144]:
# Turn each reviewer's array of negative samples into one row per sample:
# expand to a wide frame, stack it back to long form, and keep only the
# reviewer-row level of the index so it can be joined onto a_shrink.
s = (
    a_shrink
    .apply(lambda row: pd.Series(row['negative_prodIDs']), axis=1)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('prodID')
)
a_shrink = (
    a_shrink
    .drop(['prodIDs', 'negative_prodIDs'], axis=1)
    .join(s)
    .assign(purchase_flag=0)
)
# positives (flag 1) followed by sampled negatives (flag 0)
positives = a[['reviewerID', 'prodID', 'purchase_flag']]
a = pd.concat([positives, a_shrink], ignore_index=True)
# re-attach the genre id of every product
a = a.merge(prod_genre, how='inner', on='prodID')
a.head()

Unnamed: 0,reviewerID,prodID,purchase_flag,genreID
0,0,0.0,1,0
1,1,0.0,1,0
2,2,0.0,1,0
3,3,0.0,1,0
4,4,0.0,1,0


In [145]:
# order by reviewer then product, renumber the rows and fix the column order
a = a.sort_values(by=['reviewerID', 'prodID']).reset_index(drop=True)
a = a[['reviewerID', 'prodID', 'genreID', 'purchase_flag']]

In [146]:
# preview the first rows after sorting
a.head()

Unnamed: 0,reviewerID,prodID,genreID,purchase_flag
0,0,0.0,0,1
1,0,1210.0,9,0
2,0,2907.0,0,0
3,0,3557.0,12,1
4,0,10526.0,12,0


In [147]:
# Persist the table (columns: reviewerID, prodID, genreID, purchase_flag) as
# a pickle in the working directory; note the file name has no extension.
a.to_pickle('merged_album_data_final')

In [148]:
#check all products are 1-1 with genres: the largest number of distinct
#genre ids per product (shown first) should be 1
a.groupby('prodID')['genreID'].nunique().sort_values(ascending=False).head()

prodID
36161.0    1
12050.0    1
12056.0    1
12055.0    1
12054.0    1
Name: genreID, dtype: int64

# Build model and dataloader

In [149]:
from torch.utils.data import Dataset, DataLoader
#create a custom data dataset / dataloader
class music_dataset(Dataset):
    """Dataset over a pickled DataFrame of interactions.

    The frame is split by column position: columns 0, 1 and 2 are exposed as
    ``u``, ``p`` and ``g`` (each of shape (n, 1)), every column except the
    fourth forms the feature matrix ``upg``, and columns from the fourth
    onwards form the label array ``y``.

    Parameters
    ----------
    path : location of the pickled DataFrame. Defaults to the file written
        earlier in this notebook.
    """

    def __init__(self, path='merged_album_data_final'):
        # pd.read_pickle is the matching reader for DataFrame.to_pickle;
        # np.load rejects pickles where allow_pickle defaults to False.
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        xy = pd.read_pickle(path)
        self.u = np.array(xy.iloc[:,0:1])
        self.p = np.array(xy.iloc[:,1:2])
        self.g = np.array(xy.iloc[:,2:3])
        # features: everything except the fourth column
        self.upg = np.array(xy.drop(xy.columns[3],axis=1))
        # labels: fourth column onwards, kept 2-D
        self.y = np.array(xy.iloc[:,3:])

    def __len__(self):
        """Number of rows in the feature matrix."""
        return len(self.upg)

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position ``idx``."""
        return self.upg[idx], self.y[idx]

In [150]:
#https://forums.fast.ai/t/lesson-4-advanced-discussion/30319/127
from torch.nn import Module, Embedding, BCELoss
from torch.optim import *
from torch.autograd import Variable
import torch 

def get_embs(ni, nf):
    """Build an ``ni`` x ``nf`` embedding layer with small positive weights.

    The default initialisation is replaced by values drawn uniformly
    between 0 and 0.05.
    """
    layer = torch.nn.Embedding(num_embeddings=ni, embedding_dim=nf)
    # See https://arxiv.org/abs/1711.09160
    with torch.no_grad():
        layer.weight.uniform_(0, 0.05)
    return layer

class EmbeddingModel(Module):
    """Embedding model over users, items and genres.

    Each entity has an ``n_dims``-wide embedding and a scalar bias, all
    learned during training. The prediction is the sigmoid of the summed
    element-wise product of the three embeddings plus the three biases.
    """

    def __init__(self, n_dims, n_users, n_items, n_genres):
        super().__init__()
        # Layers are created weights first, then biases, each in
        # user / item / genre order.
        self.u_weight = get_embs(n_users, n_dims)   # user weights
        self.i_weight = get_embs(n_items, n_dims)   # product weights
        self.g_weight = get_embs(n_genres, n_dims)  # genre weights
        self.u_bias = get_embs(n_users, 1)          # user bias
        self.i_bias = get_embs(n_items, 1)          # product bias
        self.g_bias = get_embs(n_genres, 1)         # genre bias

    def forward(self, users, items, genres):
        """Return one probability per (user, item, genre) index triple."""
        interaction = self.u_weight(users) * self.i_weight(items) * self.g_weight(genres)
        logits = interaction.sum(1)
        # add the biases one at a time: user, then item, then genre
        for bias_table, indices in ((self.u_bias, users),
                                    (self.i_bias, items),
                                    (self.g_bias, genres)):
            logits = logits + bias_table(indices).squeeze()
        #run output through a sigmoid
        return torch.sigmoid(logits)

In [151]:
from torch.optim.lr_scheduler import _LRScheduler
class CyclicLR(_LRScheduler):
    """Scheduler whose learning rates come from a user-supplied callable.

    ``schedule(epoch, base_lr)`` is evaluated once per base learning rate
    with the scheduler's current ``last_epoch``.
    """

    def __init__(self, optimizer, schedule, last_epoch=-1):
        assert callable(schedule)
        # Stored before the base-class initialiser runs; presumably the base
        # class may already call get_lr() during construction - TODO confirm.
        self.schedule = schedule
        super(CyclicLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the scheduled rate for every base learning rate."""
        step = self.last_epoch
        return [self.schedule(step, base) for base in self.base_lrs]
    
def triangular(step_size, max_lr, method='triangular', gamma=0.99):
    """Build a triangular schedule function ``scheduler(epoch, base_lr)``.

    The rate rises linearly from ``base_lr`` to ``max_lr`` over
    ``step_size`` steps and falls back over the next ``step_size`` steps,
    repeating every ``2 * step_size`` steps.

    ``method`` and ``gamma`` are accepted but not used by this
    implementation.
    """
    # Imported here so the schedule works regardless of cell order; the
    # notebook otherwise only imports math in a later cell.
    import math

    def scheduler(epoch, base_lr):
        period = 2 * step_size
        cycle = math.floor(1 + epoch/period)
        # x runs 1 -> 0 -> 1 across one cycle
        x = abs(epoch/step_size - 2*cycle + 1)
        delta = (max_lr - base_lr)*max(0, (1 - x))
        return base_lr + delta

    return scheduler

def cosine(t_max, eta_min=0):
    """Build a cosine schedule function ``scheduler(epoch, base_lr)``.

    Within each block of ``t_max`` steps the rate follows half a cosine
    wave starting at ``base_lr`` and decaying towards ``eta_min``; it
    restarts at ``base_lr`` whenever ``epoch`` is a multiple of ``t_max``.
    """
    # Imported here so the schedule works regardless of cell order; the
    # notebook otherwise only imports math in a later cell.
    import math

    def scheduler(epoch, base_lr):
        t = epoch % t_max
        return eta_min + (base_lr - eta_min)*(1 + math.cos(math.pi*t/t_max))/2

    return scheduler


In [None]:
#we have a simple architecture now (embeddings).
#we need to build a learner to train the model with
# Loop over epochs
#https://github.com/devforfu/pytorch_playground/blob/master/movielens.ipynb
import math
import torch.nn.functional as F

#load dataset and loader
df_new = music_dataset()
ds = DataLoader(df_new, batch_size=64, shuffle=True)

#load model
# User/item tables are sized from the largest id (+1) so every id present in
# the data is a valid embedding row even if the ids are not contiguous.
model = EmbeddingModel(n_dims=40,
           n_users=int(df_new.u.max()) + 1,
           n_items=int(df_new.p.max()) + 1,
           n_genres=len(genre_index))
#loss
lr=1e-2
loss_func = BCELoss()
max_epochs = 1
optimizer = Adam(model.parameters(),lr=lr,weight_decay=1e-5)
iterations_per_epoch = len(ds)
# the schedule is stepped once per batch; one cosine cycle spans two epochs
scheduler = CyclicLR(optimizer, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))
optimizer.zero_grad()
full_loss_df = []   # full-dataset loss, one entry per epoch
loss_values = []    # training loss, one entry per batch
iteration = 0
for epoch in range(max_epochs):
    # Training
    for local_index, local_batch in enumerate(ds, 0):
        iteration += 1
        # each batch is (features, labels); feature columns are
        # user id, product id, genre id
        features, labels = local_batch
        features = features.long()
        output = model(features[:, 0], features[:, 1], features[:, 2])
        #calculate the loss against the (batch, 1) labels flattened to (batch,)
        loss = loss_func(output, labels.float().squeeze(1))
        #update the parameters using backpropagation
        loss.backward()
        optimizer.step()
        # the scheduler is stepped after the optimizer so that the first
        # value of the schedule is not skipped
        scheduler.step()
        optimizer.zero_grad()
        loss_values.append(loss.item())
        if len(loss_values) % 100 == 0 or len(loss_values) == 1:
            print(loss.item())
    #run a forward pass over the full dataset; no_grad avoids building an
    #autograd graph for an evaluation-only pass
    with torch.no_grad():
        all_features = torch.tensor(df_new.upg).long()
        output_full = model(all_features[:, 0], all_features[:, 1], all_features[:, 2])
        full_loss = loss_func(output_full, torch.tensor(df_new.y).float().squeeze(1))
    full_loss_df.append(full_loss.item())

0.6985642910003662
0.6856696009635925
0.6475222110748291
0.7201673984527588
0.6505962014198303
0.5592682361602783
0.6675215363502502
0.6794180274009705
0.6650784015655518
0.5420235395431519
0.5913771986961365
0.6471108198165894
0.6252194046974182
0.658309817314148
0.60185706615448
0.6368136405944824
0.5185902714729309
0.6065595746040344
0.5565740466117859
0.5480939149856567
0.5912162065505981
0.5625611543655396
0.6747888326644897
0.6204331517219543
0.5588315725326538
0.5963636636734009
0.5866166949272156
0.5327626466751099
0.5914582014083862
0.5689964294433594
0.6040873527526855
0.691055178642273
0.6297612190246582
0.6609615087509155
0.5465720295906067
0.5600336790084839
0.568292498588562
0.5567008852958679
0.6060009598731995
0.59093177318573
0.5493320226669312
0.5169522166252136
0.5457373261451721
0.5828155875205994
0.6113525032997131
0.6144486665725708
0.5834595561027527
0.5167630314826965
0.5106644034385681
0.4873962998390198
0.5970523953437805
0.5566437840461731
0.6341115236282349


In [None]:
# Plot the recorded per-batch loss values; the cell's value is a tuple of the
# plotted lines and the number of recorded values.
plt.plot(loss_values), len(loss_values)

In [None]:
# torch.save(model.state_dict(), 'dotprod_scratch_final')
# NOTE(review): this rebinds `model` to whatever object is stored in the
# file. The cells below index it with keys such as 'i_weight.weight', so it
# is presumably the state dict written by the commented-out line above
# rather than an EmbeddingModel instance - confirm before re-running.
model = torch.load('dotprod_scratch_final')

In [None]:
# element wise multiply product and genre tensors together

In [None]:
def prod_genre_combo(i_index, g_index):
    """Element-wise product of product and genre embedding rows.

    ``i_index`` / ``g_index`` select rows of ``model['i_weight.weight']``
    and ``model['g_weight.weight']``; they may be single indices or index
    tensors of equal length. The result is double precision.
    """
    prod_tensor = model['i_weight.weight'][i_index].double()
    genre_tensor = model['g_weight.weight'][g_index].double()
    return prod_tensor*genre_tensor

# Build one combined vector per product, stored at the row equal to its
# prodID. Every row of prod_genre is processed (the previous loop stopped one
# short and left the last row of an uninitialised tensor untouched), and the
# product id is read from the frame rather than assumed to equal the row
# position.
n_dims = model['i_weight.weight'].shape[1]
prod_ids = torch.tensor(prod_genre['prodID'].values).long()
genre_ids = torch.tensor(prod_genre['genreID'].values).long()
prod_genre_combo_tensor = torch.zeros(len(prod_genre), n_dims)
prod_genre_combo_tensor[prod_ids] = prod_genre_combo(prod_ids, genre_ids).float()

In [None]:
#cosine similarity of albums

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of prod_genre_combo_tensor,
# labelled on both axes with the keys of music_index.
album_titles = list(music_index.keys())
similarity_matrix = pd.DataFrame(
    cosine_similarity(X=prod_genre_combo_tensor),
    index=album_titles,
    columns=album_titles,
)

In [None]:
def find_similar_albums(album, n):
    """Return the ``n`` highest-scoring entries of ``album``'s similarity row.

    The row labelled ``album`` is read from the global ``similarity_matrix``
    and sorted by descending score. The result is ``DataFrame.to_dict()``
    output with the columns 'item_id' and 'score', keyed by the string form
    of the rank position.
    """
    scores = similarity_matrix.loc[album].to_frame()
    scores.columns = ['score']
    best = scores.sort_values('score', ascending=False).head(n).reset_index()
    best = best.rename(index=str, columns={'index': 'item_id'})
    return best.to_dict()

In [None]:
# example query: the 20 highest-scoring entries for the album titled 'Britney'
find_similar_albums('Britney', 20)

# Product weights

In [None]:
# Rebuild the title -> prodID lookup from df_music, replacing any earlier
# music_index.
# NOTE(review): df_music is not defined in the cells visible here - confirm
# where it is created before running this cell on a fresh kernel.
music_index = dict(zip(df_music['title'], df_music['prodID']))

#from fastai - torch pca
def _pca(x, k=2):
    "Compute PCA of `x` with `k` dimensions."
    x = x-torch.mean(x,0)
    U,S,V = torch.svd(x.t())
    return torch.mm(x,U[:,:k])
torch.Tensor.pca = _pca

def generate_embedding_plot(top=500, n_labels=100):
    """Scatter-plot a 2-D PCA projection of the most-counted titles' weights.

    Titles are ranked by their number of 'overall' entries in ``df_music``;
    the weights of the ``top`` titles are projected to two dimensions and
    the first ``n_labels`` of them are drawn and labelled.

    ``df_music``, ``items_w`` and ``music_index`` are read from the
    notebook's global namespace. NOTE(review): ``df_music`` and ``items_w``
    are not defined in the cells visible here - confirm their origin.

    Parameters
    ----------
    top : number of highest-count titles whose weights enter the PCA.
    n_labels : number of those titles to plot (capped at the number
        available).
    """
    g = df_music.groupby('title')['overall'].count()
    #grab top music
    top_music = g.sort_values(ascending=False).index.values[:top]
    top_music_w = items_w[[music_index[x] for x in top_music.tolist()]]
    #reduce the embedding dimensions to 2
    top_music_w_pca = top_music_w.detach().pca(2)
    fac0,fac1 = top_music_w_pca.t()

    #produce plot
    # Plot the first n_labels titles. The earlier random draw of 300 indices
    # was overwritten before use and failed whenever top < 300, so it has
    # been removed.
    idxs = list(range(min(n_labels, len(top_music))))
    X = fac0[idxs]
    Y = fac1[idxs]
    plt.figure(figsize=(25,25))
    plt.scatter(X, Y,clip_on=True)
    for i, x, y in zip(top_music[idxs], X, Y):
        plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=13,clip_on=True)
generate_embedding_plot()

# Genre weights

In [None]:
genres_w = model['g_weight.weight']
# One embedding row per genre, in genre_list order. (An unused groupby on
# df_music was removed from this cell: it played no part in the plot and
# referenced a frame that is not defined in the visible cells.)
top_genres_w = genres_w[[genre_index[x] for x in genre_list]]
#reduce the dimensions from 40 to 2
top_genres_w_pca = top_genres_w.detach().pca(2)
fac0,fac1 = top_genres_w_pca.t()

#produce plot: one labelled point per genre
X = fac0
Y = fac1
plt.figure(figsize=(10,10))
plt.scatter(X, Y,clip_on=True)
for i, x, y in zip(genre_list, X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=13,clip_on=True)