In [1]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import scipy
from scipy import sparse 
import itertools
from sklearn.metrics import accuracy_score
from collections import defaultdict, Counter
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# Load the observed interactions and label every one of them as a positive (rating = 1).
ratings_df = pd.read_csv('training.csv').assign(rating=1)
ratings_df.head(2)

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1


In [3]:
# (distinct user ids, distinct item ids) present in the positive interactions.
ratings_df.user_id.nunique(), ratings_df.item_id.nunique()

(169698, 37978)

In [4]:
# Item metadata: the preview shows an item_id column and an item_feature_id column.
item_df = pd.read_csv('item_feature.csv')
item_df.head(2)

Unnamed: 0,item_id,item_feature_id
0,0,139
1,1,55


### Negative Sampling

In [5]:
# user_id -> set of item_ids the user has interacted with
present_tuple_list = list(map(tuple, ratings_df[['user_id', 'item_id']].to_numpy()))
positive_user_dic = defaultdict(set)
for user_key, item_val in present_tuple_list:
    positive_user_dic[user_key].add(item_val)

# every item that appears at least once in the interactions
all_items_set = set(ratings_df.item_id)

# the 100 most frequently interacted-with items
most_common_item_set = {
    item for item, _count in Counter(ratings_df.item_id).most_common(100)
}

# all remaining items (everything outside the top 100)
uncommon_item_set = all_items_set.difference(most_common_item_set)

# user_id -> number of rows that user has in the training data
freq_users_dic = Counter(ratings_df.user_id)

In [6]:
# Negative samples per user are loaded from a pre-computed pickle instead of
# being regenerated on every run.
#
# The commented-out code below shows one way to build such a dictionary: for
# each user it draws (np.random.choice samples with replacement by default)
# 4x as many items as the user has positive rows, taken from the uncommon
# items the user has NOT interacted with.
#
# NOTE(review): that code writes 'negative_user_dic.pkl', whereas the file
# actually loaded is 'negative_user_dic_v2.pkl'. The procedure that produced
# the v2 file is not shown here -- confirm it before relying on the 4x ratio.

# negative_user_dic = {}
# for ix, user in enumerate(positive_user_dic):
#     if ix % 10000 == 0 : print(f'Processed for {ix} users')
#     neg_items = np.random.choice(list(uncommon_item_set - positive_user_dic[user]), size=freq_users_dic[user] * 4)
#     negative_user_dic[user] = neg_items

# with open('negative_user_dic.pkl', 'wb') as handle:
#     pickle.dump(negative_user_dic, handle)

# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open('negative_user_dic_v2.pkl', 'rb') as handle:
    negative_user_dic = pickle.load(handle)

In [7]:
# Flatten {user: [sampled items]} into one (user_id, item_id, rating=0) record per sampled item.
negative_record_list = [
    {'user_id': user, 'item_id': item, 'rating': 0}
    for user, sampled_items in negative_user_dic.items()
    for item in sampled_items
]
negative_df = pd.DataFrame(negative_record_list)

In [8]:
# Stack the positive interactions on top of the sampled negatives.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True avoids duplicated row labels.
df_final = pd.concat(
    [ratings_df[['user_id', 'item_id', 'rating']], negative_df],
    ignore_index=True,
)

# NOTE: num_users / num_items hold the LARGEST id, not a count; the embedding
# tables therefore need `+ 1` rows, which is added where the models are built.
num_users = df_final.user_id.max()
num_items = df_final.item_id.max()

# The feature embedding is indexed by the raw item_feature_id, so it must have
# max_id + 1 rows. nunique() is only large enough when the ids happen to be a
# contiguous 0-based range (in which case both expressions give the same value).
num_item_features = int(item_df.item_feature_id.max()) + 1

In [9]:
# Row/column count after stacking positives and sampled negatives.
df_final.shape

(4851225, 3)

In [10]:
# Attach each item's feature id to every interaction row.
# Dropping a pre-existing item_feature_id first makes this cell idempotent:
# without it, re-running the cell merges a second time and produces
# item_feature_id_x / item_feature_id_y, which breaks the training code.
df_final = (
    df_final
    .drop(columns=['item_feature_id'], errors='ignore')
    .merge(item_df, on='item_id')
)
# The row count should match the pre-merge shape; a change would mean items
# without a feature row (dropped) or duplicated item_ids in item_df.
df_final.shape

(4851225, 4)

In [11]:
# Preview of the merged frame, which now carries item_feature_id.
df_final.head()

Unnamed: 0,user_id,item_id,rating,item_feature_id
0,0,28366,1,7
1,1731,28366,1,7
2,10168,28366,1,7
3,18883,28366,1,7
4,19763,28366,1,7


### Splitting data into train and validation sets

In [12]:
# 80/20 split, stratified on the label so both parts keep the same
# positive/negative ratio. A fixed random_state makes the split -- and every
# metric reported below -- reproducible across kernel restarts.
train, valid = train_test_split(
    df_final,
    test_size=0.2,
    shuffle=True,
    stratify=df_final['rating'],
    random_state=23,
)

In [13]:
# Shapes of the training and validation frames.
print(train.shape, valid.shape)

(3880980, 4) (970245, 4)


In [14]:
# Distinct users in each split.
train.user_id.nunique(), valid.user_id.nunique()

(169698, 168305)

In [15]:
# Number of users that occur in BOTH splits (compare with the per-split counts in the previous cell).
len(set(train.user_id.unique()) & set(valid.user_id.unique()))

168305

### Training MF model

In [16]:
class MF(nn.Module):
    """Matrix-factorisation scorer with an extra item-feature term.

    For a (user, item, item_feature) triple the model outputs

        sigmoid(<U_u, V_i> + b_u + b_i + <U_u, F_f> + b_f)

    where U, V and F are the user, item and item-feature embedding tables and
    the b_* terms are per-entity scalar biases.

    Args:
        num_users: number of rows in the user tables (ids must be < num_users).
        num_items: number of rows in the item tables.
        num_item_feature: number of rows in the item-feature tables.
        emb_size: dimensionality of every embedding vector.
        seed: passed to ``torch.manual_seed`` before the tables are created,
            so two models built with the same arguments start identical.
            Note that this reseeds torch's global RNG as a side effect.
    """

    def __init__(self, num_users, num_items, num_item_feature, emb_size=100, seed=23):
        super().__init__()
        torch.manual_seed(seed)

        # Creation order is kept fixed: it determines how the seeded RNG
        # stream is consumed and therefore the initial weights.
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)

        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)

        self.item_feature_emb = nn.Embedding(num_item_feature, emb_size)
        self.item_feature_bias = nn.Embedding(num_item_feature, 1)

        # Small non-negative start values for the latent factors ...
        for factors in (self.user_emb, self.item_emb, self.item_feature_emb):
            factors.weight.data.uniform_(0, 0.05)

        # ... and biases centred on zero.
        for biases in (self.user_bias, self.item_bias, self.item_feature_bias):
            biases.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, v, ife):
        """Return the interaction probability for each (u, v, ife) triple.

        ``u``, ``v`` and ``ife`` are 1-D index tensors of equal length; the
        result is a 1-D tensor of the same length with values in (0, 1).
        """
        user_vec = self.user_emb(u)
        item_vec = self.item_emb(v)
        feature_vec = self.item_feature_emb(ife)

        # Bias lookups come back as (batch, 1); drop the trailing axis.
        user_b = self.user_bias(u).squeeze(1)
        item_b = self.item_bias(v).squeeze(1)
        feature_b = self.item_feature_bias(ife).squeeze(1)

        logit = (
            (user_vec * item_vec).sum(1)
            + user_b
            + item_b
            + (user_vec * feature_vec).sum(1)
            + feature_b
        )
        return torch.sigmoid(logit)
    
def train_one_epoch(model, train_df, optimizer):
    """Run one full-batch optimisation step and return the training loss.

    The whole of ``train_df`` is fed through the model as a single batch, so
    one "epoch" here is exactly one optimizer step.

    Args:
        model: callable as ``model(users, items, item_features)`` returning
            probabilities in (0, 1).
        train_df: frame with ``user_id``, ``item_id``, ``item_feature_id`` and
            ``rating`` columns.
        optimizer: torch optimizer over the model's parameters.

    Returns:
        The binary cross-entropy loss computed before the step, detached.
    """
    model.train()

    # One index tensor per id column.
    user_ids, item_ids, feature_ids = (
        torch.LongTensor(train_df[column].values)
        for column in ('user_id', 'item_id', 'item_feature_id')
    )
    targets = torch.FloatTensor(train_df.rating.values)

    predictions = model(user_ids, item_ids, feature_ids)
    loss = F.binary_cross_entropy(predictions, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.detach()

def valid_metrics(model, valid_df):
    """Compute validation loss and accuracy for ``model`` on ``valid_df``.

    The forward pass runs under ``torch.no_grad()``: evaluation never needs
    gradients, and skipping the autograd graph avoids holding activations for
    the entire validation set in memory.

    Args:
        model: callable as ``model(users, items, item_features)`` returning
            probabilities in (0, 1).
        valid_df: frame with ``user_id``, ``item_id``, ``item_feature_id`` and
            ``rating`` columns.

    Returns:
        ``(valid_loss, valid_acc)`` -- the binary cross-entropy as a detached
        0-dim tensor, and the fraction of rows whose thresholded prediction
        (1 unless the probability is < 0.5) equals the rating.
    """
    model.eval()
    users = torch.LongTensor(valid_df.user_id.values)
    items = torch.LongTensor(valid_df.item_id.values)
    item_features = torch.LongTensor(valid_df.item_feature_id.values)
    ratings = torch.FloatTensor(valid_df.rating.values)

    with torch.no_grad():
        y_hat = model(users, items, item_features)
        valid_loss = F.binary_cross_entropy(y_hat, ratings)

    # Same rule as np.where(y_hat < 0.5, 0, 1): anything not below 0.5 is a 1.
    predicted = (~(y_hat < 0.5)).to(ratings.dtype)
    # Vectorised count instead of Python's builtin sum over a numpy array.
    valid_acc = (predicted == ratings).sum().item() / len(ratings)
    return valid_loss.detach(), valid_acc


def training(model, train_df, valid_df, epochs=10, lr=0.01, wd=0.0):
    """Fit ``model`` with Adam and print one metrics line per epoch.

    Args:
        model: module to optimise in place.
        train_df: frame handed to ``train_one_epoch`` each epoch.
        valid_df: frame handed to ``valid_metrics`` each epoch.
        epochs: number of train/validate rounds.
        lr: Adam learning rate.
        wd: Adam weight decay.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, train_df, optimizer)
        valid_loss, valid_acc = valid_metrics(model, valid_df)
        report = " epoch %d: train loss %.3f valid loss %.3f valid acc %.3f" % (
            epoch, train_loss, valid_loss, valid_acc)
        print(report)

In [17]:
# Scratch check of a candidate embedding-size grid. Note this one stops at 70,
# whereas the training cell uses range(20, 81, 10), i.e. 20..80.
[i for i in range(20, 80, 10)]

[20, 30, 40, 50, 60, 70]

In [18]:
# Train one MF model per embedding size and keep all of them for the ensemble.
# num_users / num_items hold the largest observed id, hence the "+ 1" to get
# an embedding-table size that can be indexed by that id.
size_range = list(range(20, 81, 10))
model_dic = {}
for size in size_range:
    print(size)
    model = MF(num_users + 1, num_items + 1, num_item_features, emb_size=size)
    model_dic[size] = model
    training(model, train, valid, epochs=15, lr=0.1, wd=5e-6)

20
 epoch 1: train loss 0.701 valid loss 0.671 valid acc 0.801
 epoch 2: train loss 0.671 valid loss 0.535 valid acc 0.819
 epoch 3: train loss 0.534 valid loss 0.480 valid acc 0.819
 epoch 4: train loss 0.478 valid loss 0.456 valid acc 0.819
 epoch 5: train loss 0.451 valid loss 0.424 valid acc 0.825
 epoch 6: train loss 0.417 valid loss 0.394 valid acc 0.826
 epoch 7: train loss 0.385 valid loss 0.375 valid acc 0.828
 epoch 8: train loss 0.365 valid loss 0.365 valid acc 0.828
 epoch 9: train loss 0.354 valid loss 0.358 valid acc 0.831
 epoch 10: train loss 0.345 valid loss 0.347 valid acc 0.847
 epoch 11: train loss 0.334 valid loss 0.334 valid acc 0.881
 epoch 12: train loss 0.321 valid loss 0.321 valid acc 0.902
 epoch 13: train loss 0.308 valid loss 0.310 valid acc 0.909
 epoch 14: train loss 0.298 valid loss 0.302 valid acc 0.912
 epoch 15: train loss 0.290 valid loss 0.295 valid acc 0.914
30
 epoch 1: train loss 0.704 valid loss 0.710 valid acc 0.306
 epoch 2: train loss 0.710 v

### Test predictions

In [19]:
# Load the rows to score and attach each item's feature id.
# NOTE(review): merge defaults to an inner join, so any test row whose item_id
# is missing from item_df would be dropped silently -- confirm the row count.
# The merge also reorders rows; they are re-sorted by id further down.
test = pd.read_csv('test_kaggle.csv').merge(item_df, on='item_id')
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id
0,0,4,16835,2,142
1,434,188,16835,1,142
2,48540,25437,16835,1,142
3,51161,26834,16835,0,142
4,73056,38465,16835,2,142


In [20]:
# Score every test row with each trained model: one 'size_<n>' column per model.
users = torch.LongTensor(test.user_id.values)
items = torch.LongTensor(test.item_id.values)
item_features = torch.LongTensor(test.item_feature_id.values)

# Inference only: no_grad() skips building an autograd graph over the whole
# test set for every model.
with torch.no_grad():
    for size in size_range:
        col_name = 'size_' + str(size)
        model_dic[size].eval()  # make the evaluation mode explicit
        y_hat = model_dic[size](users, items, item_features)
        test[col_name] = y_hat.numpy()

In [21]:
# Keep only the row id and the per-model prediction columns, ordered by id.
prediction_cols = [name for name in test.columns if 'size' in name]
test = (
    test.loc[:, ['id', *prediction_cols]]
    .sort_values('id')
    .reset_index(drop=True)
)
test.head()

Unnamed: 0,id,size_20,size_30,size_40,size_50,size_60,size_70,size_80
0,0,0.410514,0.414741,0.399068,0.39095,0.386666,0.369815,0.354124
1,1,0.151633,0.154151,0.148981,0.148355,0.151373,0.146386,0.14468
2,2,0.641308,0.654169,0.640107,0.644212,0.648831,0.642795,0.639298
3,3,0.125076,0.121813,0.120345,0.119166,0.12072,0.118607,0.118974
4,4,0.124948,0.123047,0.121355,0.121424,0.122496,0.120587,0.120325


In [22]:
# Ensemble: average the per-model predictions.
# The size_* columns are selected by name rather than "every column after id",
# so re-running this cell no longer folds the previously computed 'rating'
# column into the mean.
size_cols = [col for col in test.columns if col.startswith('size_')]
test['rating'] = test[size_cols].mean(axis=1)
test.head()

Unnamed: 0,id,size_20,size_30,size_40,size_50,size_60,size_70,size_80,rating
0,0,0.410514,0.414741,0.399068,0.39095,0.386666,0.369815,0.354124,0.389411
1,1,0.151633,0.154151,0.148981,0.148355,0.151373,0.146386,0.14468,0.149365
2,2,0.641308,0.654169,0.640107,0.644212,0.648831,0.642795,0.639298,0.644389
3,3,0.125076,0.121813,0.120345,0.119166,0.12072,0.118607,0.118974,0.120672
4,4,0.124948,0.123047,0.121355,0.121424,0.122496,0.120587,0.120325,0.122026


In [23]:
# Submission frame: just the row id and the ensembled prediction.
sub = test.loc[:, ['id', 'rating']]
sub.head()

Unnamed: 0,id,rating
0,0,0.389411
1,1,0.149365
2,2,0.644389
3,3,0.120672
4,4,0.122026


In [24]:
# Write the ensembled predictions without the DataFrame index column.
sub.to_csv('submission_7.csv', index=False)

In [25]:
# Values used to size the models. num_users and num_items are the largest ids
# seen (not counts), which is why `+ 1` is added when the models are constructed.
num_users, num_items, num_item_features

(200152, 39900, 195)

In [26]:
# Load an earlier submission to blend with the current one.
# NOTE(review): how submission_5.csv was produced is not shown in this notebook.
sub_5 = pd.read_csv('submission_5.csv')
sub_5.head()

Unnamed: 0,id,rating
0,0,0.527103
1,1,0.255954
2,2,0.78242
3,3,0.248875
4,4,0.24175


In [31]:
# Align the two submissions on id. With pandas' default suffixes, rating_x
# comes from `sub` (current ensemble) and rating_y from `sub_5`.
# NOTE(review): this is an inner join -- ids missing from either file are dropped.
# NOTE(review): the execution counter jumps from 26 to 31 here, so cells were
# presumably run out of order or deleted; verify with Restart & Run All.
final = sub.merge(sub_5, on='id')
final.head()

Unnamed: 0,id,rating_x,rating_y
0,0,0.389411,0.527103
1,1,0.149365,0.255954
2,2,0.644389,0.78242
3,3,0.120672,0.248875
4,4,0.122026,0.24175


In [32]:
# Blend the two submissions with a plain average.
# The source columns are named explicitly rather than taking "every column
# after id", so re-running this cell cannot fold the previously computed
# 'rating' column into the mean.
final['rating'] = final[['rating_x', 'rating_y']].mean(axis=1)
final.head()

Unnamed: 0,id,rating_x,rating_y,rating
0,0,0.389411,0.527103,0.458257
1,1,0.149365,0.255954,0.20266
2,2,0.644389,0.78242,0.713404
3,3,0.120672,0.248875,0.184773
4,4,0.122026,0.24175,0.181888


In [33]:
# Keep only what the submission file needs: the id and the blended score.
final = final.loc[:, ['id', 'rating']]
final.head()

Unnamed: 0,id,rating
0,0,0.458257
1,1,0.20266
2,2,0.713404
3,3,0.184773
4,4,0.181888


In [34]:
# Write the blended predictions without the DataFrame index column.
final.to_csv('submission_8.csv', index=False)