In [1]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import scipy
from scipy import sparse 
import itertools
from sklearn.metrics import accuracy_score
from collections import defaultdict, Counter
import pickle
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
ratings_df = pd.read_csv('training.csv')
ratings_df['rating'] = 1
ratings_df.head(2)

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1


In [3]:
item_df = pd.read_csv('item_feature.csv')
item_df.head(2)

Unnamed: 0,item_id,item_feature_id
0,0,139
1,1,55


In [4]:
df_mode = ratings_df.groupby('user_id')['context_feature_id']\
                    .agg(lambda x: pd.Series.mode(x)[0]).to_frame().reset_index()

In [5]:
df_mode.head(2)

Unnamed: 0,user_id,context_feature_id
0,0,2
1,1,2


### Negative Sampling

In [6]:
# key = user_id; value = set(item1, item2,.....) which the user has interaction with
positive_user_dic = defaultdict(set)
present_tuple_list = [tuple(x) for x in ratings_df[['user_id', 'item_id']].to_numpy()]
for key, val in present_tuple_list:
    positive_user_dic[key].add(val)

# all items set
all_items_set = set(ratings_df.item_id)

# finding most common k items
most_common_item_set = set([x[0] for x in Counter(ratings_df.item_id).most_common(100)])

# uncommon item set
uncommon_item_set = all_items_set - most_common_item_set

# key = user_id, val = num occurences in training data
freq_users_dic = Counter(ratings_df.user_id)

In [7]:
negative_user_dic = {}
# for ix, user in enumerate(positive_user_dic):
#     if ix % 10000 == 0 : print(f'Processed for {ix} users')
#     neg_items = np.random.choice(list(uncommon_item_set - positive_user_dic[user]), size=freq_users_dic[user] * 4)
#     negative_user_dic[user] = neg_items
    
# with open('negative_user_dic_v2.pkl', 'wb') as handle:
#     pickle.dump(negative_user_dic, handle)
    
with open('negative_user_dic_v3.pkl', 'rb') as handle:
    negative_user_dic = pickle.load(handle)

In [8]:
negative_record_list = []
for user in negative_user_dic:
    for item in negative_user_dic[user]:
        record = {}
        record['user_id'] = user
        record['item_id'] = item
        record['rating'] = 0
        negative_record_list.append(record)
negative_df = pd.DataFrame(negative_record_list)

In [9]:
negative_df.head(2)

Unnamed: 0,user_id,item_id,rating
0,0,23054,0
1,0,3774,0


In [10]:
df_mode.head(2)

Unnamed: 0,user_id,context_feature_id
0,0,2
1,1,2


In [11]:
#Merging negative df with the context feature id
negative_df_con = negative_df.merge(df_mode, on='user_id')

In [12]:
negative_df_con.head(2)

Unnamed: 0,user_id,item_id,rating,context_feature_id
0,0,23054,0,2
1,0,3774,0,2


In [13]:
ratings_df.head(2)

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1


In [14]:
df_final = ratings_df[['user_id', 'item_id', 'rating', 'context_feature_id']].append(negative_df_con)

In [15]:
df_final.tail(2)

Unnamed: 0,user_id,item_id,rating,context_feature_id
1940488,200152,1406,0,2
1940489,200152,39300,0,2


In [16]:
df_final = df_final.merge(item_df, on='item_id')
negative_df_con.shape

(1940490, 4)

In [17]:
df_final.head(2)

Unnamed: 0,user_id,item_id,rating,context_feature_id,item_feature_id
0,0,28366,1,2,7
1,1731,28366,1,1,7


In [18]:
df_final.rating.value_counts()

0    1940490
1     970245
Name: rating, dtype: int64

### Splitting and Encoding data

In [19]:
train, valid = train_test_split(df_final, test_size=0.2, shuffle=True, stratify=df_final['rating'])

In [20]:
# starting encoding from 1. Keeping 0 reserved for the average encoding

train_user_ids = np.sort(np.unique(train.user_id.values))
userid2idx = {o:i+1 for i,o in enumerate(train_user_ids)}
train["user_id"] = train["user_id"].apply(lambda x: userid2idx[x])
valid["user_id"] = valid["user_id"].apply(lambda x: userid2idx.get(x,0))

train_item_ids = np.sort(np.unique(train.item_id.values))
item2idx = {o:i+1 for i,o in enumerate(train_item_ids)}
train["item_id"] = train["item_id"].apply(lambda x: item2idx[x])
valid["item_id"] = valid["item_id"].apply(lambda x: item2idx.get(x, 0))

In [21]:
train.user_id.max() + 1, train.user_id.nunique()

(169693, 169692)

In [22]:
user_prob = 0.01
item_prob = 0.005

user_random_indices = np.random.choice(train.index.values, size=int(user_prob * train.shape[0]), replace=False)
item_random_indices = np.random.choice(train.index.values, size=int(item_prob * train.shape[0]), replace=False)
train.loc[user_random_indices, 'user_id'] = 0
train.loc[item_random_indices, 'item_id'] = 0

user_random_indices = np.random.choice(valid.index.values, size=int(user_prob * valid.shape[0]), replace=False)
item_random_indices = np.random.choice(valid.index.values, size=int(item_prob * valid.shape[0]), replace=False)
valid.loc[user_random_indices, 'user_id'] = 0
valid.loc[item_random_indices, 'item_id'] = 0

In [23]:
train.user_id.max() + 1, train.user_id.nunique()

(169693, 169692)

In [24]:
round(train[train.user_id==0].shape[0]/train.shape[0], 3), round(train[train.item_id==0].shape[0]/train.shape[0], 2)

(0.01, 0.0)

In [25]:
round(valid[valid.user_id==0].shape[0]/valid.shape[0], 3), round(valid[valid.item_id==0].shape[0]/valid.shape[0], 2)

(0.01, 0.0)

### Training MF model

In [26]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, num_item_feature, num_context_feature, emb_size=100):
        super(MF, self).__init__()
        self.emb_size = emb_size
        # defining layers
         # initialising the embedding and bias layer for Users
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_feature_emb = nn.Embedding(num_item_feature, 10)
        self.context_feature_emb = nn.Embedding(num_context_feature, 5)
        
        # init layers
        self.lin1 = nn.Linear(115, 10)
        self.lin2 = nn.Linear(10, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.25)

    def forward(self, u, v, ife, cfe):
        ### BEGIN SOLUTION
        U = self.user_emb(u)
        V = self.item_emb(v)
        IFE = self.item_feature_emb(ife)
        CFE = self.context_feature_emb(cfe)
        x = torch.cat([U, V, IFE, CFE], 1)    
        x = self.dropout(x)
        x = self.lin1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.lin2(x)
        return torch.sigmoid(x)
    
def train_one_epoch(model, train_df, optimizer):
    """ Trains the model for one epoch"""
    model.train()
    ### BEGIN SOLUTION
    users = torch.LongTensor(train_df.user_id.values)  
    items = torch.LongTensor(train_df.item_id.values) 
    item_features = torch.LongTensor(train_df.item_feature_id.values)  
    cont_features = torch.LongTensor(train_df.context_feature_id.values)
    ratings = torch.FloatTensor(train_df.rating.values)  
    
    y_hat = model(users, items, item_features, cont_features).squeeze(1)
    train_loss = F.binary_cross_entropy(y_hat, ratings)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    ### END SOLUTION
    return train_loss.detach()

def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    ### BEGIN SOLUTION
    users = torch.LongTensor(valid_df.user_id.values) 
    items = torch.LongTensor(valid_df.item_id.values)
    item_features = torch.LongTensor(valid_df.item_feature_id.values)  
    cont_features = torch.LongTensor(valid_df.context_feature_id.values)
    
    ratings = torch.FloatTensor(valid_df.rating.values) 
    y_hat = model(users, items, item_features, cont_features).squeeze(1)
    valid_loss = F.binary_cross_entropy(y_hat, ratings)
    valid_acc = sum(np.where(y_hat < 0.5, 0, 1) == ratings.numpy()) / len(ratings.numpy())
    ### END SOLUTION
    return valid_loss.detach(), valid_acc


def training(model, train_df, valid_df, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in range(epochs):
        train_loss = train_one_epoch(model, train_df, optimizer)
        valid_loss, valid_acc = valid_metrics(model, valid_df) 
        print(f" epoch {i+1}: train loss %.3f valid loss %.3f valid acc %.3f" % (train_loss, valid_loss, valid_acc)) 

In [27]:
num_users = train.user_id.max() + 1
num_items = train.item_id.max() + 1
num_item_features = item_df.item_feature_id.max() + 1
num_context_features = ratings_df.context_feature_id.max() + 1

In [28]:
model_dic = {}
size_range = [i for i in range(50, 51, 10)]
for size in size_range:
    print(size)
    model = MF(num_users, num_items, num_item_features, num_context_features, emb_size=size)
    training(model, train, valid, epochs = 10, lr=0.05, wd=1e-6)
    model_dic[size] = model

50
 epoch 1: train loss 0.677 valid loss 0.633 valid acc 0.667
 epoch 2: train loss 0.642 valid loss 0.617 valid acc 0.673
 epoch 3: train loss 0.628 valid loss 0.599 valid acc 0.684
 epoch 4: train loss 0.610 valid loss 0.581 valid acc 0.702
 epoch 5: train loss 0.592 valid loss 0.559 valid acc 0.730
 epoch 6: train loss 0.571 valid loss 0.525 valid acc 0.759
 epoch 7: train loss 0.540 valid loss 0.479 valid acc 0.792
 epoch 8: train loss 0.502 valid loss 0.427 valid acc 0.825
 epoch 9: train loss 0.460 valid loss 0.382 valid acc 0.850
 epoch 10: train loss 0.423 valid loss 0.352 valid acc 0.865


### Test predictions

In [29]:
test = pd.read_csv('test_kaggle.csv')
test = test.merge(item_df, on='item_id')
test["item_id"] = test["item_id"].apply(lambda x: item2idx.get(x, 0))
test["user_id"] = test["user_id"].apply(lambda x: userid2idx.get(x,0))
test = test.sort_values('id')
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id,item_feature_id
0,0,0,16033,2,142
25,1,0,21518,3,142
38,2,0,1879,1,142
77,3,0,27527,1,148
97,4,0,13755,2,63


In [30]:
users = torch.LongTensor(test.user_id.values)
items = torch.LongTensor(test.item_id.values)
item_features = torch.LongTensor(test.item_feature_id.values)
context_features = torch.LongTensor(test.context_feature_id.values)
for size in size_range:
    col_name = 'size_' + str(size)
    y_hat = model_dic[size](users, items, item_features, context_features)
    test[col_name] = y_hat.detach().numpy()

In [31]:
test = test[['id'] + [col for col in test.columns if 'size' in col]]
test.head()

Unnamed: 0,id,size_50
0,0,0.513939
25,1,0.551575
38,2,0.468621
77,3,0.077152
97,4,0.036618


In [32]:
test['rating'] = test.iloc[:, 1:].mean(axis=1)
test.head()

Unnamed: 0,id,size_50,rating
0,0,0.513939,0.513939
25,1,0.551575,0.551575
38,2,0.468621,0.468621
77,3,0.077152,0.077152
97,4,0.036618,0.036618


In [33]:
sub = test[['id', 'rating']]
sub.head()

Unnamed: 0,id,rating
0,0,0.513939
25,1,0.551575
38,2,0.468621
77,3,0.077152
97,4,0.036618


In [35]:
sub.loc[sub.rating < 0.2, 'rating'] = 0.2
sub.loc[sub.rating > 0.8, 'rating'] = 0.8
sub.rating.min(), sub.rating.max()

(0.20000000298023224, 0.800000011920929)

In [36]:
sub.to_csv('submission_40.csv', index=False)