# creating our enhanced model
### enhancements
- we need to model availability into our model 
- we need to model repeat consumption as users will repeatedly consume the days after but two days after less and 3 days after it is not as likely
- we need to model users with high interactions and users with low interactions. Tow models?

create a NCF
test it
create an availability measure. This could be doing a daily version where users are modelled by their day to day interactions? 
test model on upper and lower quartiles, see if there is a change in acc



In [4]:
import numpy as np
import pandas as pd
import os

file_path = os.path.join(os.getcwd(),'Datasets/100k_a.csv')
cols = ["user","stream","streamer","start","stop"]
data = pd.read_csv(file_path, header=None, names=cols)
data.user = pd.factorize(data.user)[0]+1
data['streamer_raw'] = data.streamer
data.streamer = pd.factorize(data.streamer)[0]+1
print("Num users: ", data.user.nunique())
print("Num streamers: ", data.streamer.nunique())
print("Num interactions: ", len(data))

def custom_train_test_split(interactions_matrix, test_size=0.2):
    # Set initial training and testing matrices
    train_matrix = np.zeros_like(interactions_matrix)
    test_matrix = np.zeros_like(interactions_matrix)
    
    # Ensure at least one interaction per user and per streamer in the training set
    for user in range(interactions_matrix.shape[0]):
        user_interactions = np.where(interactions_matrix[user, :] > 0)[0]
        if len(user_interactions) > 0:
            selected = np.random.choice(user_interactions, size=1)
            train_matrix[user, selected] = interactions_matrix[user, selected]

    for streamer in range(interactions_matrix.shape[1]):
        streamer_interactions = np.where(interactions_matrix[:, streamer] > 0)[0]
        if len(streamer_interactions) > 0:
            selected = np.random.choice(streamer_interactions, size=1)
            train_matrix[selected, streamer] = interactions_matrix[selected, streamer]

    # Distribute the rest of the interactions
    remaining_indices = np.where((interactions_matrix != train_matrix) & (interactions_matrix > 0))
    remaining_indices = list(zip(remaining_indices[0], remaining_indices[1]))
    test_indices = np.random.choice(np.arange(len(remaining_indices)), size=int(len(remaining_indices) * test_size), replace=False)
    test_indices = [remaining_indices[i] for i in test_indices]

    for user, streamer in test_indices:
        test_matrix[user, streamer] = interactions_matrix[user, streamer]
        train_matrix[user, streamer] = 0
    return train_matrix, test_matrix


#let's train on the top 100,000 entries. These are the entries which decrease the sparsity the most.
from sklearn.preprocessing import MinMaxScaler
#calculate the user incteraction frequency
user_interactions = data.groupby('user').size()
streamer_interactions = data.groupby('streamer').size()
#rank the users and streamers by their frequency
user_ranks = user_interactions.rank(method='first', ascending=False)
streamer_ranks = streamer_interactions.rank(method='first', ascending=False)
#score the interactions and select the top 100,000
data['user_score'] = data['user'].map(user_ranks)
data['streamer_score'] = data['streamer'].map(streamer_ranks)
data['combined_score'] = data['user_score'] + data['streamer_score']
data['rating'] = data.stop - data.start


top_data = data.nsmallest(100000, 'combined_score')
filtered_data = top_data.drop(columns=['user_score', 'streamer_score', 'combined_score'])

print("Num users: ", filtered_data.user.nunique())
print("Num streamers: ", filtered_data.streamer.nunique())
print("Num interactions: ", len(filtered_data))

# check for sparsity of the data
potential_num_interactions = filtered_data.streamer.nunique() * filtered_data.user.nunique()
num_interactions = len(filtered_data['streamer']) #each row in the filtered_data is an interaction
sparsity = (1-num_interactions /potential_num_interactions) * 100
print(f'sparsity is {sparsity: .2f}%')




Num users:  100000
Num streamers:  162625
Num interactions:  3051733
Num users:  1474
Num streamers:  1380
Num interactions:  100000
sparsity is  95.08%


In [5]:
#creating a user item interaction matrix
user_streamer_interactions = filtered_data.pivot_table(index='user', columns='streamer', values='rating', aggfunc='sum', fill_value=0).values
#normalize the items 
user_streamer_interactions = MinMaxScaler().fit_transform(user_streamer_interactions)

#this makes sure that all the users and streamers are represented in the training and testing data
train_interactions, test_interactions = custom_train_test_split(user_streamer_interactions, test_size=0.2)

print(user_streamer_interactions.shape, test_interactions.shape, train_interactions.shape)

(1474, 1380) (1474, 1380) (1474, 1380)


In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
#items in this will be streams 
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size, layers):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc_layers = nn.ModuleList([
            nn.Linear(layer[0], layer[1]) for layer in layers
        ])
        self.output = nn.Linear(layers[-1][1], 1)
        self.relu = nn.ReLU()
        #implicit
        self.criterion = nn.BCELoss()
        #explicit
        #self.criterion = nn.MSELoss()
        self.optimiser = torch.optim.Adam(self.parameters(), lr = 0.001)


    def forward(self, user_id, item_id):
        '''this takes in the available items as a vector of 1-n_of_streams
        it multiplies the '''
        user_vec = self.user_embedding(user_id)
        item_vec = self.item_embedding(item_id)
        vector = torch.cat([user_vec, item_vec], dim=-1)
        for layer in self.fc_layers:
            vector = self.relu(layer(vector))
        prediction = torch.sigmoid(self.output(vector))
        return prediction
    
    def train(self, num_epochs, data):
        train_loader = 
        for epoch in range(num_epochs):
            print(f'epoch {epoch} started\n')
            for user_in, item_in, labels in train_loader:
                #send all to gpu
                predictions = self.forward(user_in, item_in, labels)
                loss = self.criterion(predictions, labels)
                self.optimiser.zero_grad()
                loss.backward()
                self.optimiser.step()
        
 

NameError: name '_C' is not defined