# project plan

The recommender will use collaborative filtering as the dataset only has user -> streamer interactions this means we can match users to streams by getting similar interactions such as when users start watching, how long they watch and who they watch. 

we can then use more comlplex techniques to model the users returning to streams and to model trends in user interaction.

on this paper https://cs229.stanford.edu/proj2014/Christopher%20Aberger,%20Recommender.pdf of the algorithms discussed we have Biased SGD was one of the best algorithms but alternating Least squares worked best for a very sparse dataset.

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import surprise


# Loading the DATA


In [2]:
file_path = os.path.join(os.getcwd(),'Datasets/100k_a.csv')
cols = ["user","stream","streamer","start","stop"]
data = pd.read_csv(file_path, header=None, names=cols)
data.user = pd.factorize(data.user)[0]+1
data['streamer_raw'] = data.streamer
data.streamer = pd.factorize(data.streamer)[0]+1
print("Num users: ", data.user.nunique())
print("Num streamers: ", data.streamer.nunique())
print("Num interactions: ", len(data))




Num users:  100000
Num streamers:  162625
Num interactions:  3051733


# data preperation 

make sure the data is 100k in size, make sure any preperation steps are **explained** and **justified**


In [3]:

def custom_train_test_split(interactions_matrix, test_size=0.2):
    # Set initial training and testing matrices
    train_matrix = np.zeros_like(interactions_matrix)
    test_matrix = np.zeros_like(interactions_matrix)
    
    # Ensure at least one interaction per user and per streamer in the training set
    for user in range(interactions_matrix.shape[0]):
        user_interactions = np.where(interactions_matrix[user, :] > 0)[0]
        if len(user_interactions) > 0:
            selected = np.random.choice(user_interactions, size=1)
            train_matrix[user, selected] = interactions_matrix[user, selected]

    for streamer in range(interactions_matrix.shape[1]):
        streamer_interactions = np.where(interactions_matrix[:, streamer] > 0)[0]
        if len(streamer_interactions) > 0:
            selected = np.random.choice(streamer_interactions, size=1)
            train_matrix[selected, streamer] = interactions_matrix[selected, streamer]

    # Distribute the rest of the interactions
    remaining_indices = np.where((interactions_matrix != train_matrix) & (interactions_matrix > 0))
    remaining_indices = list(zip(remaining_indices[0], remaining_indices[1]))
    test_indices = np.random.choice(np.arange(len(remaining_indices)), size=int(len(remaining_indices) * test_size), replace=False)
    test_indices = [remaining_indices[i] for i in test_indices]

    for user, streamer in test_indices:
        test_matrix[user, streamer] = interactions_matrix[user, streamer]
        train_matrix[user, streamer] = 0
    return train_matrix, test_matrix



In [4]:
#let's train on the top 100,000 entries. These are the entries which decrease the sparsity the most.
from sklearn.preprocessing import MinMaxScaler
#calculate the user incteraction frequency
user_interactions = data.groupby('user').size()
streamer_interactions = data.groupby('streamer').size()
#rank the users and streamers by their frequency
user_ranks = user_interactions.rank(method='first', ascending=False)
streamer_ranks = streamer_interactions.rank(method='first', ascending=False)
#score the interactions and select the top 100,000
data['user_score'] = data['user'].map(user_ranks)
data['streamer_score'] = data['streamer'].map(streamer_ranks)
data['combined_score'] = data['user_score'] + data['streamer_score']
data['rating'] = data.stop - data.start


top_data = data.nsmallest(100000, 'combined_score')
filtered_data = top_data.drop(columns=['user_score', 'streamer_score', 'combined_score'])

print("Num users: ", filtered_data.user.nunique())
print("Num streamers: ", filtered_data.streamer.nunique())
print("Num interactions: ", len(filtered_data))

# check for sparsity of the data
potential_num_interactions = filtered_data.streamer.nunique() * filtered_data.user.nunique()
num_interactions = len(filtered_data['streamer']) #each row in the filtered_data is an interaction
sparsity = (1-num_interactions /potential_num_interactions) * 100
print(f'sparsity is {sparsity: .2f}%')




Num users:  1474
Num streamers:  1380
Num interactions:  100000
sparsity is  95.08%


### creating the user - streamer matrix for SVD

make a matrix of users - streams this will then be used to push streamers to the user as a stream is pushed a streamer is the selected by this stream. 

**potentially introduce bias to streamers that post a lot of streams**

In [5]:
user_streamer_interactions = filtered_data.pivot_table(index='user', columns='streamer', values='rating', aggfunc='sum', fill_value=0).values
#normalize the items 
user_streamer_interactions = MinMaxScaler().fit_transform(user_streamer_interactions)

#this makes sure that all the users and streamers are represented in the training and testing data
train_interactions, test_interactions = custom_train_test_split(user_streamer_interactions, test_size=0.2)

print(user_streamer_interactions.shape, test_interactions.shape, train_interactions.shape)

(1474, 1380) (1474, 1380) (1474, 1380)


In [6]:
def check_overlap(train_matrix, test_matrix):
    # Check if any element is present in both matrices
    overlap = np.sum((train_matrix > 0) & (test_matrix > 0))
    return overlap > 0

overlap_exists = check_overlap(train_interactions, test_interactions)
print("Is there overlap in the train and test sets?", overlap_exists)


Is there overlap in the train and test sets? False


# SVD implementation
we will train the model using biased stochiastic gradient descent 

C. Aberger, "Recommender," Project Report, CS229, Stanford University, 2014. [Online]. Available: https://cs229.stanford.edu/proj2014/Christopher%20Aberger,%20Recommender.pdf. [Accessed: Dec. 23, 2023].


### SVD using BSGD class creation

In [9]:
class basic_recommender:
    def __init__(self, dataset):
        self.data = dataset

    def predict_single(self, user,streamer):
        return (self.global_bias + self.user_biases[user] + self.streamer_biases[streamer] + self.user_features[user,:].dot(self.streamer_features[streamer,:]))
   
    
    def train(self, epochs, n_features, learning_rate, regularisation_strength):
        n_users = self.data.shape[0]
        n_streamers = self.data.shape[1]
        self.user_features = np.random.normal(0,0.1, (n_users, n_features))
        self.streamer_features = np.random.normal(0,0.1, (n_streamers, n_features))
        self.user_biases = np.zeros(n_users)
        self.streamer_biases = np.zeros(n_streamers)
        self.global_bias = np.mean(self.data[self.data != 0])


        
        for _ in range(epochs):
            for user in range(n_users):
                for streamer in range(n_streamers):
                    interaction = self.data[user, streamer]
                    if interaction > 0: #makes sure you skip values that are non interactions
                        prediction = self.predict_single(user, streamer)
                        
                        error = interaction - prediction

                        self.user_features[user,:] += learning_rate * (error * self.streamer_features[streamer,:] - regularisation_strength * self.user_features[user, :])
                        self.streamer_features[streamer,:] += learning_rate * (error * self.user_features[user,:] - regularisation_strength * self.streamer_features[streamer, :])
                        self.user_biases[user] += learning_rate * (error - regularisation_strength*self.user_biases[user])
                        self.streamer_biases[streamer] += learning_rate * (error - regularisation_strength*self.streamer_biases[streamer])

    def predict(self, dataset):
        predictions = np.zeros(dataset.shape)
        for user in range(dataset.shape[0]):
            for streamer in range(dataset.shape[1]):
                if dataset[user, streamer] > 0:
                    predictions[user, streamer] = self.predict_single(user,streamer)
        return predictions
    
    def evaluate(self, dataset):
        predictions = self.predict(dataset)
        predictions = predictions[dataset != 0].flatten()
        actual = dataset[dataset != 0].flatten()

        return np.sqrt(mean_squared_error(actual, predictions))
        


# model evaluation 
first we will check the model against the surprise SVD as a baseline

In [10]:

from surprise import Dataset, Reader

user_ids, item_ids = np.where(train_interactions > 0)  # Get user, item indices for non-zero interactions
ratings = train_interactions[user_ids, item_ids]  # Extract corresponding ratings

# Create a DataFrame from train_interactions
train_df = pd.DataFrame({
    'userID': user_ids,
    'itemID': item_ids,
    'rating': ratings
})
# Assuming test_interactions is your test data in matrix form
user_ids, item_ids = np.where(test_interactions > 0)  # Get user, item indices for non-zero interactions
ratings = test_interactions[user_ids, item_ids]  # Extract corresponding ratings

# Create the testset as a list of tuples
testset = list(zip(user_ids, item_ids, ratings))


# Load the train set into Surprise
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
train_data = Dataset.load_from_df(train_df[['userID', 'itemID', 'rating']], reader)
trainset = train_data.build_full_trainset()  # Build the trainset


In [11]:

from surprise.model_selection import train_test_split


# Splitting the dataset into train and test set


from surprise import SVD
from surprise.accuracy import rmse

# Example hyperparameters
n_features = 100  # Number of latent factors
n_epochs = 20    # Number of epochs
lr_all = 0.005   # Learning rate
reg_all = 0.02   # Regularization term

algo = SVD(n_factors=n_features, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)
algo.fit(trainset)

# Make predictions on the test set
predictions = algo.test(testset)

# Calculate RMSE
surprise_rmse = rmse(predictions)


RMSE: 0.2589


In [12]:
from surprise import SVDpp
algo = SVDpp(n_factors=n_features, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)
algo.fit(trainset)

# Make predictions on the test set
predictions = algo.test(testset)

# Calculate RMSE
surprise_rmse = rmse(predictions)

RMSE: 0.2777


In [13]:
SVD = basic_recommender(train_interactions)
SVD.train(
    epochs=n_epochs,
    n_features=n_features,
    learning_rate=lr_all,
    regularisation_strength=reg_all,
)
print(SVD.evaluate(test_interactions))

0.2569604620767916


### model saving and use in interface

In [None]:
# as my model works the best we will use it in the app. 