In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd drive/My Drive/vk_intern/

/content/drive/My Drive/vk_intern


In [4]:
pwd

'/content/drive/My Drive/vk_intern'

In [6]:
# df_movies = pd.read_csv('movie.csv')
# df_ratings = pd.read_csv('rating.csv')
# df_tag = pd.read_csv('genome_scores.csv')

In [21]:
def filter_data():
    df_rating = pd.read_csv('rating.csv')
    df_tag = pd.read_csv('genome_scores.csv')
    n_users = int(len(df_rating.userId.unique()) * 0.1)
    print(f'Initial dataset size: {df_rating.shape[0]} ratings')

    grouped = df_rating.groupby(['movieId'])
    n_movies = grouped.size()
    index_names = n_movies[n_movies > n_users].index

    filtered_df_rating = df_rating[df_rating['movieId'].isin(index_names)]
    filtered_df_tag = df_tag[df_tag['movieId'].isin(index_names)]
    print(f'Filtered dataset size: {filtered_df_rating.shape[0]} ratings')
    print(
        f'Reduced dataset size on {np.round((df_rating.shape[0] - filtered_df_rating.shape[0]) / df_rating.shape[0], 2) * 100}%')

    filtered_df_rating.to_csv('train_rating.csv', index=False)
    filtered_df_tag.to_csv('filtered_tag.csv', index=False)

    return filtered_df_rating, filtered_df_tag

In [22]:
filtered_df_rating, filtered_df_tag = filter_data()

Initial dataset size: 20000263 ratings
Filtered dataset size: 7078836 ratings
Reduced dataset size on 65.0%


In [23]:
MIN_TEST_SIZE = 0.18
MAX_TEST_SIZE = 0.35
TARGET_TEST_SIZE = 0.3
SAMPLE_SIZE = 5000

def get_user_timestamp_threshold():
    """ 1. Picks a sample_size of unique ids of users.
        2. Sorts timestamps of chosen users in ascending order.
        3. Takes the the percentile timestamp.
        4. Returns the median of computed timestamp percentiles.
    """
    df = pd.read_csv('train_rating.csv')
    user_id = np.unique(df["userId"].to_numpy())
    user_id_sample = random.sample(list(user_id), SAMPLE_SIZE)
    thirty_percentile_timestamp = []

    for user in user_id_sample:
        user_timestamp = df.loc[df["userId"] == user, "timestamp"]
        sorted_timestamp = user_timestamp.sort_values(ascending=False).to_numpy()
        current_percentile = sorted_timestamp[int(len(sorted_timestamp) * TARGET_TEST_SIZE)]
        thirty_percentile_timestamp.append(current_percentile)

    sorted_percentile_timestamp = pd.Series(thirty_percentile_timestamp).sort_values(ascending=False)
    optimal_split_value = int(SAMPLE_SIZE * 0.7)
    return sorted_percentile_timestamp.to_numpy()[optimal_split_value]

In [24]:
def split_data():
    for _ in range(5):
        threshold_timestamp = get_user_timestamp_threshold()
        df = pd.read_csv('train_rating.csv')
        train_df = df[df["timestamp"] > threshold_timestamp]
        test_df = df[df["timestamp"] <= threshold_timestamp]
        test_size = np.round(test_df.shape[0] / df.shape[0], 2)
        print(f'Found a split with test size: ~{test_size}')
        if MIN_TEST_SIZE < test_size < MAX_TEST_SIZE:
            print(f'Seems good enough, will save!')
            return train_df, test_df

In [25]:
train_df, test_df = split_data()

Found a split with test size: ~0.22
Seems good enough, will save!


In [26]:
train_df.to_csv("train_rating.csv", index=False)

In [27]:
train_df, val_df = split_data()

Found a split with test size: ~0.28
Seems good enough, will save!


In [28]:
train_df.to_csv("train_rating.csv", index=False)
val_df.to_csv("val_rating.csv", index=False)
test_df.to_csv("test_rating.csv", index=False)

In [29]:
MIN_RATING = 4
K = 25
SEED = 42

def compute_precision(predictions, validation_dataset_path: str, movie_filtered_ids, user_filtered_ids, users,
                      nested_pred=True) -> float:
    inverse_encoding_us = dict(zip(user_filtered_ids.values(), user_filtered_ids.keys()))
    inverse_encoding_mv = dict(zip(movie_filtered_ids.values(), movie_filtered_ids.keys()))

    df = pd.read_csv(validation_dataset_path)
    precision = []

    for i, user in enumerate(users):
        y_pred = predictions[i]
        movies_of_user = df[df["userId"] == inverse_encoding_us[user]]

        relevant_movies = movies_of_user['rating'] >= MIN_RATING
        relevant_movies = movies_of_user[relevant_movies]["movieId"].tolist()

        irrelevant_movies = movies_of_user["rating"] < MIN_RATING
        irrelevant_movies = movies_of_user[irrelevant_movies]["movieId"].tolist()

        true_positive, false_positive = 0, 0

        for movie in y_pred:
            if nested_pred:
                movie = movie[0]
            movie_id = inverse_encoding_mv[movie]
            if movie_id in relevant_movies:
                true_positive += 1
            elif movie_id in irrelevant_movies:
                false_positive += 1

        if true_positive > 0 or false_positive > 0:
            precision.append(true_positive / (true_positive + false_positive))

    print(f'Evaluated on {len(precision)} users')

    return np.mean(precision)

In [30]:
!pip install lightfm
from scipy import sparse
from typing import Tuple
from lightfm.data import Dataset



In [31]:
def mask_movies(matrix):
    """ Converts all ratings bellow min_rating into -1 (disliked by user),
        all ratings above min_rating into 1 (liked by user)
    """
    matrix = matrix.tocsr()
    irrelevant_movies_mask = np.array(matrix[matrix.nonzero()] < MIN_RATING)[0]
    matrix[matrix.nonzero()[0][irrelevant_movies_mask], matrix.nonzero()[1][irrelevant_movies_mask]] = -1
    matrix[matrix > 0] = 1
    return matrix.tocoo()


def make_dataset(rating_dataset_path, tag_dataset_path):

    df_rating = pd.read_csv(rating_dataset_path)
    df_tag = pd.read_csv(tag_dataset_path)

    dataset = Dataset()
    dataset.fit(df_rating["userId"].unique(),
                df_rating["movieId"].unique(),
                item_features=df_tag["tagId"].unique())

    _, weights = dataset.build_interactions(
        [tuple(row) for row in df_rating.drop(["timestamp"], axis=1).values]
    )
    user_item_interaction = mask_movies(weights)
    item_features = dataset.build_item_features([(row[0], {row[1]: row[2]}) for row in df_tag.values])

    return user_item_interaction, item_features, dataset

In [32]:
from lightfm import LightFM

In [33]:
class ModelLightFM:
    def __init__(self):
        self.train_rating_path = 'train_rating.csv'
        self.val_rating_path = 'val_rating.csv'
        self.tag_csv_path = 'filtered_tag.csv'
        self.predictions = None
        self.unique_movies = None
        self.users_to_predict = None
        self.mapping_user_ids = None
        self.mapping_item_ids = None

    def fit(self, latent_size: int, learning_rate: float, item_alpha: float, epochs: int) -> LightFM:

        model = LightFM(no_components=latent_size,
                        learning_schedule="adagrad",
                        loss="warp",
                        # loss="logistic",
                        learning_rate=learning_rate,
                        item_alpha=item_alpha,
                        random_state=SEED)

        interactions, item_features, dataset = make_dataset(self.train_rating_path, self.tag_csv_path)
        self.mapping_user_ids, _, self.mapping_item_ids, _ = dataset.mapping()
        print('Dataset is built! Start fitting the model...')
        model.fit(interactions, item_features=item_features, epochs=epochs, verbose=True)
        return model

    def predict(self, latent_size: int, learning_rate: float, item_alpha: float, epochs: int):
        model = self.fit(latent_size, learning_rate, item_alpha, epochs)

        self.unique_movies = self.mapping_item_ids.values()
        unique_user_keys = self.mapping_user_ids.keys()

        df = pd.read_csv(self.val_rating_path)
        val_users = df["userId"].unique()

        self.users_to_predict = [self.mapping_user_ids[user] for user in val_users if user in unique_user_keys]
        print(f'Model is fitted, start making predictions!')
        self.predictions = []

        for i, user in enumerate(self.users_to_predict):
            print(f'\rPredicted top@{K} movies for {i+1}/{len(self.users_to_predict)} users...', end="")
            input_user_id = np.array([user for _ in range(len(self.unique_movies))], dtype=np.int32)
            scores = model.predict(input_user_id, np.array(list(self.unique_movies), dtype=np.int32))
            movie_scores = dict(zip(self.unique_movies, scores))
            sorted_movie_scores = {k: v for k, v in sorted(movie_scores.items(), key=lambda item: item[1], reverse=True)}
            self.predictions.append(list(sorted_movie_scores.keys())[:K])
        print(f'\n Predictions are saved! Let me compute precision@{K}.')
        return self.predictions, self.users_to_predict, self.mapping_user_ids, self.mapping_item_ids

    def get_metric(self) -> float:
        precision = compute_precision(self.predictions, self.val_rating_path, self.mapping_item_ids,
                                      self.mapping_user_ids, self.users_to_predict, nested_pred=False)
        return precision

In [34]:
latent_size = 10
learning_rate = 0.01
item_alpha = 0
epochs = 2


lightfm = ModelLightFM()
print(f'\n Latent_size: {latent_size}\n Learning rate: {learning_rate}\n Item alpha: {item_alpha}\n '
          f'Epochs: {epochs}')
lightfm.predict(latent_size, learning_rate, item_alpha, epochs)
precision = lightfm.get_metric()
print(f'Precision@{K}:{precision}')


 Latent_size: 10
 Learning rate: 0.01
 Item alpha: 0
 Epochs: 2
Dataset is built! Start fitting the model...


Epoch: 100%|██████████| 2/2 [19:29<00:00, 584.83s/it]


Model is fitted, start making predictions!
Predicted top@25 movies for 3934/3934 users...
 Predictions are saved! Let me compute precision@25.
Evaluated on 3769 users
Precision@25:0.7698487929082714


ALS

In [53]:
import plotly.graph_objects as go
def plot(loss, iterations):
    x = np.arange(len(loss))
    fig = go.Figure(data=go.Scatter(x=x, y=loss))
    fig.update_layout(title=f'Loss convergence: {iterations} iterations',
                      xaxis_title='Iteration',
                      yaxis_title='Loss')
    fig.show()

In [36]:
!pip install implicit



In [37]:
import implicit

  f"CUDA extension is built, but disabling GPU support because of '{e}'",


In [38]:
loss_story = []

In [39]:

def store_loss(iteration, elapsed, loss):
    loss_story.append(loss)

class ALS:
    def __init__(self):
        self.sparse_item_user = None
        self.sparse_user_item = None
        self.predictions_encoded = None
        self.irrelevant_movies = None
        self.relevant_users = None
        self.predictions = None

    @staticmethod
    def read_dataset(data_path: str) -> Tuple[pd.Series, pd.Series, pd.Series]:
        df = pd.read_csv(data_path)
        movies = df["movieId"]
        users = df["userId"]
        rating = df["rating"]
        return movies, users, rating

    def _get_data_encoded(self, data_path: str) -> None:
        df = pd.read_csv(data_path)
        users, movies = df["userId"], df["movieId"]
        # dict with keys: original movie / user id and values: relevant movie/user id from range 0...len(unique()))
        self.movie_filtered_ids = dict(zip(movies.unique(), range(len(movies.unique()))))
        self.user_filtered_ids = dict(zip(users.unique(), range(len(users.unique()))))

    def _get_sparse_matrix(self, data_path: str) -> Tuple[sparse.csr_matrix, sparse.csr_matrix]:
        movies, users, rating = self.read_dataset(data_path)
        n_movies, n_users = len(movies.unique()), len(users.unique())

        rating.where(rating < MIN_RATING, 0, inplace=True)
        rating.where(rating >= MIN_RATING, 1, inplace=True)

        movie_encoded_idx = [self.movie_filtered_ids[movie] for movie in movies]
        user_encoded_idx = [self.user_filtered_ids[user] for user in users]

        self.sparse_item_user = sparse.csr_matrix((rating, (movie_encoded_idx, user_encoded_idx)),
                                                  shape=(n_movies, n_users))
        self.sparse_user_item = sparse.csr_matrix((rating, (user_encoded_idx, movie_encoded_idx)),
                                                  shape=(n_users, n_movies))
        return self.sparse_item_user, self.sparse_user_item

    def train(self, latent_dim: int, regularization: float, iterations: int, alpha: float, train_loss: bool,
              train_dataset_path: str):

        self._get_data_encoded(train_dataset_path)

        self.sparse_item_user, self.sparse_user_item = self._get_sparse_matrix(train_dataset_path)

        model = implicit.als.AlternatingLeastSquares(factors=latent_dim,
                                                     regularization=regularization,
                                                     iterations=iterations,
                                                     calculate_training_loss=train_loss,
                                                     random_state=SEED)


        matrix = (self.sparse_item_user * alpha).tocsr().astype(float)
        matrix.data += 1.
        model.fit_callback = store_loss
        model.fit(matrix)

        return model
    

    def filter_users_for_validation(self, validation_dataset_path: str) -> None:
        movies, users, rating = self.read_dataset(validation_dataset_path)

        unique_movies = movies.unique()
        unique_users = users.unique()

        unique_movie_keys = self.movie_filtered_ids.keys()
        unique_user_keys = self.user_filtered_ids.keys()

        self.irrelevant_movies = [movie for movie in unique_movie_keys if self.movie_filtered_ids[movie] not in unique_movies]
        self.relevant_users = [self.user_filtered_ids[user] for user in unique_users if user in unique_user_keys]

    def get_encoded_predictions(self, model, val_dataset_path):
        self.filter_users_for_validation(val_dataset_path)
        self.predictions = []
        for user in self.relevant_users:
            top_k_movies = self.predict(model, user)
            self.predictions.append(top_k_movies)
        return self.predictions

    def predict(self, model, user: int) -> np.ndarray:
        top_k_movies = model.recommend(user,
                                       self.sparse_user_item.tocsr().astype(float),
                                       N=25,
                                       filter_already_liked_items=False,
                                       filter_items=self.irrelevant_movies)
        return top_k_movies

    def get_metric(self, validation_dataset_path: str):
        precision = compute_precision(self.predictions, validation_dataset_path, self.movie_filtered_ids,
                                      self.user_filtered_ids, self.relevant_users)
        return precision

    def test(self, model):
        predictions = self.get_encoded_predictions(model, 'test_rating.csv')
        precision = self.get_metric('test_rating.csv')
        return predictions, self.relevant_users, self.movie_filtered_ids, self.user_filtered_ids, precision

In [40]:
factors = 20
iterations = 30
alpha = 40
regularization = 0.1
train_loss = True
train_dataset_path = 'train_rating.csv'
val_dataset_path = 'val_rating.csv'
eval_mode = True
loss_story = []

als = ALS()

print(f'\n Factors: {factors}\n Iterations: {iterations}\n Alpha: {alpha}\n '
          f'Regularization: {regularization}')
model = als.train(factors, 
                  regularization,
                  iterations,
                  alpha,
                  train_loss,
                  train_dataset_path)



 Factors: 20
 Iterations: 30
 Alpha: 40
 Regularization: 0.1


  "OpenBLAS detected. Its highly recommend to set the environment variable "


  0%|          | 0/30 [00:00<?, ?it/s]

In [54]:
plot(loss_story, iterations)

In [42]:
if eval_mode:
    predictions = als.get_encoded_predictions(model, 'val_rating.csv')
    precision = als.get_metric(val_dataset_path)
    print(f'Precision@{K}\n{precision}')

IndexError: ignored