In [2]:
import numpy as np
import pandas as pd

from multiprocessing import Pool

from sklearn.model_selection import KFold, train_test_split
books = pd.read_csv("books.csv")[["book_id", "title", "authors"]]
ratings = pd.read_csv("ratings.csv")

In [3]:
minimum_number_of_books_rated_in_common = 10

In [4]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
ratings.user_id.value_counts().min()

19

In [6]:
user_ids = ratings.user_id.unique()
# np.random.seed(42)
# np.random.shuffle(user_ids)
print(f"Number of user: {len(user_ids)}")

Number of user: 53424


In [7]:
def scoring(column):
    # Consider those users with at least a similarity of minimal_similarity
    neighbours = similarities > minimal_similarity
    # Calculate weighted mean of ratings as scores
    numerator = np.sum(column[neighbours]*similarities[neighbours])
    denominator = np.sum(similarities[neighbours][column[neighbours].notna()])
    predicted_rating = numerator/denominator if denominator != 0 else np.nan

    # If book has been rated less than minimal_number_of_ratings, set its score to nan
    if column[neighbours].notna().sum() <= minimal_number_of_ratings:
        predicted_rating = np.nan
    return predicted_rating

In [15]:
class CollaborativeFilter():
    def __init__(self, ratings: pd.DataFrame, user_col: str="user_id", item_col: str="book_id") -> None:
        self.ratings = ratings
        self.user_col = user_col
        self.item_col = item_col
        
    def pairwise_correlation(self, A, B):
        am = A - np.mean(A, axis=0, keepdims=True)
        bm = B - np.mean(B, axis=0, keepdims=True)
        return am.T @ bm /  (np.sqrt(
            np.sum(am**2, axis=0, keepdims=True)).T * np.sqrt(np.sum(bm**2, axis=0, keepdims=True)))
        
    def get_similarities(self, input_ratings: pd.DataFrame) -> pd.DataFrame:
        relevant_ratings = pd.merge(self.ratings, input_ratings[self.item_col], on=self.item_col, how='inner')
        user_id = input_ratings[self.user_col].unique()[0]
        relevant_ratings = pd.concat([relevant_ratings[relevant_ratings[self.user_col]!=user_id], input_ratings])
        uii_matrix = relevant_ratings.pivot_table(
           index=[self.user_col], 
           columns=[self.item_col], 
           values="rating").fillna(np.nan)
        # return uii_matrix.apply(
            # lambda x: self.pairwise_correlation(x.values, uii_matrix.loc[user_id].values), axis=1)
        
        # return self.pairwise_correlation(uii_matrix.values.T, uii_matrix.loc[user_id].values.T)
        return uii_matrix
        
    def get_similarities_old(self, input_ratings: pd.DataFrame) -> pd.DataFrame:
        relevant_ratings = pd.merge(self.ratings, input_ratings[self.item_col], on=self.item_col, how='inner')
        user_id = input_ratings[self.user_col].unique()[0]
        relevant_ratings = pd.concat([relevant_ratings[relevant_ratings[self.user_col]!=user_id], input_ratings])
        uii_matrix = relevant_ratings.pivot_table(
           index=[self.user_col], 
           columns=[self.item_col], 
           values="rating").fillna(np.nan)
        return uii_matrix.corrwith(uii_matrix.loc[user_id], axis=1)

In [9]:
K=8
kf = KFold(n_splits=K, shuffle=True, random_state=42)
kf.get_n_splits(user_ids)
print(kf)
for i, (train_index, test_index) in enumerate(kf.split(user_ids)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    train_user_ids, test_user_ids = user_ids[train_index], user_ids[test_index]
    train_ratings, test_ratings = ratings[ratings.user_id.isin(train_user_ids)], \
                                    ratings[ratings.user_id.isin(test_user_ids)]
    input_ratings, heldout_ratings = train_test_split(test_ratings, 
                                                      stratify=test_ratings.user_id, 
                                                      test_size=0.1, 
                                                      random_state=42)
    cf = CollaborativeFilter(train_ratings)
    
    # train_ratings = pd.concat([train_ratings, input_ratings])
    # uii_matrix = train_ratings.pivot_table(
    #   index=["user_id"], 
    #   columns=["book_id"], 
    #   values="rating").fillna(np.nan)
    # similarities = uii_matrix.corr(min_periods=minimum_number_of_books_rated_in_common)
    mae = 0
    coverage = 0
    for user_id in test_user_ids[:1]:
        similarities = cf.get_similarities(input_ratings[input_ratings.user_id==user_id])
        
        # minimum_number_of_books_rated_in_common = 10
        # # Only use those which have an intersection of more than n books
        # my_books_read = uii_matrix.loc[user_id].notna()
        # intersections = uii_matrix.apply(lambda x: (x.notna() & my_books_read).sum(), axis=1)
        # similarities[intersections < minimum_number_of_books_rated_in_common] = np.nan

        # # Remove self similarity
        # similarities[user_id] = np.nan

        minimal_similarity = 0.7
        minimal_number_of_ratings = 5

        predicted_scores = uii_matrix.apply(lambda x: scoring(x))
        true_scores = heldout_ratings[heldout_ratings.user_id==user_id]
        predictions = true_scores.merge(predicted_scores.rename('scores'), on='book_id', how='left')
        coverage += 1-predictions.scores.isna().sum()/len(predictions)
        mae += (predictions.rating-predictions.scores).abs().mean()
    coverage /= len(test_user_ids)
    mae /= len(test_user_ids)
        
        
    if i==0:
        break

KFold(n_splits=8, random_state=42, shuffle=True)
Fold 0:
  Train: index=[    0     1     2 ... 53421 53422 53423]
  Test:  index=[    4     6    34 ... 53397 53400 53412]


NameError: name 'uii_matrix' is not defined

In [16]:
cf = CollaborativeFilter(train_ratings)

In [17]:
uii = cf.get_similarities(input_ratings[input_ratings.user_id==user_id])

In [22]:
uii.values.shape

(38096, 88)

In [24]:
uii.loc[user_id].values.shape

(88,)

In [14]:
similarities.min()

nan

In [None]:
from time import time
start_time = time()
similarities = cf.get_similarities(input_ratings[input_ratings.user_id==user_id])
print(time()-start_time)

In [73]:
user_id

(0, 8)

In [72]:
input_ratings[input_ratings.user_id==user_id]

Unnamed: 0,user_id,book_id,rating
5865646,30318,1267,3
3213073,39103,2925,3
1880951,26054,2523,3
3190520,39022,9194,5
5125670,52142,6314,5
...,...,...,...
4731607,50060,2536,4
530426,9710,3585,1
4626623,40705,25,4
3968512,15459,12,2


In [9]:
K=8
kf = KFold(n_splits=K, shuffle=True, random_state=42)
kf.get_n_splits(user_ids)
print(kf)
for i, (train_index, test_index) in enumerate(kf.split(user_ids)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    train_user_ids, test_user_ids = user_ids[train_index], user_ids[test_index]
    train_ratings, test_ratings = ratings[ratings.user_id.isin(train_user_ids)], \
                                    ratings[ratings.user_id.isin(test_user_ids)]
    input_ratings, heldout_ratings = train_test_split(test_ratings, 
                                                      stratify=test_ratings.user_id, 
                                                      test_size=0.1, 
                                                      random_state=42)
    train_ratings = pd.concat([train_ratings, input_ratings])
    uii_matrix = train_ratings.pivot_table(
      index=["user_id"], 
      columns=["book_id"], 
      values="rating").fillna(np.nan)
    # similarities = uii_matrix.corr(min_periods=minimum_number_of_books_rated_in_common)
    mae = 0
    coverage = 0
    for user_id in enumerate(test_user_ids[:1]):
        similarities = uii_matrix.corrwith(uii_matrix.loc[user_id], axis=1)
        
        minimum_number_of_books_rated_in_common = 10

        # Only use those which have an intersection of more than n books
        my_books_read = uii_matrix.loc[user_id].notna()
        intersections = uii_matrix.apply(lambda x: (x.notna() & my_books_read).sum(), axis=1)
        similarities[intersections < minimum_number_of_books_rated_in_common] = np.nan

        # Remove self similarity
        similarities[user_id] = np.nan

        minimal_similarity = 0.7
        minimal_number_of_ratings = 5

        predicted_scores = uii_matrix.apply(lambda x: scoring(x))
        true_scores = heldout_ratings[heldout_ratings.user_id==user_id]
        predictions = true_scores.merge(predicted_scores.rename('scores'), on='book_id', how='left')
        coverage += 1-predictions.scores.isna().sum()/len(predictions)
        mae += (predictions.rating-predictions.scores).abs().mean()
    coverage /= len(test_user_ids)
    mae /= len(test_user_ids)
        
        
    if i==0:
        break

KFold(n_splits=8, random_state=42, shuffle=True)
Fold 0:
  Train: index=[    0     1     2 ... 53421 53422 53423]
  Test:  index=[    4     6    34 ... 53397 53400 53412]


  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


In [14]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

In [16]:
def get_similarities(user_id):
    return uii_matrix.corrwith(uii_matrix.loc[user_id], axis=1)

In [19]:
len(test_user_ids)

6678

In [23]:
from time import time
start_time = time()
with ProcessPoolExecutor(max_workers=8) as executor:
    results = executor.map(get_similarities, test_user_ids[:8])
print(time()-start_time)

  c = cov(x, y, rowvar)
  c = cov(x, y, rowvar)
  c = cov(x, y, rowvar)
  c = cov(x, y, rowvar)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


81.79463362693787


In [None]:
            with Pool(processes=4) as p:
                similarities = p.map(uii_matrix.corrwith(uii_matrix.loc[user_id], axis=1) for user_id in test_user_ids],
                        )



In [10]:
len(test_user_ids)

user_id
1        0.190494
2             NaN
3             NaN
4       -0.081870
5             NaN
           ...   
53420         NaN
53421    0.575282
53422         NaN
53423         NaN
53424         NaN
Length: 53424, dtype: float64

In [12]:
uii_matrix.shape

(53424, 10000)

In [20]:
(predictions.rating-predictions.scores).abs().mean()

0.8185837954257885

In [22]:
(predictions[~predictions.scores.isna()].rating-predictions[~predictions.scores.isna()].scores).abs().mean()

0.8185837954257885

In [23]:
predictions[~predictions.scores.isna()]

Unnamed: 0,user_id,book_id,rating,scores
0,8,1214,5,3.887177
3,8,531,3,2.80786
4,8,80,5,4.593925
6,8,1937,3,4.15506
7,8,4,3,4.068401
8,8,264,3,4.059791
9,8,65,5,4.264203


In [18]:
predictions = true_scores.merge(predicted_scores.rename('scores'), on='book_id', how='left')
predictions.head(20)

Unnamed: 0,user_id,book_id,rating,scores
0,8,1214,5,3.887177
1,8,3700,4,
2,8,2781,4,
3,8,531,3,2.80786
4,8,80,5,4.593925
5,8,782,3,
6,8,1937,3,4.15506
7,8,4,3,4.068401
8,8,264,3,4.059791
9,8,65,5,4.264203


In [16]:
1-predictions.scores.isna().sum()/len(predictions)

0.7

In [24]:
test_ratings.index

Int64Index([    294,     295,     296,     326,     327,     328,     329,
                330,     331,     332,
            ...
            5976128, 5976166, 5976308, 5976325, 5976335, 5976348, 5976462,
            5976467, 5976468, 5976469],
           dtype='int64', length=746932)

In [31]:
heldout_ratings = train_test_split(test_ratings, stratify=test_ratings.user_id, test_size=0.1, random_state=42)

In [32]:
heldout_ratings[1]

Unnamed: 0,user_id,book_id,rating
1306545,19123,681,2
4140955,32747,88,3
1955432,26855,123,3
4958313,17494,363,4
498574,9328,13,5
...,...,...,...
3286456,39782,4722,4
2014564,26781,35,5
5152592,3993,1092,3
2420551,17019,5080,4
