In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

### Setup

In [None]:

import pandas as pd
import os
from os.path import exists
import zipfile
import numpy as np

In [None]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!gdown https://drive.google.com/uc?id=1kl65YOvoSAMSgszQQbua2q4Zwe1HgPct

Downloading...
From: https://drive.google.com/uc?id=1kl65YOvoSAMSgszQQbua2q4Zwe1HgPct
To: /content/data.zip
  0% 0.00/4.16M [00:00<?, ?B/s]100% 4.16M/4.16M [00:00<00:00, 248MB/s]


In [None]:
!unzip -o "data.zip"  -d  "/content"

Archive:  data.zip
  inflating: /content/amazon_min.csv  
  inflating: /content/book_min.csv   
  inflating: /content/ciao_min.csv   
  inflating: /content/ecom_min.csv   
  inflating: /content/food_min.csv   
  inflating: /content/movies_min.csv  


In [None]:
# Random seed constant for the notebook (assigned again, with the same value,
# in the SAR configuration cell, where it is the default `seed` of SAR_Algo.setup).
SEED=42

In [None]:

# Load the six interaction datasets extracted from data.zip.
# Paths are relative, so the CSV files must be in the current working directory.
movie_10k_df = pd.read_csv("movies_min.csv")
amazon_df = pd.read_csv("amazon_min.csv")
book_df = pd.read_csv("book_min.csv")
food_df = pd.read_csv("food_min.csv")
ecommerce_df = pd.read_csv("ecom_min.csv")
ciao_df = pd.read_csv("ciao_min.csv")

# Sparsity 

In [None]:
from scipy.sparse import csr_matrix
def checkSparsity(main_df, col_user="userID", col_item="itemID"):
    """Return the sparsity of the user-item interaction matrix implied by ``main_df``.

    Sparsity is the fraction of (user, item) cells with no interaction:
    ``(n_users * n_items - n_observed_pairs) / (n_users * n_items)``.

    Args:
        main_df: DataFrame with one row per interaction.
        col_user: Name of the user-id column (defaults to ``"userID"``).
        col_item: Name of the item-id column (defaults to ``"itemID"``).

    Returns:
        float: Sparsity in ``[0, 1]``; ``nan`` when the frame has no users or
        no items, because the matrix has no cells to measure.
    """
    total_users = main_df[col_user].nunique(dropna=False)
    total_items = main_df[col_item].nunique(dropna=False)
    total_cells = total_users * total_items
    if total_cells == 0:
        # Empty input: avoid dividing by zero.
        return float("nan")
    # Repeated (user, item) rows fill the same matrix cell, so count each pair once.
    observed_cells = main_df[[col_user, col_item]].drop_duplicates().shape[0]
    return (total_cells - observed_cells) / total_cells

In [None]:

# Sparsity of the food dataset's user-item matrix; the bare name displays the value.
sparsity_food=checkSparsity(food_df)
sparsity_food

0.9758585086369416

In [None]:
# Sparsity of the Amazon dataset's user-item matrix; the bare name displays the value.
sparsity_amazon=checkSparsity(amazon_df)
sparsity_amazon

0.9919133172746379

In [None]:

# Sparsity of the movie dataset's user-item matrix; the bare name displays the value.
sparsity_movielens=checkSparsity(movie_10k_df)
sparsity_movielens

0.900792867888234

In [None]:
# Sparsity of the e-commerce dataset's user-item matrix; the bare name displays the value.
sparsity_ecom=checkSparsity(ecommerce_df)
sparsity_ecom

0.9994459394919134

<!-- Observations :
1. The datasets are very sparse (sparsity ≈ 0.98).
2. Many items and users in the dataset have little interaction data. -->

<!-- ### Divide dataset based on following:
- Users who rated very few items: we will use a content-based filtering method. To get the threshold value for filtering, we first find the median of the number of ratings given by each user. Next, we choose a value less than that median such that we decrease sparsity as much as possible (to less than 0.95).
- For other data we will go with collaborative filtering or other deep learning methods -->

# Simple Algorithm for Recommendation (SAR)

This is a simple recommender algorithm. We run it as the first step if the dataset provides the following attributes: itemID, userID, rating, timestamp.

In [None]:
!pip install recommenders

In [None]:
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.sar import SAR


In [None]:
# Column names and defaults shared by the SAR cells below; they are the default
# arguments of SAR_Algo.setup and are used to select columns from each dataset.
USER="userID"
ITEM="itemID"
RATING="rating"
TIMESTAMP="time"
# Name of the score column passed to SAR and to the ranking metrics.
PREDICTION="prediction"
# Number of recommendations per user, also used as `k` in the metrics.
TOP_K=20
# Seed for the stratified train/test split.
SEED=42 

In [None]:
class SAR_Algo:
  """Small wrapper around the ``recommenders`` SAR model: split, fit, recommend, evaluate.

  Intended call order: ``setup(...)`` -> ``trainmodel()`` -> ``prediction()`` ->
  ``evaluatemodel()``.
  """

  def setup(self,data,col_user=USER,col_item=ITEM,col_rating=RATING,col_time=TIMESTAMP,col_prediction=PREDICTION,ratio=0.8,top_k=TOP_K,seed=SEED):
    """Store the configuration, split ``data`` into train/test and build the SAR model.

    Args:
      data: Interaction DataFrame containing the user, item, rating and time columns.
      col_user, col_item, col_rating, col_time, col_prediction: Column names
        forwarded to SAR and to the evaluation metrics.
      ratio: Split ratio passed to ``python_stratified_split``.
      top_k: Number of items to recommend per user; also ``k`` for the metrics.
      seed: Random seed for the stratified split.
    """
    self.data=data
    self.col_user=col_user
    self.col_item=col_item
    self.col_rating=col_rating
    self.col_time=col_time
    self.col_prediction=col_prediction
    self.ratio=ratio
    self.top_k=top_k
    self.header={
        "col_user": col_user,
        "col_item": col_item,
        "col_rating": col_rating,
        "col_timestamp": col_time,
        "col_prediction": col_prediction
        }
    # Split with the caller-supplied ratio rather than a fixed value.
    self.train, self.test = python_stratified_split(
        self.data,
        ratio=ratio,
        col_user=col_user,
        col_item=col_item,
        seed=seed)
    # NOTE(review): per the recommenders docs, time_decay_coefficient is a
    # half-life in days applied to timestamps in seconds - confirm the units
    # of the time column fed in by the callers.
    self.model= SAR(
        similarity_type="jaccard",
        time_decay_coefficient=35,
        time_now=None,
        timedecay_formula=True,
        **self.header
        )
    # Forget recommendations cached from an earlier setup/prediction cycle.
    self.top_k_val=None
    self.top_k_with_titles=None

  def trainmodel(self):
    """Fit the SAR model on the training split."""
    self.model.fit(self.train)

  def prediction(self):
    """Recommend ``top_k`` unseen items for the users of the test split.

    Returns:
      The recommendations sorted by user and prediction score, both descending.
      The raw recommendations are also kept in ``self.top_k_val``.
    """
    self.top_k_val = self.model.recommend_k_items(self.test, top_k=self.top_k, remove_seen=True)
    # The right-hand frame has only the item id as its index (no extra columns),
    # so this inner join keeps rows whose item occurs in the original data.
    item_index = self.data[[self.col_item]].drop_duplicates().set_index(self.col_item)
    self.top_k_with_titles = (
        self.top_k_val
        .join(item_index, on=self.col_item, how='inner')
        .sort_values(by=[self.col_user, self.col_prediction], ascending=False))

    return self.top_k_with_titles

  def evaluatemodel(self):
    """Compute and print MAP, NDCG, Precision@K and Recall@K on the test split.

    Runs ``prediction()`` first when no recommendations are cached yet.

    Returns:
      Tuple ``(eval_map, eval_precision, eval_recall, top_k_val)``.
    """
    if getattr(self, "top_k_val", None) is None:
      self.prediction()

    args = [self.test, self.top_k_val]
    kwargs = dict(col_user=self.col_user,
                  col_item=self.col_item,
                  col_rating=self.col_rating,
                  col_prediction=self.col_prediction,
                  relevancy_method='top_k',
                  k=self.top_k)

    eval_map = map_at_k(*args, **kwargs)
    eval_ndcg = ndcg_at_k(*args, **kwargs)
    eval_precision = precision_at_k(*args, **kwargs)
    eval_recall = recall_at_k(*args, **kwargs)

    # Report the k that was actually used for the metrics.
    print("Model:",
      f"Top K:\t\t {self.top_k}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')
    return eval_map,eval_precision,eval_recall, self.top_k_val

In [None]:
def startSAR(data_df):
  """Run the full SAR pipeline on ``data_df`` and time the training step.

  Args:
    data_df: Interaction DataFrame with the USER, ITEM, RATING and TIMESTAMP columns.

  Returns:
    Tuple ``(eval_map, eval_precision, eval_recall, duration, top_k_val, sar)``,
    where ``duration`` is the training time in seconds and ``sar`` is the
    configured and fitted ``SAR_Algo`` instance.
  """
  # Local import: the notebook only imports `time` in a later cell, so the
  # function must not depend on that cell having been run.
  import time

  sar=SAR_Algo()
  sar.setup(data=data_df,col_user=USER,col_item=ITEM,col_rating=RATING,col_time=TIMESTAMP,col_prediction=PREDICTION,ratio=0.8,top_k=TOP_K,seed=SEED)
  # perf_counter is monotonic, so the measured duration is immune to clock adjustments.
  start = time.perf_counter()
  sar.trainmodel()
  duration = time.perf_counter() - start
  # Populates sar.top_k_val, which evaluatemodel() scores against the test split.
  sar.prediction()
  eval_map,eval_precision,eval_recall, top_k_val=sar.evaluatemodel()
  return eval_map,eval_precision,eval_recall,duration,top_k_val,sar

In [None]:
import time
# struct_time for second 0, i.e. the platform's epoch, and its readable string form.
# Neither `obj` nor `epoch` is read again in the cells visible here.
obj = time.gmtime(0)
epoch = time.asctime(obj)
# Current time in whole milliseconds since the epoch; used below as a constant
# timestamp for datasets that get a synthetic time column.
# NOTE(review): SAR's time decay appears to expect seconds - confirm the unit.
curr_time = round(time.time()*1000)

In [None]:
# Give every movie-dataset row the same timestamp (curr_time), overwriting any existing column.
movie_10k_df[TIMESTAMP]=curr_time

In [None]:
# Replace every rating with the constant 1 so all interactions carry the same rating value.
movie_10k_df[RATING]=1

In [None]:
import time
# Placeholders; not read again in the cells visible here.
sar_eval_map=0
sar_eval_precision=0
sar_eval_recall=0
sar_top_k_data=None

    
# Keep only the columns SAR needs, then run split/train/recommend/evaluate on the movie data.
data_df=movie_10k_df[[USER,ITEM,RATING,TIMESTAMP]]
    
eval_map,eval_precision,eval_recall,duration,top_k_val,model=startSAR(data_df)
   

Model:
Top K:		 20
MAP:		 0.140727
NDCG:		 0.341842
Precision@K:	 0.223595
Recall@K:	 0.288000


In [None]:
# Keep only the user/item/rating columns and append a constant timestamp column
# (curr_time), since SAR is configured with a time column.
book_df = book_df[[USER, ITEM, RATING]].assign(**{TIMESTAMP: curr_time})

In [None]:
import time
# Placeholders; not read again in the cells visible here.
sar_eval_map=0
sar_eval_precision=0
sar_eval_recall=0
sar_top_k_data=None

    
# SAR run on the book data with its original rating values.
data_df=book_df[[USER,ITEM,RATING,TIMESTAMP]]
    
eval_map,eval_precision,eval_recall,duration,top_k_val,model=startSAR(data_df)
   

Model:
Top K:		 20
MAP:		 0.045967
NDCG:		 0.089472
Precision@K:	 0.028872
Recall@K:	 0.104402


In [None]:
# Replace every book rating with the constant 1 before re-running SAR.
book_df[RATING]=1

In [None]:
import time
# Placeholders; not read again in the cells visible here.
sar_eval_map=0
sar_eval_precision=0
sar_eval_recall=0
sar_top_k_data=None

    
# SAR run on the book data after all ratings were set to 1.
data_df=book_df[[USER,ITEM,RATING,TIMESTAMP]]
    
eval_map,eval_precision,eval_recall,duration,top_k_val,model=startSAR(data_df)
   

Model:
Top K:		 20
MAP:		 0.045566
NDCG:		 0.088526
Precision@K:	 0.028338
Recall@K:	 0.102257


In [None]:
import time
# Placeholders; not read again in the cells visible here.
sar_eval_map=0
sar_eval_precision=0
sar_eval_recall=0
sar_top_k_data=None

    
# SAR run on the Amazon data with its original rating values.
# Presumably amazon_min.csv already has a "time" column, since none is added here - verify.
data_df=amazon_df[[USER,ITEM,RATING,TIMESTAMP]]
    
eval_map,eval_precision,eval_recall,duration,top_k_val,model=startSAR(data_df)

Model:
Top K:		 20
MAP:		 0.009112
NDCG:		 0.022752
Precision@K:	 0.008576
Recall@K:	 0.035393


In [None]:
# Replace every Amazon rating with the constant 1 before re-running SAR.
amazon_df[RATING]=1

In [None]:
import time
# Placeholders; not read again in the cells visible here.
sar_eval_map=0
sar_eval_precision=0
sar_eval_recall=0
sar_top_k_data=None

    
# SAR run on the Amazon data after all ratings were set to 1.
data_df=amazon_df[[USER,ITEM,RATING,TIMESTAMP]]
    
eval_map,eval_precision,eval_recall,duration,top_k_val,model=startSAR(data_df)

Model:
Top K:		 20
MAP:		 0.008725
NDCG:		 0.022287
Precision@K:	 0.008439
Recall@K:	 0.035509
