In [137]:
import pandas as pd
import joblib
from src.utils import load_config

CONFIG_DATA = load_config()

In [138]:
CONFIG_DATA

{'dataset_path': 'https://raw.githubusercontent.com/fakhrirobi/e-commerce_recommender_starter/master/dataset/Amazone_Magazine_Review.csv',
 'user_column': 'reviewerID',
 'item_column': 'itemID',
 'interaction_column': 'rating',
 'minimum_interaction': 3,
 'training_pct': 0.6,
 'val_pct': 0.5,
 'train_utility_matrix_path': 'dataset/processed/train_utility_matrix.pkl',
 'val_utility_matrix_path': 'dataset/processed/val_utility_matrix.pkl',
 'test_utility_matrix_path': 'dataset/processed/test_utility_matrix.pkl',
 'seed': 43,
 'reviewer_id_to_ordered_id_path': 'dataset/mapper/reviewer_id_to_ordered_id.pkl',
 'ordered_id_to_reviewer_id_path': 'dataset/mapper/ordered_id_to_reviewer_id.pkl',
 'item_id_to_ordered_id_path': 'dataset/mapper/item_id_to_ordered_id_path.pkl',
 'ordered_id_to_item_id_path': 'dataset/mapper/ordered_id_to_item_id.pkl'}

## Data Collection

In [139]:

#read dataset
rating = pd.read_csv(CONFIG_DATA['dataset_path'])

In [140]:
#check its head
rating.head()

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0


In [141]:
#check some stats
rating.describe()

Unnamed: 0,rating
count,89689.0
mean,4.036638
std,1.419791
min,1.0
25%,3.0
50%,5.0
75%,5.0
max,5.0


In [142]:
#check data type 
rating.dtypes

reviewerID     object
itemID         object
rating        float64
dtype: object

In [143]:
#wrap as function 

def load_data(path=None):
    """Read the rating dataset into a DataFrame and print a short summary.

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Source of the CSV file. When omitted, ``CONFIG_DATA['dataset_path']``
        is used, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The data exactly as parsed by ``pd.read_csv``.
    """
    # Resolve the default lazily so the function can also be pointed at a
    # local copy or an in-memory buffer.
    if path is None:
        path = CONFIG_DATA['dataset_path']

    data = pd.read_csv(path)

    # Quick sanity summary: number of rows/columns and the parsed dtypes.
    print('Data shape : ', data.shape)
    print('Data Types : \n ', data.dtypes)

    return data
    
    
    

In [144]:
data = load_data()

Data shape :  (89689, 3)
Data Types : 
  reviewerID     object
itemID         object
rating        float64
dtype: object


## Making Simple Recommendation

In [145]:
rating.reviewerID.nunique()

2428

In [146]:
rating.itemID.nunique()

72098

### How many items has each user rated?

In [147]:
# One row per reviewerID with the number of items that reviewer rated.
# Despite its name, `avg_interaction` holds a count, not an average.
number_of_interaction = rating.groupby('reviewerID', as_index=False).agg(
    avg_interaction=('itemID', 'count')
)

In [148]:
number_of_interaction.describe()

Unnamed: 0,avg_interaction
count,2428.0
mean,36.939456
std,117.218818
min,1.0
25%,2.0
50%,7.0
75%,21.0
max,1718.0


### Finding the most-rated items

In [149]:
# Number of ratings each item received, sorted so the most-rated items come first.
rated_count = rating.groupby('itemID', as_index=False).agg(
    rating_count=('reviewerID', 'count')
).sort_values(by='rating_count', ascending=False)

### Giving Recommendation

One way to make a recommendation is to suggest the most-rated items.
However, being rated most often does not mean an item is highly rated by users.


In [150]:
rated_count

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
...,...,...
26064,A2D5KCTV2F6DWF,1
26067,A2D5UUBWHC4PP7,1
26068,A2D5XM37KOQ43F,1
26069,A2D5Y5Q2I8M7NU,1


Recommendations are usually presented as a `Top N Recommendation` list, such as `Top 10 Items`.

In [151]:
rated_count.head(10)

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
38444,A30H2335OM7RD6,22
14817,A1RPTVW5VEOSI,21
64002,AKMEY1BSHSDG7,21
69735,AVF9FV7AMRP5C,20
28160,A2H3JURQZOHVMB,20


In [152]:
def recommend_top_n(rating, n_items=10):
    """Return the ``n_items`` items that received the most ratings.

    Parameters
    ----------
    rating : pandas.DataFrame
        Interaction table with at least ``itemID`` and ``reviewerID`` columns.
    n_items : int, default 10
        Number of rows to keep from the top of the ranking.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemID`` and ``rating_count``, ordered by ``rating_count``
        in descending order.
    """
    # Count the ratings per item.
    counts_per_item = rating.groupby('itemID', as_index=False).agg(
        rating_count=('reviewerID', 'count')
    )
    # Rank by popularity and keep the head of the list.
    ranked = counts_per_item.sort_values('rating_count', ascending=False)
    return ranked.head(n_items)

In [153]:
recommend_top_n(rating=rating,
                n_items=30)

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
38444,A30H2335OM7RD6,22
14817,A1RPTVW5VEOSI,21
64002,AKMEY1BSHSDG7,21
69735,AVF9FV7AMRP5C,20
28160,A2H3JURQZOHVMB,20


## Creating Utility Matrix

### Filter out users with too few ratings

In [154]:
# Boolean mask: reviewers whose interaction count is below the configured minimum.
filter_minimum_interaction = number_of_interaction['avg_interaction'].lt(
    CONFIG_DATA['minimum_interaction']
)
# Their reviewerIDs as a plain list, used below to exclude their ratings.
under_minimum_reviewerID = (
    number_of_interaction
    .loc[filter_minimum_interaction, 'reviewerID']
    .tolist()
)

In [155]:
# keep only the ratings whose reviewerID is not in the under-minimum list
rating_adjusted = rating[~rating['reviewerID'].isin(under_minimum_reviewerID)]
rating_adjusted

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
...,...,...,...
89682,B01HI8V1C2,A2XSSQH4VO5B1L,5.0
89683,B01HI8V1C2,A2K4AUMORPI9GZ,5.0
89684,B01HI8V1C2,AB5HCI50SF1TK,5.0
89685,B01HI8V1C2,A2X11NZMPI7M0T,1.0


In [156]:
def remove_min_interaction(rating_data, number_of_interaction, threshold=3):
    """Drop every rating written by a reviewer with fewer than ``threshold`` interactions.

    Parameters
    ----------
    rating_data : pandas.DataFrame
        Interaction table containing a ``reviewerID`` column.
    number_of_interaction : pandas.DataFrame
        One row per reviewer with columns ``reviewerID`` and
        ``avg_interaction`` (the number of interactions of that reviewer).
    threshold : int, default 3
        Minimum number of interactions a reviewer needs to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``rating_data`` whose reviewer meets the
        threshold; the input frame is not modified.
    """
    # Reviewers whose interaction count is strictly below the threshold.
    is_under_threshold = number_of_interaction['avg_interaction'] < threshold
    under_threshold_reviewerID = (
        number_of_interaction.loc[is_under_threshold, 'reviewerID'].tolist()
    )

    # Build the mask from the argument itself. The previous version used the
    # notebook-global `rating`, which ignored the data passed in and failed
    # with NameError outside the notebook.
    keep_mask = ~rating_data['reviewerID'].isin(under_threshold_reviewerID)
    rating_adjusted = rating_data.loc[keep_mask].copy()

    return rating_adjusted
    

Keep only the `reviewerID`s that have at least 3 interactions.

In [157]:
rating_data_filtered = remove_min_interaction(
    rating_data=rating,
    number_of_interaction=number_of_interaction,
    threshold=3
)
rating_data_filtered

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
...,...,...,...
89682,B01HI8V1C2,A2XSSQH4VO5B1L,5.0
89683,B01HI8V1C2,A2K4AUMORPI9GZ,5.0
89684,B01HI8V1C2,AB5HCI50SF1TK,5.0
89685,B01HI8V1C2,A2X11NZMPI7M0T,1.0


### Mapping `reviewerID` and `itemID` 


In [158]:
# Build both directions of the reviewer lookup: raw reviewerID <-> ordered id.
# Ordered ids start at 1 and follow the order of first appearance in `rating`.
reviewer_id_to_ordered_id = {}
ordered_id_to_reviewer_id = {}
for ordered_id, reviewer_id in enumerate(rating['reviewerID'].unique(), start=1):
    reviewer_id_to_ordered_id[reviewer_id] = ordered_id
    ordered_id_to_reviewer_id[ordered_id] = reviewer_id




In [159]:
# Build both directions of the item lookup: raw itemID <-> ordered id.
# Ordered ids start at 1 and follow the order of first appearance in `rating`.
item_id_to_ordered_id = {}
ordered_id_to_item_id = {}
for ordered_id, item_id in enumerate(rating['itemID'].unique(), start=1):
    item_id_to_ordered_id[item_id] = ordered_id
    ordered_id_to_item_id[ordered_id] = item_id


In [160]:
# Persist the reviewer lookup tables (raw id -> ordered id, and the reverse)
# at the paths given in the config.
# NOTE(review): presumably the target directories must already exist — confirm.
joblib.dump(reviewer_id_to_ordered_id,CONFIG_DATA['reviewer_id_to_ordered_id_path'])
joblib.dump(ordered_id_to_reviewer_id,CONFIG_DATA['ordered_id_to_reviewer_id_path'])

# Persist the item lookup tables; only this cell's last call is echoed as output.
joblib.dump(item_id_to_ordered_id,CONFIG_DATA['item_id_to_ordered_id_path'])
joblib.dump(ordered_id_to_item_id,CONFIG_DATA['ordered_id_to_item_id_path'])

['dataset/mapper/ordered_id_to_item_id.pkl']

In [161]:
# Replace the raw string ids with their ordered integer ids on a copy of `rating`.
# NOTE(review): this starts from the unfiltered `rating`; the frame produced by
# remove_min_interaction (`rating_data_filtered`) is never used — confirm intent.
mapped_reviewer_item_rating = rating.assign(
    reviewerID=rating['reviewerID'].map(reviewer_id_to_ordered_id),
    itemID=rating['itemID'].map(item_id_to_ordered_id),
)


In [162]:
mapped_reviewer_item_rating

Unnamed: 0,reviewerID,itemID,rating
0,1,1,5.0
1,1,2,5.0
2,2,3,3.0
3,2,4,5.0
4,1,5,5.0
...,...,...,...
89684,2427,72095,5.0
89685,2427,72096,1.0
89686,2427,58405,5.0
89687,2428,72097,5.0


### Utility Matrix

In [163]:
from scipy.sparse import coo_matrix,csr_matrix


from scipy.sparse import coo_matrix
"""
From Documentation 
Constructing a matrix using ijv format

row  = np.array([0, 3, 1, 0])
col  = np.array([0, 3, 1, 2])
data = np.array([4, 5, 7, 9])
coo_array((data, (row, col)), shape=(4, 4)).toarray()
array([[4, 0, 9, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])
"""
# Pull the three parallel arrays needed for the (data, (row, col)) constructor:
# ordered reviewer ids become row indices, ordered item ids become column indices.
row = mapped_reviewer_item_rating.reviewerID.values
col = mapped_reviewer_item_rating.itemID.values
# NOTE: this rebinds `data`, which earlier held the DataFrame returned by load_data().
data = mapped_reviewer_item_rating.rating.values


# No explicit shape is passed, so the dimensions are inferred from the indices.
# Ordered ids start at 1, so index 0 on both axes is never populated.
utility_matrix = coo_matrix((data,(row,col)))


In [164]:
def create_utility_matrix(utility_df, row_name, col_name, interaction_name, shape=None):
    """Build a sparse user-item utility matrix in COO format.

    Parameters
    ----------
    utility_df : pandas.DataFrame
        Interaction table whose row/column id columns already contain
        non-negative integer indices.
    row_name : str
        Column of ``utility_df`` used as row index.
    col_name : str
        Column of ``utility_df`` used as column index.
    interaction_name : str
        Column of ``utility_df`` holding the value stored in each cell.
    shape : tuple of int, optional
        Explicit ``(n_rows, n_cols)`` of the result. When omitted (the
        default), the shape is inferred from the largest index on each axis,
        exactly as before.

    Returns
    -------
    scipy.sparse.coo_matrix
    """
    row = utility_df[row_name].to_numpy()
    col = utility_df[col_name].to_numpy()
    data = utility_df[interaction_name].to_numpy()

    # ijv / triplet constructor: data[k] is placed at (row[k], col[k]).
    return coo_matrix((data, (row, col)), shape=shape)

In [165]:
utility_matrix = create_utility_matrix(
    utility_df=mapped_reviewer_item_rating, 
    row_name=CONFIG_DATA['user_column'],
    col_name=CONFIG_DATA['item_column'],
    interaction_name=CONFIG_DATA['interaction_column'],
    
)

In [166]:
from implicit.evaluation import train_test_split

# First split: training set vs. held-out remainder. `test_data` temporarily
# holds the whole remainder, which the next cell divides into validation and test.
# The seed is read from the config (previously a hard-coded 43) so that every
# split in the notebook follows a single setting.
train_data, test_data = train_test_split(
    ratings=utility_matrix,
    train_percentage=CONFIG_DATA['training_pct'],
    random_state=CONFIG_DATA['seed'],
)

In [167]:
# Second split: divide the held-out matrix from the previous cell into validation and test sets.
# NOTE(review): `test_data` is both the input and an output here, so re-running only this
# cell splits the already-reduced test set again — re-run the previous cell first.
val_data,test_data = train_test_split(ratings=test_data,
                 train_percentage=CONFIG_DATA['val_pct'],
                 random_state=CONFIG_DATA['seed'])

In [168]:
def data_splitting(utility_matrix):
    """Split a utility matrix into train/val/test parts and persist them.

    The matrix is first split using ``CONFIG_DATA['training_pct']`` as the
    train percentage; the remainder is split again using
    ``CONFIG_DATA['val_pct']``. Both splits use ``CONFIG_DATA['seed']``.
    The number of stored entries of each part is printed, then each part is
    dumped with joblib to its configured path. Nothing is returned.
    """
    seed = CONFIG_DATA['seed']

    # Stage 1: training set vs. everything held out.
    train_data, holdout_data = train_test_split(
        ratings=utility_matrix,
        train_percentage=CONFIG_DATA['training_pct'],
        random_state=seed,
    )
    # Stage 2: held-out part -> validation and test.
    val_data, test_data = train_test_split(
        ratings=holdout_data,
        train_percentage=CONFIG_DATA['val_pct'],
        random_state=seed,
    )

    splits = (
        ('Train', train_data, 'train_utility_matrix_path'),
        ('Val', val_data, 'val_utility_matrix_path'),
        ('Test', test_data, 'test_utility_matrix_path'),
    )

    # Report the sizes first, then write the files (same order as before).
    for label, matrix, _ in splits:
        print(label + ' data length', matrix.nnz)

    #dump utility matrix
    for _, matrix, path_key in splits:
        joblib.dump(matrix, CONFIG_DATA[path_key])
    

In [169]:
data_splitting(utility_matrix=utility_matrix)

Train data length 53176
Val data length 17788
Test data length 18003
