In [36]:

import pandas as pd #data processing
import joblib #


from src.utils import load_config


CONFIG_DATA = load_config()

In [2]:
type(CONFIG_DATA)

dict

## Data Collection

In [3]:

#read dataset
rating = pd.read_csv(CONFIG_DATA['dataset_path'])

In [4]:
#check its head
rating.head()

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0


In [5]:
#check some stats
rating.describe()

Unnamed: 0,rating
count,89689.0
mean,4.036638
std,1.419791
min,1.0
25%,3.0
50%,5.0
75%,5.0
max,5.0


In [6]:
#check data type 
rating.dtypes

reviewerID     object
itemID         object
rating        float64
dtype: object

In [7]:
#wrap as function 

def load_data():
    """Read the dataset CSV located at ``CONFIG_DATA['dataset_path']``.

    Prints the shape and the per-column dtypes of the loaded frame as a
    quick sanity check.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_csv``.
    """
    dataset_path = CONFIG_DATA['dataset_path']
    loaded_frame = pd.read_csv(dataset_path)

    # Summarise what was loaded before handing it back to the caller.
    frame_shape = loaded_frame.shape
    column_types = loaded_frame.dtypes
    print('Data shape : ', frame_shape)
    print('Data Types : \n ', column_types)

    return loaded_frame
    
    
    

In [8]:
data = load_data()

Data shape :  (89689, 3)
Data Types : 
  reviewerID     object
itemID         object
rating        float64
dtype: object


In [9]:
data.shape

(89689, 3)

## Making Simple Recommendation

In [10]:
rating.reviewerID.nunique()

2428

In [11]:
rating.itemID.nunique()

72098

### How many items has each user rated?

In [12]:
rating

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
...,...,...,...
89684,B01HI8V1C2,AB5HCI50SF1TK,5.0
89685,B01HI8V1C2,A2X11NZMPI7M0T,1.0
89686,B01HI8V1C2,A3EGFFBRQVO7L7,5.0
89687,B01HIZSSQM,A3H23AVKFN62ZT,5.0


In [13]:
# One row per reviewerID with the number of itemID entries recorded for it.
# Despite its name, `avg_interaction` holds a count ('count' aggregation), not an average.
# NOTE(review): the values shown under `reviewerID` (e.g. B00005N7P0) look like
# product ASINs and those under `itemID` (e.g. AH2IFH762VY5U) look like reviewer
# IDs - the two column labels may be swapped in the source CSV; confirm.
number_of_interaction = (rating.groupby('reviewerID',as_index=False)
                            .agg(avg_interaction=pd.NamedAgg('itemID','count'))
                            )


In [14]:
number_of_interaction

Unnamed: 0,reviewerID,avg_interaction
0,B00005N7NQ,117
1,B00005N7O3,5
2,B00005N7O4,6
3,B00005N7O6,64
4,B00005N7O9,23
...,...,...
2423,B01HI8V1AE,4
2424,B01HI8V1C2,10
2425,B01HI8V1I6,24
2426,B01HI8V1MC,14


In [15]:
number_of_interaction.describe()

Unnamed: 0,avg_interaction
count,2428.0
mean,36.939456
std,117.218818
min,1.0
25%,2.0
50%,7.0
75%,21.0
max,1718.0


### Finding the most-rated items

In [16]:
# One row per itemID with the number of reviewerID entries recorded for it,
# ordered from the most-rated item to the least-rated one.
rated_count = (rating.groupby('itemID',as_index=False)
                     .agg(rating_count = pd.NamedAgg('reviewerID','count'))
                     .sort_values('rating_count',ascending=False))

In [17]:
rated_count

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
...,...,...
26064,A2D5KCTV2F6DWF,1
26067,A2D5UUBWHC4PP7,1
26068,A2D5XM37KOQ43F,1
26069,A2D5Y5Q2I8M7NU,1


### Giving Recommendation

One way to recommend is to surface the most-rated items.
However, being rated most often does not mean an item is rated highly by users.


In [18]:
rated_count

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
...,...,...
26064,A2D5KCTV2F6DWF,1
26067,A2D5UUBWHC4PP7,1
26068,A2D5XM37KOQ43F,1
26069,A2D5Y5Q2I8M7NU,1


Recommendations are usually presented as a `Top N Recommendation` list, such as `Top 10 Items`.

In [19]:
rated_count.head(10)

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
38444,A30H2335OM7RD6,22
14817,A1RPTVW5VEOSI,21
64002,AKMEY1BSHSDG7,21
69735,AVF9FV7AMRP5C,20
28160,A2H3JURQZOHVMB,20


In [20]:
def recommend_top_n(rating, n_items=10):
    """Return the ``n_items`` items that received the most ratings.

    Parameters
    ----------
    rating : pandas.DataFrame
        Frame containing at least the ``itemID`` and ``reviewerID`` columns.
    n_items : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``itemID`` and ``rating_count``, ordered by ``rating_count``
        from highest to lowest and truncated to ``n_items`` rows.
    """
    # Count the non-null reviewerID entries recorded for every item.
    per_item = rating.groupby('itemID', as_index=False)
    counts = per_item.agg(rating_count=pd.NamedAgg('reviewerID', 'count'))

    # Most-rated first, then keep only the requested number of rows.
    ranked = counts.sort_values('rating_count', ascending=False)
    return ranked.head(n_items)

In [21]:
recommend_top_n(rating=rating,
                n_items=30)

Unnamed: 0,itemID,rating_count
48609,A3JPFWKS83R49V,55
32315,A2OTUWUSH49XIN,26
60746,AEMZRE6QYVQBS,25
46846,A3GA09FYFKL4EY,24
52524,A3R7MXVQRGGIQ9,22
38444,A30H2335OM7RD6,22
14817,A1RPTVW5VEOSI,21
64002,AKMEY1BSHSDG7,21
69735,AVF9FV7AMRP5C,20
28160,A2H3JURQZOHVMB,20


In [22]:
rating.sample(30)

Unnamed: 0,reviewerID,itemID,rating
73561,B0161KW62M,A1ME31VEOPCTLO,5.0
29817,B000066T0E,A1K048AXSQV5BO,5.0
60379,B000W3MB5M,A145C2WRUSNZGD,5.0
69120,B00009MQ2F,AMWSZ61FILX2J,5.0
8661,B00005N7TL,A2KX12N9W5K3N9,5.0
77345,B00006KAZW,A1763227S34XWO,5.0
51557,B000ILUOEI,A23JE5G4UIA98N,5.0
2133,B00005N7OV,A1DH9KHAWD442F,5.0
52438,B000ILY9LW,A1NBXZKWV8UF7G,5.0
38305,B00007B10Y,A1L76UYQY7NDFW,5.0


## Creating Utility Matrix

### Filter out users with too few ratings

In [23]:
# Boolean mask over `number_of_interaction`: True where the interaction count is
# strictly below the configured minimum.
filter_minimum_interaction = number_of_interaction['avg_interaction'] < CONFIG_DATA['minimum_interaction']
# reviewerIDs selected by that mask, as a plain Python list.
under_minimum_reviewerID = number_of_interaction.loc[filter_minimum_interaction,'reviewerID'].tolist()

In [24]:
len(under_minimum_reviewerID)

659

In [25]:
rating.shape

(89689, 3)

In [26]:
# Keep only the rows whose reviewerID is NOT in the under-minimum list
# (a membership filter on the reviewerID column, not an index-based drop).
rating_adjusted = rating.loc[
    ~rating['reviewerID'].isin(under_minimum_reviewerID)
]
rating_adjusted

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
...,...,...,...
89682,B01HI8V1C2,A2XSSQH4VO5B1L,5.0
89683,B01HI8V1C2,A2K4AUMORPI9GZ,5.0
89684,B01HI8V1C2,AB5HCI50SF1TK,5.0
89685,B01HI8V1C2,A2X11NZMPI7M0T,1.0


In [27]:
def remove_min_interaction(rating_data,number_of_interaction,threshold=3) :
    """Drop the ratings of reviewers that have fewer than ``threshold`` interactions.

    Parameters
    ----------
    rating_data : pandas.DataFrame
        Ratings to filter; must contain a ``reviewerID`` column. Not modified.
    number_of_interaction : pandas.DataFrame
        One row per reviewer with columns ``reviewerID`` and
        ``avg_interaction`` (the interaction count used for the comparison).
    threshold : int, default 3
        Reviewers whose ``avg_interaction`` is strictly below this value are
        removed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``rating_data`` whose ``reviewerID``
        is not below the threshold. Reviewers that do not appear in
        ``number_of_interaction`` are kept.
    """
    below_threshold = number_of_interaction['avg_interaction'] < threshold
    under_threshold_reviewerID = number_of_interaction.loc[below_threshold, 'reviewerID']

    # Build the mask from the frame that was passed in, not from a notebook
    # global, so the function works on any ratings frame.
    keep_mask = ~rating_data['reviewerID'].isin(under_threshold_reviewerID)

    # Copy so that later edits to the result never touch the caller's frame.
    return rating_data.loc[keep_mask].copy()
    

Keep only the `reviewerID`s that have at least 3 interactions.

In [28]:
rating_data_filtered = remove_min_interaction(
    rating_data=rating,
    number_of_interaction=number_of_interaction,
    threshold=3
)
rating_data_filtered

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
...,...,...,...
89682,B01HI8V1C2,A2XSSQH4VO5B1L,5.0
89683,B01HI8V1C2,A2K4AUMORPI9GZ,5.0
89684,B01HI8V1C2,AB5HCI50SF1TK,5.0
89685,B01HI8V1C2,A2X11NZMPI7M0T,1.0


### Mapping `reviewerID` and `itemID` 


In [29]:
rating['reviewerID'].nunique()

2428

In [30]:
# Map each distinct reviewerID to an integer id and back.
# Ids are assigned in order of first appearance in `rating` and start at 1 (idx+1),
# so the value 0 is never used as an ordered id.
reviewer_id_to_ordered_id = {}
ordered_id_to_reviewer_id = {}
for idx,reviewer_id in enumerate(rating['reviewerID'].unique()) : 
    reviewer_id_to_ordered_id[reviewer_id] = idx+1
    ordered_id_to_reviewer_id[idx+1] = reviewer_id




In [31]:
# Map each distinct itemID to an integer id and back.
# Ids are assigned in order of first appearance in `rating` and start at 1 (idx+1),
# so the value 0 is never used as an ordered id.
item_id_to_ordered_id = {}
ordered_id_to_item_id = {}
for idx,item_id in enumerate(rating['itemID'].unique()) : 
    item_id_to_ordered_id[item_id] = idx+1
    ordered_id_to_item_id[idx+1] = item_id


In [32]:
help(joblib.dump)

Help on function dump in module joblib.numpy_pickle:

dump(value, filename, compress=0, protocol=None, cache_size=None)
    Persist an arbitrary Python object into one file.
    
    Read more in the :ref:`User Guide <persistence>`.
    
    Parameters
    ----------
    value: any Python object
        The object to store to disk.
    filename: str, pathlib.Path, or file object.
        The file object or path of the file in which it is to be stored.
        The compression method corresponding to one of the supported filename
        extensions ('.z', '.gz', '.bz2', '.xz' or '.lzma') will be used
        automatically.
    compress: int from 0 to 9 or bool or 2-tuple, optional
        Optional compression level for the data. 0 or False is no compression.
        Higher value means more compression, but also slower read and
        write times. Using a value of 3 is often a good compromise.
        See the notes for more details.
        If compress is True, the compression level used

In [33]:
# Persist the four lookup dictionaries (original id <-> ordered id, for reviewers
# and for items) to the paths given in the config.
joblib.dump(reviewer_id_to_ordered_id,CONFIG_DATA['reviewer_id_to_ordered_id_path'])
joblib.dump(ordered_id_to_reviewer_id,CONFIG_DATA['ordered_id_to_reviewer_id_path'])

joblib.dump(item_id_to_ordered_id,CONFIG_DATA['item_id_to_ordered_id_path'])
joblib.dump(ordered_id_to_item_id,CONFIG_DATA['ordered_id_to_item_id_path'])

['dataset/mapper/ordered_id_to_item_id.pkl']

In [34]:
# Work on a copy so the original string ids in `rating` stay intact.
# NOTE(review): this starts from the unfiltered `rating`; the frame produced by the
# minimum-interaction filter (`rating_data_filtered`) is not used in this cell -
# confirm whether the filter was meant to apply here.
mapped_reviewer_item_rating = rating.copy()

# Replace the string ids with their 1-based ordered integer ids.
mapped_reviewer_item_rating.reviewerID = mapped_reviewer_item_rating.reviewerID.map(reviewer_id_to_ordered_id)
mapped_reviewer_item_rating.itemID = mapped_reviewer_item_rating.itemID.map(item_id_to_ordered_id)


In [58]:
rating

Unnamed: 0,reviewerID,itemID,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0
...,...,...,...
89684,B01HI8V1C2,AB5HCI50SF1TK,5.0
89685,B01HI8V1C2,A2X11NZMPI7M0T,1.0
89686,B01HI8V1C2,A3EGFFBRQVO7L7,5.0
89687,B01HIZSSQM,A3H23AVKFN62ZT,5.0


In [57]:
mapped_reviewer_item_rating

Unnamed: 0,reviewerID,itemID,rating
0,1,1,5.0
1,1,2,5.0
2,2,3,3.0
3,2,4,5.0
4,1,5,5.0
...,...,...,...
89684,2427,72095,5.0
89685,2427,72096,1.0
89686,2427,58405,5.0
89687,2428,72097,5.0


### Utility Matrix

In [64]:
from scipy.sparse import coo_matrix,csr_matrix


from scipy.sparse import coo_matrix
"""
From Documentation 
Constructing a matrix using ijv format

row  = np.array([0, 3, 1, 0])
col  = np.array([0, 3, 1, 2])
data = np.array([4, 5, 7, 9])
coo_array((data, (row, col)), shape=(4, 4)).toarray()
array([[4, 0, 9, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])
"""

# Row indices = ordered reviewer ids, column indices = ordered item ids,
# stored values = the ratings themselves.
row = mapped_reviewer_item_rating.reviewerID.values
col = mapped_reviewer_item_rating.itemID.values
# NOTE(review): this rebinds `data`, which an earlier cell assigned to the
# DataFrame returned by load_data().
data = mapped_reviewer_item_rating.rating.values


# No explicit shape is passed, so the matrix dimensions come from the index arrays.
utility_matrix = coo_matrix((data,(row,col)))


In [85]:
print(utility_matrix)

  (1, 1)	5.0
  (1, 2)	5.0
  (2, 3)	3.0
  (2, 4)	5.0
  (1, 5)	5.0
  (1, 6)	3.0
  (1, 7)	5.0
  (1, 8)	5.0
  (1, 9)	4.0
  (1, 10)	4.0
  (1, 11)	5.0
  (1, 12)	2.0
  (1, 13)	5.0
  (1, 14)	5.0
  (1, 15)	4.0
  (1, 16)	5.0
  (1, 17)	5.0
  (1, 18)	5.0
  (1, 19)	3.0
  (1, 20)	5.0
  (1, 21)	2.0
  (1, 22)	5.0
  (1, 23)	5.0
  (1, 24)	5.0
  (1, 25)	4.0
  :	:
  (965, 66607)	5.0
  (965, 10606)	4.0
  (965, 39284)	5.0
  (965, 19267)	4.0
  (965, 72089)	5.0
  (965, 45144)	5.0
  (965, 4538)	3.0
  (965, 72090)	5.0
  (965, 46372)	3.0
  (2426, 72091)	4.0
  (2426, 3930)	5.0
  (2426, 8330)	5.0
  (2426, 60125)	5.0
  (2427, 72059)	3.0
  (2427, 72092)	3.0
  (2427, 71234)	3.0
  (2427, 17114)	5.0
  (2427, 72093)	5.0
  (2427, 60125)	5.0
  (2427, 72094)	5.0
  (2427, 72095)	5.0
  (2427, 72096)	1.0
  (2427, 58405)	5.0
  (2428, 72097)	5.0
  (2428, 72098)	5.0


In [86]:
def create_utility_matrix(utility_df,row_name,col_name,interaction_name,shape=None) :
    """Build a sparse COO utility matrix from a long-format interaction frame.

    Parameters
    ----------
    utility_df : pandas.DataFrame
        One row per interaction.
    row_name : str
        Column holding the integer row index of each interaction.
    col_name : str
        Column holding the integer column index of each interaction.
    interaction_name : str
        Column holding the value stored at ``(row, col)``.
    shape : tuple of (int, int), optional
        Explicit matrix dimensions. When omitted (the default) the shape is
        inferred from the largest row and column index, as before. Pass it
        when matrices built from different subsets must share dimensions.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with one stored element per row of ``utility_df``.
    """
    row = utility_df[row_name].to_numpy()
    col = utility_df[col_name].to_numpy()
    data = utility_df[interaction_name].to_numpy()

    # shape=None keeps the original behaviour (dimensions inferred from indices).
    utility_matrix = coo_matrix((data,(row,col)),shape=shape)

    return utility_matrix

In [87]:
utility_matrix = create_utility_matrix(
    utility_df=mapped_reviewer_item_rating, 
    row_name=CONFIG_DATA['user_column'],
    col_name=CONFIG_DATA['item_column'],
    interaction_name=CONFIG_DATA['interaction_column'],
    
)

<2429x72099 sparse matrix of type '<class 'numpy.float64'>'
	with 89689 stored elements in COOrdinate format>

In [74]:
from implicit.evaluation import train_test_split

# First cut: the training portion versus everything that is held out.
# The seed is read from the config (instead of a literal) so this cell stays
# consistent with the validation/test split and with data_splitting().
train_data,test_data = train_test_split(ratings=utility_matrix,
                 train_percentage=CONFIG_DATA['training_pct'],
                 random_state=CONFIG_DATA['seed'])

In [82]:
(utility_matrix.nnz)

89689

In [76]:
train_data.nnz

53176

In [75]:
# Second cut: split the held-out part from the previous cell into validation and test.
# NOTE(review): `test_data` is both the input and an output here, so re-running this
# cell on its own splits the already-reduced test set again.
val_data,test_data = train_test_split(ratings=test_data,
                 train_percentage=CONFIG_DATA['val_pct'],
                 random_state=CONFIG_DATA['seed'])

In [77]:
val_data.nnz

17788

In [78]:
test_data.nnz

18003

In [83]:
def data_splitting(utility_matrix) :
    """Split a utility matrix into train / validation / test parts and save them.

    The matrix is first split using ``CONFIG_DATA['training_pct']``; the
    held-out remainder is then split again using ``CONFIG_DATA['val_pct']``
    into validation and test parts. Both splits use ``CONFIG_DATA['seed']``.

    The number of stored elements of each part is printed, and each part is
    written with ``joblib.dump`` to the corresponding path from the config.

    Parameters
    ----------
    utility_matrix
        Sparse interaction matrix handed to ``train_test_split``.

    Returns
    -------
    None
    """
    # First cut: training portion versus everything held out.
    train_data, held_out = train_test_split(
        ratings=utility_matrix,
        train_percentage=CONFIG_DATA['training_pct'],
        random_state=CONFIG_DATA['seed'],
    )
    # Second cut: divide the held-out portion into validation and test.
    val_data, test_data = train_test_split(
        ratings=held_out,
        train_percentage=CONFIG_DATA['val_pct'],
        random_state=CONFIG_DATA['seed'],
    )

    # (message, matrix, config key of the output path) for every split.
    splits = (
        ('Train data length', train_data, 'train_utility_matrix_path'),
        ('Val data length', val_data, 'val_utility_matrix_path'),
        ('Test data length', test_data, 'test_utility_matrix_path'),
    )

    for message, matrix, _ in splits:
        print(message, matrix.nnz)

    # Persist every split only after all sizes have been reported.
    for _, matrix, path_key in splits:
        joblib.dump(matrix, CONFIG_DATA[path_key])
    

In [169]:
data_splitting(utility_matrix=utility_matrix)

Train data length 53176
Val data length 17788
Test data length 18003


## Saving Utility Dataframe 

In [38]:
mapped_reviewer_item_rating.to_csv(CONFIG_DATA['utility_dataframe_path'],index=False)