# Service Intelligence [Recommender Systems for Services]



# Factorization Machine (FM)


![screensh](https://drive.google.com/uc?export=view&id=1EQU3eXDI1GHVbEaYx9pN8FDpzDC1S9Oz)

<img src="https://drive.google.com/uc?export=view&id=1a7g5NudnRDJRc8vn4KXNYZDnsmFzBpsy" width="80%">

## Input data format for FM

- Combination of one-hot encoding vectors (and continous variables)
- Very sparse (most of the elements of matrix are zero)

<img src="https://drive.google.com/uc?export=view&id=1hpL4zcxGxBtXg_pM_BOfXozpBFaTErR4" width="80%">


# Data preparation (DP)

In [38]:
import numpy as np
import pandas as pd
from google.colab import files
import io

In [39]:
# Load interaction data (user_id, item_id) using pandas read_csv() function
## User A buy item a
## User B buy item c ...
### Output variable name : interaction_df
'''
file_uploaded = files.upload()
df = pd.read_csv(io.BytesIO(file_uploaded['interaction.csv']))
'''
interaction_df = pd.read_csv('interaction.csv')
interaction_df.head()

# The target variable is whether a transaction (interaction between a user and an item) occured or not (0 or 1)



Unnamed: 0,user_id,item_id
0,1369335,26615000
1,1369335,30626000
2,1369335,23917000
3,1369335,26563000
4,1369335,1294201000


In [40]:
# Load User information data (user_id, sex, age) using pandas read_csv() function
## User information
## User A is male and fifties.
## User B is female and forties.
### Output variable name : user_df
'''
file_uploaded = files.upload()
df = pd.read_csv(io.BytesIO(file_uploaded['users.csv']))
'''
user_df = pd.read_csv('users.csv')
user_df.head()



Unnamed: 0,user_id,sex,age
0,1369335,M,50
1,2965149,F,40
2,2107571,F,50
3,22182386,F,50
4,3082967,F,40


In [41]:
# Load Item information data (item_id, large_category) using pandas read_csv() function
## Item A belongs to the fish category
## Item B belongs to the vegetable category
### Output variable name : item_df
'''
file_uploaded = files.upload()
df = pd.read_csv(io.BytesIO(file_uploaded['items.csv']))
'''
item_df = pd.read_csv('items.csv')
item_df.head()



Unnamed: 0,item_id,large_category
0,26615000,fish
1,30626000,vegetable
2,23917000,vegetable
3,26563000,vegetable
4,1294201000,dried_seafood


In [42]:
# Check the number of users/items/interactions and print the results
# Hint : use pandas unique() and tolist() function
# Output variable name : num_user, num_item, num_interactions

### Write your code ###
num_user = user_df.user_id.nunique()
num_item = item_df.item_id.nunique()
num_interactions = len(interaction_df)
#######################

print("# of Users : {}".format(num_user))
print("# of Items : {}".format(num_item))
print("# of interactions : {}".format(num_interactions))

# of Users : 514
# of Items : 5196
# of interactions : 22354


## (DP1) Create dummy variables to represent categorical variables


### User information

In [43]:
# Create dummy variables of user information using pandas get_dummies() function (excluding user_id)
## Output variable name : user_features

### Write your code ###
user_features = pd.get_dummies(user_df, columns = ['sex','age'])
user_features.head()
#######################

user_features.head()

Unnamed: 0,user_id,sex_F,sex_M,age_20,age_30,age_40,age_50,age_60,age_70,age_80
0,1369335,0,1,0,0,0,1,0,0,0
1,2965149,1,0,0,0,1,0,0,0,0
2,2107571,1,0,0,0,0,1,0,0,0
3,22182386,1,0,0,0,0,1,0,0,0
4,3082967,1,0,0,0,1,0,0,0,0


### Item information

In [44]:
# Create dummy variables of item information using pandas get_dummies() function (excluding item_id)
## Output variable name : item_features

### Write your code  ###
item_features = pd.get_dummies(item_df, columns = ['large_category'])

#######################

item_features.head()

Unnamed: 0,item_id,large_category_H&B,large_category_baby_product,large_category_bathroom&cleaning,large_category_beef,large_category_beverage,large_category_chandlery,large_category_chicken,large_category_cracker,large_category_daily,...,large_category_prepared_food,large_category_seasoned_food,large_category_sports,large_category_sports_NB,large_category_stationery,large_category_tea,large_category_toy,large_category_underwear,large_category_vegetable,large_category_vehicle
0,26615000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30626000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,23917000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,26563000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1294201000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculation of the number of dimensions and matrix sparsity 

In [45]:
# Create datasets merged user/item/interaction dataset using pandas merge() function
## Output variable name : merge_df

### Write your code ###
merge_df = pd.merge(interaction_df.drop_duplicates(), user_df, how = 'left', on = 'user_id')
merge_df = pd.merge(merge_df, item_df, how = 'left', on = 'item_id')
#######################
merge_df

Unnamed: 0,user_id,item_id,sex,age,large_category
0,1369335,26615000,M,50,fish
1,1369335,30626000,M,50,vegetable
2,1369335,23917000,M,50,vegetable
3,1369335,26563000,M,50,vegetable
4,1369335,1294201000,M,50,dried_seafood
...,...,...,...,...,...
18172,28359329,1212830000,M,20,kitchenalia
18173,28359329,1328973000,M,20,cracker
18174,28359329,1464753000,M,20,vegetable
18175,1369876,30732000,F,50,fish


In [46]:
# Create dummy variables of merged datasets to represent whole categorical variables using pandas get_dummies() function
# Note: input_matrix is not used to train the FM model, this matrix is just used for calculating the number of dimensions and sparsity
### Output variable name : input_matrix
### Write your code ###
input_matrix = pd.get_dummies(merge_df, columns = merge_df.columns)

#######################
input_matrix

Unnamed: 0,user_id_35598,user_id_85886,user_id_95297,user_id_132034,user_id_157322,user_id_174847,user_id_212567,user_id_219871,user_id_235768,user_id_292508,...,large_category_prepared_food,large_category_seasoned_food,large_category_sports,large_category_sports_NB,large_category_stationery,large_category_tea,large_category_toy,large_category_underwear,large_category_vegetable,large_category_vehicle
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
18175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Print the shape of merged_datasets (variable name : input_matrix)

print("Shape of input matrix : {}".format(input_matrix.shape))

Shape of input matrix : (18177, 5753)


- The final data format is **wide** (the number of dimensions is large; 5753)
- This kind of datasets requires high computational cost during training the ML model
- This causes **'curse of dimensionality'** that makes model performance low

### FM's unique trick to reduce dimensions of interaction matrix


![screensh](https://drive.google.com/uc?export=view&id=1wnmcd1PeH0Le6U2gg6_nZVPslBALapxK)

- FM does not estimates W matrix (5753 \* 5753), but V matrix (5753 \* K) for the interaction term
- It allows for reducing the computational cost and memory

In [48]:
# Calculate sparisity of matrix using numpy count_nonzero() function
## sparsity == 1 means that the matrix only consists of zeros
## Output variable name : sparsity

### Write your code ###
from numpy import count_nonzero
sparsity = 1 - count_nonzero(input_matrix) / input_matrix.size
#######################

print("Sparsity : {}".format(round(sparsity,3)))

Sparsity : 0.999


- The matrix which consists of dummy variables has high sparsity.
- High sparsity occurs low performance.
- This is called **sparsity problem** in recommendation research domain

## (DP2) Split the data into Training / Test sets

In [49]:
# Split the Data into training and test sets in your own way
## Output variable name : train_interaction, test_interaction

### Write your code ###
from random import sample
train_interaction = interaction_df.sample(frac = 0.8, random_state = 2022)
test_interaction = interaction_df.drop(train_interaction.index)

#######################

train_interaction

Unnamed: 0,user_id,item_id
9619,4550913,1030385000
10621,3474487,30661000
22247,2829999,32855000
10233,22943234,1030382000
163,4280764,30983000
...,...,...
320,2526078,1106695000
16393,2926918,32463000
21926,1951571,1351154000
8335,10392785,4626000


In [50]:
# Reset the index of each dataframe using pandas reset_index() function
## Output variable name : train_interaction, test_interaction

### Write your code ###
train_interaction.reset_index(inplace = True, drop = True)
test_interaction.reset_index(inplace = True, drop = True)

#######################

train_interaction

Unnamed: 0,user_id,item_id
0,4550913,1030385000
1,3474487,30661000
2,2829999,32855000
3,22943234,1030382000
4,4280764,30983000
...,...,...
17878,2526078,1106695000
17879,2926918,32463000
17880,1951571,1351154000
17881,10392785,4626000


### Cold start issue
- Cold-start problem is very important issue in recommendation domain.
- Cold-start users : **new users that the model did not observse** during training step 
- Cold-start items : **new items that the model did not observse** during training step 
- For more detail about cold-start problem, please refer to the below links.
  - https://www.yusp.com/blog-posts/cold-start-problem/
  -  https://en.wikipedia.org/wiki/Cold_start_(recommender_systems)

In [51]:
# Identify the users in each interaction datasets
# Hint: Use pandas unique() function
## Output variable name : train_users_list, test_users_list

### Write your code ###
train_users_list = train_interaction.user_id.unique()
test_users_list = test_interaction.user_id.unique()
#######################

# Calculate the number of the cold start users who are in the training datasets but not in the test datasets.
cold_start_users = set(test_users_list) - set(train_users_list)
print("# of cold_start_users : {}".format(len(cold_start_users)))

# of cold_start_users : 0


In [52]:
# Identify the items in each interaction datasets
# Hint: Use pandas unique() function
## Output variable name : train_items_list, test_items_list

### Write your code ###
train_items_list = train_interaction.item_id.unique() 
test_items_list = test_interaction.item_id.unique()

#######################

# Calculate the number of the cold start items who contain in training datasets but not in test datasets.
cold_start_items = set(test_items_list) - set(train_items_list)
print("# of cold_start_items : {}".format(len(cold_start_items)))

# Cold start items cannot be derived from FM models because they are not observed during training.

# of cold_start_items : 548


In [53]:
# Create dummy variables of user and item information for training datasets
# Hint : pandas isin() function
## Output variable name : train_user_features, train_item_features

### Write your code ###
train_user_features = user_features[user_features.user_id.isin(train_users_list)]
train_item_features = item_features[item_features.item_id.isin(train_items_list)]

#######################


In [54]:
train_user_features

Unnamed: 0,user_id,sex_F,sex_M,age_20,age_30,age_40,age_50,age_60,age_70,age_80
0,1369335,0,1,0,0,0,1,0,0,0
1,2965149,1,0,0,0,1,0,0,0,0
2,2107571,1,0,0,0,0,1,0,0,0
3,22182386,1,0,0,0,0,1,0,0,0
4,3082967,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
509,10719804,1,0,0,1,0,0,0,0,0
510,9197345,1,0,0,0,1,0,0,0,0
511,1372018,0,1,0,0,0,1,0,0,0
512,1312671,1,0,0,0,0,1,0,0,0


In [55]:
train_item_features

Unnamed: 0,item_id,large_category_H&B,large_category_baby_product,large_category_bathroom&cleaning,large_category_beef,large_category_beverage,large_category_chandlery,large_category_chicken,large_category_cracker,large_category_daily,...,large_category_prepared_food,large_category_seasoned_food,large_category_sports,large_category_sports_NB,large_category_stationery,large_category_tea,large_category_toy,large_category_underwear,large_category_vegetable,large_category_vehicle
0,26615000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30626000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,23917000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,26563000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1294201000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,5760012001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,5760013001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5193,5762830005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5194,1123137000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# RankFM
#### Ref : https://rankfm.readthedocs.io/en/latest/home.html, https://github.com/etlundquist/rankfm

- RankFM has 9 hyper-parameters.
    - **factors** : the number of latent factors (>1)
    - **loss** : optimization/loss function to use for training: ['bpr', 'warp'] ('warp' recommended.)
       -  For more details about 'warp loss' and 'negative sampling', see https://medium.com/@gabrieltseng/intro-to-warp-loss-automatic-differentiation-and-pytorch-b6aa5083187a)
    - **max_samples** : Maximum number of negative samples to draw for WARP loss (>0)
    - **alpha** : L2 regularization penalty on [user, item] model weights (>0.0)
    - **beta** : L2 regularization penalty on [user-feature, item-feature] model weights (>0.0)
    - **sigma** : standard deviation to use for random initialization of factor weights (>0.0)
    - **learning_rate** : initial learning rate for gradient step updates (>0.0)
    - **learning_schedule** : schedule for adjusting learning rates by training epoch: ['constant', 'invscaling']
    - **learning_exponent** : exponent applied to epoch number to adjust learning rate (>0.0): scaling = 1 / pow(epoch + 1, learning_exponent) 

- You can change the value of each hyper-parameter to get the highly performed model.

## 1) Training step for FM model

In [56]:
pip install rankfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [57]:
import rankfm
from rankfm.rankfm import RankFM
# Call the RankFM function and Set the hyper-parameters in your own ways.
## Output variable name : model

### Write your code ###
model = RankFM(factors = 50, loss = 'warp', max_samples = 50, alpha = 0.01, beta = 0.1, sigma = 0.1, learning_rate = 0.1, learning_schedule = 'invscaling', learning_exponent = 0.25)
#######################


In [58]:
# Train the model using fit() function. you can choose the number of epochs.
# You must input interaction data, user_features, and item_features (for training datasets) into the fit() function.
%%time

### Write your code ###
model.fit(train_interaction, user_features = train_user_features, item_features = train_item_features, epochs = 30, verbose = True)
#######################


training epoch: 0
log likelihood: -10740.3095703125

training epoch: 1
log likelihood: -9804.490234375

training epoch: 2
log likelihood: -9500.3203125

training epoch: 3
log likelihood: -9260.080078125

training epoch: 4
log likelihood: -8994.5703125

training epoch: 5
log likelihood: -8664.7197265625

training epoch: 6
log likelihood: -8435.330078125

training epoch: 7
log likelihood: -8114.75

training epoch: 8
log likelihood: -7843.16015625

training epoch: 9
log likelihood: -7604.22998046875

training epoch: 10
log likelihood: -7335.830078125

training epoch: 11
log likelihood: -7162.10009765625

training epoch: 12
log likelihood: -6974.6201171875

training epoch: 13
log likelihood: -6840.68017578125

training epoch: 14
log likelihood: -6708.490234375

training epoch: 15
log likelihood: -6573.080078125

training epoch: 16
log likelihood: -6461.47021484375

training epoch: 17
log likelihood: -6358.3798828125

training epoch: 18
log likelihood: -6249.14013671875

training epoch: 19

In [60]:
# FM model is a kind of regression model.
# Therefore, it calculates the score of each interaction (user/item pair:[user_id, item_id]) (The larger score is, The higher probability is)
## Calculate the score each interaction in test interaction datasets using rankfm predict() function.
### Output variable name : test_scores

### Write your code ###
test_scores = model.predict(test_interaction, cold_start = 'nan')
#######################

test_scores

array([1.4026626, 1.1999476, 3.5191357, ..., 2.0579286, 2.030844 ,
       1.2133348], dtype=float32)

## 2) Generating TopK recommendation
-  Based on the score, TopK recommendations can be generated.
-  'recommend()' function in rankfm package provide each users' TopK recommended items in descending order
  -  The best recommended item is in column 0.

In [61]:
# Generate TopK recommendation for each users in test interaction data using rankfm recommend() function
## Output variable name : test_recommendation
TopK = 10

### Write your code ###
test_recommendation = model.recommend(test_interaction.user_id, n_items = TopK)

#######################

test_recommendation

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1369335,1348734000,10286000,1482397000,22005000,30982000,16446000,8750000,33828000,1233248000,1247441000
2965149,1030385000,1296301000,1374958000,1101576000,1030387000,1418533000,32537000,4626000,1091055000,1182128000
2965149,1030385000,1296301000,1374958000,1101576000,1030387000,1418533000,32537000,4626000,1091055000,1182128000
2965149,1030385000,1296301000,1374958000,1101576000,1030387000,1418533000,32537000,4626000,1091055000,1182128000
2965149,1030385000,1296301000,1374958000,1101576000,1030387000,1418533000,32537000,4626000,1091055000,1182128000
...,...,...,...,...,...,...,...,...,...,...
3802775,1263094000,1394279000,1290046000,1348110000,8750000,1086024000,1496185000,9214000,1127356000,1305193000
3802775,1263094000,1394279000,1290046000,1348110000,8750000,1086024000,1496185000,9214000,1127356000,1305193000
1098386,1473457000,1186327000,33379000,2888000,8750000,1226723000,30792000,31312000,1343872000,1108430000
2160403,1310895000,4201000,1130886000,5762830005,1341021000,1310939000,30528000,5725116002,1377991000,5742059001


In [116]:
### Write your code ###
user1_info = test_recommendation.iloc[0,:]
recommendations = []
for index in user1_info:
    recommendations.append(item_df[item_df.item_id == index].large_category.values[0])

recommendations_for_user1 = pd.DataFrame(user1_info)
recommendations_for_user1['Item Category'] = recommendations
recommendations_for_user1

Unnamed: 0,1369335,Item Category
0,1348734000,chicken
1,10286000,fish
2,1482397000,bathroom&cleaning
3,22005000,vegetable
4,30982000,vegetable
5,16446000,vegetable
6,8750000,vegetable
7,33828000,vegetable
8,1233248000,vegetable
9,1247441000,frozen_food


## 3) Evaluating FM model

- For more information about metrics for recommendation, please refer to https://towardsdatascience.com/ranking-evaluation-metrics-for-recommender-systems-263d0a66ef54

- For this practice, Hit ratio, reciprocal rank, discounted cumulative gain (DCG), precision, recall and F1 score are used to measure the performance of the FM model (The higher the better)

In [62]:
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall

In [63]:
# Evaluate the trained FM model using all metrics (Hit rate, reciprocal rank, DCG, precision, recall, F1 score) when K is equal to 10.

K = 10

Hit_rate = hit_rate(model, test_interaction, k=K)
Reciprocal_rank = reciprocal_rank(model, test_interaction, k=K)
Dcg = discounted_cumulative_gain(model, test_interaction, k=K)
Precision = precision(model, test_interaction, k=K)
Recall = recall(model, test_interaction, k=K)

print("*"*5 + " Performance of RankFM " + "*"*5)
print("Hit_ratio: {}".format(round(Hit_rate, 3)))
print("Reciprocal_rank: {}".format(round(Reciprocal_rank, 3)))
print("Dcg: {}".format(round(Dcg, 3)))
print("Precision: {}".format(round(Precision, 3)))
print("Recall: {}".format(round(Recall, 3)))
print("F1 score: {}".format(round((2*Recall*Precision)/(Recall+Precision),3)))

***** Performance of RankFM *****
Hit_ratio: 0.591
Reciprocal_rank: 0.321
Dcg: 0.525
Precision: 0.095
Recall: 0.129
F1 score: 0.11


## 4) Performance Comparison

- Here, we compare the trained RankFM model and baseline; POP
- POP is Popularity based recommendation model (very simple)


- **The evaluation result of your FM model must be better than the results of the baseline model; PoP, For this, you need to tune the hyper-parameters of the FM model appropriately.**


In [64]:
popular_items = train_interaction.groupby('item_id')['user_id'].count().sort_values(ascending=False)[:K]
popular_items

item_id
1030385000    245
1482397000    178
4626000       145
33379000      140
30661000      139
1108018000    134
8750000       122
32537000      105
22624000       88
32463000       84
Name: user_id, dtype: int64

In [65]:
test_user_items = test_interaction.groupby('user_id')['item_id'].apply(set).to_dict()
test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_interaction.user_id.unique())}

In [66]:
base_pre = np.mean([len(set(popular_items.index) & set(val)) / len(set(popular_items.index)) for key, val in test_user_items.items()])
base_rec = np.mean([len(set(popular_items.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()])

In [67]:
print("Performance Comparision\n")

print("*"*5 + " Performance of PoP " + "*"*5)
print("Precision: {:.3f}".format(base_pre))
print("Recall: {:.3f}".format(base_rec))
print("F1 score: {}\n".format(round((2*base_rec*base_pre)/(base_rec + base_pre),3)))

print("*"*5 + " Performance of RankFM " + "*"*5)
print("Precision: {}".format(round(Precision, 3)))
print("Recall: {}".format(round(Recall, 3)))
print("F1 score: {}".format(round((2*Recall*Precision)/(Recall+Precision),3)))

Performance Comparision

***** Performance of PoP *****
Precision: 0.064
Recall: 0.082
F1 score: 0.072

***** Performance of RankFM *****
Precision: 0.095
Recall: 0.129
F1 score: 0.11
