This notebook uses the package rankfm found at: https://github.com/etlundquist/rankfm

The following article is also used as a reference: https://towardsdatascience.com/factorization-machines-for-item-recommendation-with-implicit-feedback-data-5655a7c749db

This notebook trains and tests the model using the training, validation and test sets created in the notebook `Implicit_Rating_Calculation_category.ipynb`.

In [1]:
! pip install rankfm

Collecting rankfm
  Downloading rankfm-0.2.5.tar.gz (145 kB)
[K     |████████████████████████████████| 145 kB 6.5 MB/s eta 0:00:01
Building wheels for collected packages: rankfm
  Building wheel for rankfm (setup.py) ... [?25ldone
[?25h  Created wheel for rankfm: filename=rankfm-0.2.5-cp36-cp36m-linux_x86_64.whl size=399145 sha256=970384e1d9a80f1e1f8625c26d9e2c7d8783dd2091b9e984cd68801d0d234229
  Stored in directory: /home/ec2-user/.cache/pip/wheels/44/3a/5e/8a30a8716f170f3920c0e433a7de48690fa5a1854027ad3a4b
Successfully built rankfm
Installing collected packages: rankfm
Successfully installed rankfm-0.2.5


In [2]:
# Loading needed libraries
import numpy as np
import pandas as pd
import datetime as dt
from datetime import date
import gc

from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

# boto3 handles for S3: `client` is the low-level service client and
# `resource` the higher-level, object-oriented interface.
client, resource = boto3.client('s3'), boto3.resource('s3')

#### Data preparation

In [3]:
# Load the train and test splits of the implicit category-rating data (with
# timestamp) from S3. Only these two files are read in this cell.
train_df = pd.read_csv(
    's3://myaws-capstone-bucket/data/modeling/input/T_implicit_cat_rating_train.csv'
)
test_df = pd.read_csv(
    's3://myaws-capstone-bucket/data/modeling/input/T_implicit_cat_rating_test.csv'
)

In [4]:
# Cast the category and user id columns of the training split to int64,
# the id format used for the model input below.
train_df = train_df.astype({'catID': "int64", 'user_id': "int64"})

In [5]:
# Cast the category and user id columns of the test split to int64,
# mirroring the treatment of the training split.
test_df = test_df.astype({'catID': "int64", 'user_id': "int64"})

#### Model Data Analysis and Training 

In [6]:
# The interaction data passed to RankFM has two columns: a user id and an item
# id. Here the item is the category, so keep only user_id and catID per split.
interactions_train = train_df.loc[:, ['user_id', 'catID']]
interactions_test = test_df.loc[:, ['user_id', 'catID']]

In [7]:
# Sorted unique user ids per split; users that appear only in the test split
# are the cold-start users.
train_users = np.sort(interactions_train['user_id'].unique())
test_users = np.sort(interactions_test['user_id'].unique())
cold_start_users = set(test_users).difference(train_users)

# Same bookkeeping for the items (category ids).
train_items = np.sort(interactions_train['catID'].unique())
test_items = np.sort(interactions_test['catID'].unique())
cold_start_items = set(test_items).difference(train_items)

In [8]:
# Configure a RankFM model with WARP loss and fit it on the training
# interactions for 20 epochs, printing the log likelihood after each epoch.
# NOTE(review): no random seed is set anywhere in this notebook, so the
# training trace may differ between runs — confirm how RankFM seeds its
# initialisation/sampling before relying on exact reproducibility.
model = RankFM(
    factors=20,
    loss='warp',
    max_samples=20,
    alpha=0.01,
    sigma=0.1,
    learning_rate=0.1,
    learning_schedule='invscaling',
)
model.fit(interactions_train, epochs=20, verbose=True)


training epoch: 0
log likelihood: -903029.9375

training epoch: 1
log likelihood: -914999.8125

training epoch: 2
log likelihood: -910992.875

training epoch: 3
log likelihood: -907428.375

training epoch: 4
log likelihood: -901442.75

training epoch: 5
log likelihood: -895959.125

training epoch: 6
log likelihood: -888654.0625

training epoch: 7
log likelihood: -882423.5625

training epoch: 8
log likelihood: -873665.0

training epoch: 9
log likelihood: -866238.6875

training epoch: 10
log likelihood: -856602.8125

training epoch: 11
log likelihood: -848097.25

training epoch: 12
log likelihood: -839488.4375

training epoch: 13
log likelihood: -831698.375

training epoch: 14
log likelihood: -824022.875

training epoch: 15
log likelihood: -815688.0

training epoch: 16
log likelihood: -808081.25

training epoch: 17
log likelihood: -799818.3125

training epoch: 18
log likelihood: -792022.0625

training epoch: 19
log likelihood: -784786.875


In [9]:
# Generate the top-10 recommended categories for every user in the test split.
# Previously seen items are filtered out (filter_previous=True) and users the
# model cannot score are dropped from the result (cold_start='drop').
test_recs = model.recommend(
    test_users,
    n_items=10,
    filter_previous=True,
    cold_start='drop',
)

In [10]:
# The recommendation frame is indexed by user id; move that index into a
# regular column (named 'index' by pandas because the index is unnamed).
# The guard makes the cell idempotent: without it, re-running the cell would
# call reset_index a second time and insert a spurious extra column.
if {'index', 'user_id'}.isdisjoint(test_recs.columns):
    test_recs = test_recs.reset_index()
test_recs.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9
0,128968633,668.0,107.0,605.0,714.0,84.0,725.0,76.0,662.0,743.0,699.0
1,145611266,668.0,107.0,605.0,84.0,714.0,725.0,743.0,729.0,76.0,662.0
2,200985178,107.0,605.0,84.0,714.0,725.0,76.0,662.0,743.0,751.0,657.0
3,221480173,107.0,605.0,84.0,714.0,725.0,76.0,743.0,662.0,751.0,200.0
4,237973968,668.0,107.0,605.0,84.0,714.0,725.0,76.0,662.0,743.0,375.0


In [11]:
# Give the id column produced by reset_index its proper name. Renaming by
# label (rather than assigning a fixed 11-element list of column names) leaves
# the rank columns untouched, does not depend on the number of recommended
# items, and is safe to re-run.
test_recs = test_recs.rename(columns={'index': 'user_id'})

In [12]:
# Persist the wide-format recommendations (one row per user, one column per
# rank position) to S3, without the row index.
test_recs.to_csv(
    's3://myaws-capstone-bucket/data/modeling/output/rankFM_category.csv',
    index=False,
)

In [13]:
# Reshape the recommendations from wide (one column per rank) to long format:
# one row per (user_id, predicted_Rank) pair, with the recommended category id
# in predicted_recs cast to int64.
recs = (
    test_recs
    .melt(id_vars=["user_id"], var_name="predicted_Rank", value_name="predicted_recs")
    .astype({"predicted_recs": "int64"})
)
recs.head()

Unnamed: 0,user_id,predicted_Rank,predicted_recs
0,128968633,0,668
1,145611266,0,668
2,200985178,0,107
3,221480173,0,107
4,237973968,0,668


In [14]:
# Persist the long-format recommendations (one row per user and rank) to S3,
# without the row index.
recs.to_csv(
    's3://myaws-capstone-bucket/data/modeling/output/rankFM_T_category.csv',
    index=False,
)

In [15]:
# Number of distinct values per column of the long-format table: users,
# rank positions and recommended categories.
recs.nunique(axis=0)

user_id           473469
predicted_Rank        10
predicted_recs       474
dtype: int64