In [1]:
import numpy as np
import pandas as pd

### Preprocess data

In [2]:
path = "../data/petdata_1000_100.csv"
raw_data = pd.read_csv(path, index_col="doc_uri")
assert raw_data.shape == (1000,100), "Import error, df has false shape"

Conversion and cleaning

In [3]:
data = raw_data.unstack().to_frame().reset_index()
data.columns = ["user", "doc_uri", "rating"]

data.fillna(0, inplace=True)

print("Shape:", data.shape)
data.head()

Shape: (100000, 3)


Unnamed: 0,user,doc_uri,rating
0,Aaron Keith III,http://www.vargas.biz/login.php,1.0
1,Aaron Keith III,http://wallace-walker.info/index/,1.0
2,Aaron Keith III,http://www.jimenez.biz/,3.0
3,Aaron Keith III,http://www.logan.com/about.html,0.0
4,Aaron Keith III,http://cox.org/list/tag/faq.html,5.0


In [4]:
combinations_to_predict = [("Aaron Keith III", "http://gregory.com/"),
                           ("Abigail Wong", "http://hicks.com/"),
                           ("Julie Bullock", "https://www.garcia.com/"),
                           ("Victoria Perez", "http://lee-phillips.org/register/")]

# Recommendation Engines

## Memory-based Collaborative Filtering

### Item-based CF

In [5]:
#FIXME

## Matrix Factorization-based Collaborative Filtering

In [6]:
from surprise import SVD, Dataset, Reader, NMF
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
ds = Dataset.load_from_df(data[["user", "doc_uri", "rating"]], reader)

### Model Evaluation
Don't expect accurate models <- they are trained with random noise

In [7]:
# Models, tune parameters, if you'd like ;)
svd = SVD() # Singular Value Decomposition
nmf = NMF() # Non-negative Matrix factorization

for algo in [svd, nmf]:
    cross_validate(algo, ds, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9180  1.9148  1.9169  1.9173  1.9096  1.9153  0.0031  
MAE (testset)     1.6461  1.6436  1.6414  1.6399  1.6330  1.6408  0.0045  
Fit time          5.62    6.64    5.24    6.26    6.20    5.99    0.50    
Test time         0.15    0.15    0.14    0.15    0.14    0.15    0.00    
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8316  1.8258  1.8210  1.8296  1.8278  1.8272  0.0036  
MAE (testset)     1.5750  1.5721  1.5662  1.5768  1.5727  1.5726  0.0036  
Fit time          6.35    6.81    5.17    6.28    4.88    5.90    0.74    
Test time         0.11    0.14    0.11    0.12    0.20    0.14    0.03    


### Final model and predictions

In [8]:
# Train final model
trainset = ds.build_full_trainset()
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f93f4ad17f0>

In [9]:
# Predictions
for combination in combinations_to_predict:
    user = combination[0]
    doc = combination[1]
    pred = nmf.predict(user, doc)
    print(pred[0], "should rate", pred[1], "with", int(round(pred[3])), "stars")

Aaron Keith III should rate http://gregory.com/ with 2 stars
Abigail Wong should rate http://hicks.com/ with 1 stars
Julie Bullock should rate https://www.garcia.com/ with 1 stars
Victoria Perez should rate http://lee-phillips.org/register/ with 2 stars
