In [1]:
import numpy as np
import pandas as pd

### Preprocess data

In [2]:
path = "../data/petdata_1000_100.csv"
raw_data = pd.read_csv(path, index_col="doc_uri")
assert raw_data.shape == (1000,100), "Import error, df has false shape"

Conversion and cleaning

In [3]:
data = raw_data.unstack().to_frame().reset_index()
data.columns = ["user", "doc_uri", "rating"]

data.fillna(0, inplace=True)

print("Shape:", data.shape)
data.head()

Shape: (100000, 3)


Unnamed: 0,user,doc_uri,rating
0,Aaron Keith III,http://www.vargas.biz/login.php,1.0
1,Aaron Keith III,http://wallace-walker.info/index/,1.0
2,Aaron Keith III,http://www.jimenez.biz/,3.0
3,Aaron Keith III,http://www.logan.com/about.html,0.0
4,Aaron Keith III,http://cox.org/list/tag/faq.html,5.0


# Recommendation Engines

## Model-based Collaborative Filtering

In [5]:
from surprise import SVD, Dataset, Reader, NMF, accuracy
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.random_pred import NormalPredictor

reader = Reader(rating_scale=(1, 5))
ds = Dataset.load_from_df(data[["user", "doc_uri", "rating"]], reader)

baseline_model = NormalPredictor() # Baseline model, predicts labels based on distribution of ratings

### Matrix factorization-based CF

In [6]:
# Models, tune parameters, if you'd like ;)
svd = SVD() # Singular Value Decomposition
nmf = NMF() # Non-negative Matrix factorization

#### Model Evaluation
Don't expect accurate models <- they are trained with random noise

In [7]:
for algo in [baseline_model, svd, nmf]:
    cross_validate(algo, ds, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.2287  2.2567  2.2318  2.2418  2.2391  2.2396  0.0098  
MAE (testset)     1.8516  1.8831  1.8537  1.8687  1.8620  1.8638  0.0114  
Fit time          0.07    0.06    0.07    0.07    0.07    0.07    0.00    
Test time         0.11    0.11    0.11    0.11    0.10    0.11    0.00    
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9066  1.9137  1.8958  1.9219  1.9217  1.9120  0.0099  
MAE (testset)     1.6347  1.6388  1.6235  1.6465  1.6490  1.6385  0.0091  
Fit time          6.22    6.18    5.56    4.68    6.20    5.77    0.60    
Test time         0.14    0.13    0.14    0.14    0.13    0.13    0.00    
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
R

### k-NN-based CF

In [8]:
from surprise.prediction_algorithms.knns import KNNBasic

sim_options = {"name": "pearson", # pearson's r
               "user_based": False  # item-based
               }

knn = KNNBasic(sim_options=sim_options)

#### Model Evaluation
Don't expect accurate models <- they are trained with random noise

In [9]:
cross_validate(knn, ds, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.7774  1.7803  1.7871  1.7842  1.7867  1.7831  0.0038  
MAE (testset)     1.5437  1.5473  1.5555  1.5523  1.5535  1.5504  0.0043  
Fit time          6.93    6.36    6.73    5.65    5.63    6.26    0.54    
Test time         16.44   15.78   16.34   15.59   15.77   15.98   0.34    


{'fit_time': (6.9294939041137695,
  6.358834981918335,
  6.726486444473267,
  5.651165008544922,
  5.634487628936768),
 'test_mae': array([1.54365298, 1.547256  , 1.5554516 , 1.55232624, 1.55353504]),
 'test_rmse': array([1.77739804, 1.78027534, 1.78708917, 1.78420089, 1.78667148]),
 'test_time': (16.439178466796875,
  15.778387308120728,
  16.336272954940796,
  15.594653606414795,
  15.772710084915161)}

## Final model and predictions

#### Train & evaluate final model

In [10]:
# Train final model
trainset = ds.build_full_trainset()
knn.fit(trainset)

# RMSE of final model
testset = trainset.build_testset()
test_pred = knn.test(testset)
accuracy.rmse(test_pred, verbose=True) # should be very bad ;)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.3321


1.3321412404968649

#### Predict some document ratings

In [11]:
combinations_to_predict = [("Aaron Keith III", "http://gregory.com/"),
                           ("Abigail Wong", "http://hicks.com/"),
                           ("Julie Bullock", "https://www.garcia.com/"),
                           ("Victoria Perez", "http://lee-phillips.org/register/")]

In [12]:
# Predictions
for combination in combinations_to_predict:
    user = combination[0]
    doc = combination[1]
    pred = knn.predict(user, doc)
    print(pred[0], "should rate", pred[1], "with", int(round(pred[3])), "stars")

Aaron Keith III should rate http://gregory.com/ with 1 stars
Abigail Wong should rate http://hicks.com/ with 1 stars
Julie Bullock should rate https://www.garcia.com/ with 1 stars
Victoria Perez should rate http://lee-phillips.org/register/ with 2 stars


## Memory-based Collaborative Filtering

### Item-based CF

In [13]:
#FIXME