In [1]:
import pandas as pd
import numpy as np
import random
%matplotlib inline
import matplotlib.pyplot as plt
import surprise
from collections import defaultdict
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import KNNBasic
from surprise import AlgoBase, BaselineOnly
from get_top_n import get_top_n
from model.hybrid_model import HybridModel
from model.evaluation import evaluation
from Association_rule.association_model import generate_rules,predict
from Association_rule.deal_meta import also_bought
from model.content_based import ContentBasedModel

In [2]:
df = pd.read_csv('sample_data.csv',index_col=0)
df_meta = pd.read_csv('sample_data_meta.csv',index_col=0)

In [3]:
# Turn data frame into dictionary
df_records = df[['reviewerID','productID']].to_dict('records')
df_dict = defaultdict(list)
for row in df_records:
    df_dict[row['reviewerID']].append(row['productID'])

In [4]:
# Select % as holdout data
holdout = []
for reviewer in df_dict:
    hd_product = df_dict[reviewer][:max(1,int(0.25*len(df_dict[reviewer])))]
    for product in hd_product:
        holdout.append((reviewer,product))
df_tupleindex = df.set_index(['reviewerID','productID'])

In [5]:
# Develop training and test data
testdata = df_tupleindex.loc[holdout].reset_index()
traindata = df_tupleindex[~df_tupleindex.index.isin(holdout)].reset_index()

## Hybrid Model

In [6]:
# Divide training data into dense and sparse data, and handle them separately 
sparse_dt, dense_dt = HybridModel.divide_data(traindata,10)

In [7]:
sparse_dt.describe()

Unnamed: 0,rating
count,4189.0
mean,3.815469
std,1.092253
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [8]:
dense_dt.describe()

Unnamed: 0,rating
count,3607.0
mean,3.818686
std,1.075804
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


For dense data, we use a mixed hybrid model with our own Content-based model, 
along with the Matrix Factorization model, neighborhood based model we built in Part I

In [9]:
# Content-based model and its set of prediction
sim_matrix = ContentBasedModel.product_similarity(df_meta)
content_recommendation_dense = ContentBasedModel.content_prediction(sim_matrix,dense_dt,n=7)

In [10]:
# Part I model: SVD model and its set of prediction
reader = Reader(rating_scale=(1,5))
df_surprise = Dataset.load_from_df(dense_dt[['reviewerID','productID','rating']],reader)
df_surprise.split(n_folds=5)

In [11]:
#Construct missing ratings we need to predict
dense_data_select = dense_dt.pivot_table('rating',index=['reviewerID','productID'],dropna=False)
dense_data_select = dense_data_select.loc[dense_data_select['rating'].isnull()]
dense_data_select.reset_index(inplace=True)
missing_values = dense_data_select[['reviewerID','productID']].values

In [12]:
#Function to predict missing values based on an algorithm
def mv_prediction(algo,missing_values):
    predictions = [algo.predict(uid, iid)
                       for (uid, iid) in missing_values]
    return predictions

In [13]:
algo1 = SVDpp()
evaluate(algo1,df_surprise,measures=['RMSE','MAE'], verbose= 0)

CaseInsensitiveDefaultDict(list,
                           {'mae': [0.72127434780679112,
                             0.71124389472378668,
                             0.74925698391476181,
                             0.77478785366761083,
                             0.73683655851066721],
                            'rmse': [0.92635288753231326,
                             0.91990076128425902,
                             0.96155513391082459,
                             0.97428970339349186,
                             0.95014640910207793]})

In [14]:
# Extract product recommendation list for each user
def extract_topk_surpise(prediction):
    topk = get_top_n(prediction,n=7)
    topk_norating = defaultdict(list)
    for user, i_r in topk.items():
        for item, rating in i_r:
            topk_norating[user].append(item)
    return topk_norating

In [15]:
# Get top k recommendation from SVD model
mv_svdprediction = mv_prediction(algo1,missing_values)
svd_topk = extract_topk_surpise(mv_svdprediction)

In [16]:
# Part I Model: KNN and its set of prediction
algo_name = KNNBasic ##  KNNWithMeans,KNNBaseline
sim_option={'name': 'cosine', ## cosine, msd, pearson, personbaseline
                 'user_based': 'False', ## False for item-based
                 'min_surpport':0 }##  if |Iuv|<min_support then sim(u,v)=0
max_k = 10 ## The (max) number of neighbors to take into account for aggregation
min_k = 7 ##  If there are not enough neighbors, the prediction is set the the global mean of all ratings
knn_default = algo_name(k = max_k, min_k = min_k, sim_options=sim_option)
#Train model
evaluate(knn_default, df_surprise, measures=['RMSE','MAE'], verbose= 0)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.77300343145511174,
                             0.80189637735636354,
                             0.83214480643583355,
                             0.83424495884933803,
                             0.79420809888989474],
                            'rmse': [0.9846738070683464,
                             1.0248678490150547,
                             1.0651146726383325,
                             1.0448437138173421,
                             1.0284486537004782]})

In [17]:
# Get top k recommendation from KNN
mv_knnprediction = mv_prediction(knn_default,missing_values)
knn_topk = extract_topk_surpise(mv_knnprediction)

For sparse data, we use a mixed hybrid model of our own Association rule and Content-based model.
We already trained the content-based model above, now we need to train the Association rule model

In [18]:
#Association rule model and its set of prediction
bought_together = also_bought(df_meta)
rules = generate_rules(traindata, information=bought_together, minsupport=10)
rule_prediction = predict(sparse_dt, rules)
HybridModel.fill_prediction(rule_prediction,n=7)

In [19]:
#Content based model prediction for sparse data
content_recommendation_sparse = ContentBasedModel.content_prediction(sim_matrix,sparse_dt,n=7)

In [20]:
# Final set of recommendation as mixed recommendation of Content-based, SVD, KNN for dense matrix,
# and mixed recommendation of Content-based, Association rule for sparse matrix
# The recommendations are presented side-by-side to each user
sparse_prediction = HybridModel.recommendation_mixer(rule_prediction,content_recommendation_sparse,n=7)
dense_prediction = HybridModel.recommendation_mixer(svd_topk,knn_topk,content_recommendation_dense,n=7)
hybrid_prediction = HybridModel.combine_prediction(sparse_prediction,dense_prediction)

## Evaluation

Recall at top-k:
For each user, check if the prediction contains any of products in the holdout set. If yes, we count
the prediction as a success, and a failure otherwise. Recall at top-k is measured as percentage of
users with sucessful recommendation out of total number of users. This measurement is based on the 
same idea as in this paper: https://arxiv.org/pdf/1703.02344.pdf

In [21]:
# Recall at top k
evaluation.recall_at_topk(hybrid_prediction,testdata)

0.26174496644295303

In [22]:
#SVD only
df_surprise_all = Dataset.load_from_df(traindata[['reviewerID','productID','rating']],reader)
df_surprise_all.split(n_folds=5)
#Construct missing ratings for whole set
traindata_select = traindata.pivot_table('rating',index=['reviewerID','productID'],dropna=False)
traindata_select = traindata_select.loc[traindata_select['rating'].isnull()]
traindata_select.reset_index(inplace=True)
missing_values_all = traindata_select[['reviewerID','productID']].values
mv_svdprediction_all = mv_prediction(algo1,missing_values_all)
svd_topk_all = extract_topk_surpise(mv_svdprediction_all)

In [23]:
evaluation.recall_at_topk(svd_topk_all,testdata)

0.09904761904761905

In [24]:
#SVD and KNN on whole training set
mv_knnprediction_all = mv_prediction(knn_default,missing_values_all)
knn_topk_all = extract_topk_surpise(mv_knnprediction_all)
svd_knn_hybrid_prediction = HybridModel.recommendation_mixer(svd_topk_all,knn_topk_all,n=7)

In [25]:
evaluation.recall_at_topk(svd_knn_hybrid_prediction,testdata)

0.10571428571428572

In [26]:
# Coverage ratio
# Coverage ratio is measured as number of products recommended over total number of products
evaluation.coverage_ratio(hybrid_prediction,df)

0.93

In [27]:
#SVD only
evaluation.coverage_ratio(svd_topk_all,df)

0.22

In [28]:
evaluation.coverage_ratio(svd_knn_hybrid_prediction,df)

0.32

In [29]:
#Content-based model only
content_recommendation_all = ContentBasedModel.content_prediction(sim_matrix,traindata,n=7)
evaluation.recall_at_topk(content_recommendation_all,testdata)

0.23142857142857143

In [30]:
#Coverage ratio: Content-based model only
evaluation.coverage_ratio(content_recommendation_all,df)

0.93