# 0. Imports

In [2]:
import pandas as pd
import numpy as np

---

# 1. Load datasets

In [3]:
df_reviews = pd.read_csv("datasets/raw/reviews.csv")
df_reviews.to_pickle('datasets/raw/reviews.pkl')

In [4]:
df_reviews = pd.read_pickle('datasets/raw/reviews.pkl')

In [5]:
df_recipes = pd.read_csv("datasets/sample/recipes_sample_main.csv", sep=';')
len(df_recipes)

24748

### Make sample df

df_sample = df_recipes.sample(frac = .1)
len(df_sample)

### Save sample (CSV+PICKLE) 

## 1.1 Load from pickle

### Override if necessary

---

# 2. Refine 

### Review simplify

In [6]:
df_reviews.drop(["ReviewId", "AuthorName", "Review", "DateSubmitted", "DateModified"], axis=1, inplace = True)
df_reviews.columns = ["Item", "User", "Rating"]
df_reviews = df_reviews[["User", "Item", "Rating"]]

### Filtering reviews

In [7]:
df_recipe_review_counts = df_reviews.groupby(['Item']).size()

df_filtered_recipes = df_recipe_review_counts.loc[(df_recipe_review_counts > 10)]

filtered_recipes_list = df_filtered_recipes.index.tolist()

len(filtered_recipes_list)

23429

### Is the reviewed recipe in the main dataset (narrowing)

In [8]:
all_recipes_list = df_recipes.id.tolist()

filtered_all_recipes_list = [value for value in filtered_recipes_list if value in all_recipes_list]

len(filtered_all_recipes_list)

1192

In [9]:
df_recipe_ratings = df_reviews[df_reviews['Item'].isin(filtered_all_recipes_list)]
df_recipe_ratings.count()

User      38234
Item      38234
Rating    38234
dtype: int64

In [10]:
df_recipe_ratings

Unnamed: 0,User,Item,Rating
5,2046,5221,4
12,2369,5221,5
45,2178,4366,5
111,2525,8468,0
170,4074,5221,5
...,...,...,...
1401625,2002889404,87058,2
1401631,2002889744,91762,5
1401817,2002897355,22430,1
1401929,2038463,26370,0


### Save recipe ratings

In [12]:
df_recipe_ratings.to_pickle('serialized-model/recipe_ratings.pkl')

---

# SAMPLE

# Add new user

---

# 3. Model training

### Imports

In [13]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise.model_selection import cross_validate

import pickle

### Load and read df

In [14]:
reader = Reader(rating_scale=(0, 5))

#data = Dataset.load_from_df(df_concat_v3, reader)
data = Dataset.load_from_df(df_recipe_ratings, reader)
#data = Dataset.load_from_df(df_concat, reader)

### Training set

In [15]:
trainSet = data.build_full_trainset()

In [16]:
trainSet.to_raw_uid(0)

2046

In [17]:
def trainset_contains(id):
    try:
        trainSet.to_inner_uid(id)
        return True
    except:
        print("Item is not part of the trainset.")
        return False

In [18]:
trainset_contains(90221)

Item is not part of the trainset.


False

### Save and load training set

In [19]:
pickle.dump(trainSet, open('serialized-model/trainset.pkl', 'wb'))

### Anti training (test) set for ONE specific user

In [20]:
def make_anti_testset(raw_user_id):
    anti_testset_user = []
    
    #targetUser = 0 #inner_id of the target user
    targetUser = trainSet.to_inner_uid(raw_user_id)
    
    fillValue = trainSet.global_mean
    
    user_item_ratings = trainSet.ur[targetUser]
    user_items = [item for (item,_) in (user_item_ratings)]
    user_items
    
    ratings = trainSet.all_ratings()
    
    for iid in trainSet.all_items():
        if(iid not in user_items):
            anti_testset_user.append((trainSet.to_raw_uid(targetUser),trainSet.to_raw_iid(iid),fillValue))
            
    return anti_testset_user

## 3.1 Validating

### Normal Predictor

In [21]:
algo = NormalPredictor()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5904  1.5857  1.5793  1.5681  1.5949  1.5837  0.0093  
MAE (testset)     1.0686  1.0722  1.0591  1.0593  1.0737  1.0666  0.0063  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.03    0.02    0.02    0.05    0.02    0.03    0.01    


{'test_rmse': array([1.59039088, 1.58566352, 1.57929978, 1.5681242 , 1.59490917]),
 'test_mae': array([1.06856473, 1.07223472, 1.05911124, 1.05929634, 1.07370516]),
 'fit_time': (0.015499591827392578,
  0.01630878448486328,
  0.01741790771484375,
  0.016812562942504883,
  0.016965866088867188),
 'test_time': (0.02936863899230957,
  0.020272254943847656,
  0.02042555809020996,
  0.05369424819946289,
  0.0196840763092041)}

### KNNBasic

In [22]:
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
algo = KNNBasic(sim_options=sim_options)
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


  sim = construction_func[name](*args)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2899  1.3010  1.3227  1.3045  0.0136  
MAE (testset)     0.7711  0.7783  0.7864  0.7786  0.0062  
Fit time          0.04    0.05    0.04    0.04    0.00    
Test time         0.06    0.10    0.06    0.07    0.02    


{'test_rmse': array([1.28986258, 1.30098233, 1.3226601 ]),
 'test_mae': array([0.77111071, 0.77828397, 0.7863993 ]),
 'fit_time': (0.04423403739929199, 0.046239376068115234, 0.04424023628234863),
 'test_time': (0.06112480163574219, 0.09769225120544434, 0.06223440170288086)}

### SVD

In [23]:
#n_factors=100, n_epochs=25, lr_all=0.005, reg_all=0.1
algo = SVD(n_factors=100, n_epochs=25, lr_all=0.005, reg_all=0.1)
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.3140  1.3119  1.3324  1.3194  0.0092  
MAE (testset)     0.8499  0.8536  0.8620  0.8552  0.0051  
Fit time          2.85    2.57    2.58    2.67    0.13    
Test time         0.18    0.11    0.10    0.13    0.04    


{'test_rmse': array([1.31404924, 1.31189568, 1.33236792]),
 'test_mae': array([0.84987175, 0.85362411, 0.86201313]),
 'fit_time': (2.848823308944702, 2.5730812549591064, 2.5816802978515625),
 'test_time': (0.18066859245300293, 0.1072394847869873, 0.10192370414733887)}

### GridSearchCV

In [12]:
from surprise.model_selection import GridSearchCV

In [None]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [100,150],
              'n_epochs': [20,25,30],
              'lr_all':[0.005,0.01,0.1],
              'reg_all':[0.02,0.05,0.1]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=3)
grid_search.fit(data)

In [None]:
print(grid_search.best_score['rmse'])
print(grid_search.best_score['mae'])

In [None]:
print(grid_search.best_params['rmse'])

In [None]:
algo = grid_search.best_estimator['rmse']

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

## 3.2 Fit and test on trainset

### Fitting algorithm

In [23]:
algo = KNNBasic(sim_options = {"name":"cosine", "user_based":False})#SVD()
algo.fit(trainSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f7b9673a3a0>

In [24]:
pickle.dump(algo, open('serialized-model/recrecsys.pkl', 'wb'))

### Checking for one user

In [25]:
user_id = 8

raw_user_id = trainSet.to_raw_uid(user_id)

raw_user_id
#2178

5142

### List user reviewed recipes

In [26]:
df_user_recipes = df_recipe_ratings.loc[df_recipe_ratings["User"] == raw_user_id]
#df_user_recipes = df_concat.loc[df_concat["User"] == raw_user_id]

df_recipes.loc[df_recipes["id"].isin(df_user_recipes.Item.tolist())]

Unnamed: 0,id,name,author_name,submitted,image_url,minutes,description,category,tags,search_terms,steps,ingredients,ingredients_raw_str,serving_size,servings,calories,rating,review_count
6337,8509,Pork Chops With Beer,Lorraine Handlin,2000-11-13,"""https://img.sndimg.com/food/image/upload/w_55...",90,Makes a great brown gravy.,Pork,"{""weeknight"",""time-to-make"",""main-ingredient"",...","{""pork""}",{Season pork chops with garlic powder; salt; p...,"{""pork chops"",""garlic powder"",""salt"",""pepper"",...","{""6 pork chops (lean and thick)"",""1 te...",1 (618 g),3,803.5,4.5,66


In [27]:
df_recipe_ratings.loc[(df_recipe_ratings["Item"].isin(df_user_recipes.Item.tolist())) & (df_recipe_ratings["User"] == raw_user_id)]

Unnamed: 0,User,Item,Rating
235,5142,8509,4


### Make anti testset

In [28]:
anti_testset = make_anti_testset(raw_user_id)

###  Make prediction with locally made algorithm

In [29]:
predictions = algo.test(anti_testset)

### OR with pickled

In [32]:
pickled_model = pickle.load(open('serialized-model/recrecsys.pkl', 'rb'))
predictions = pickled_model.test(anti_testset)

In [33]:
pred = pd.DataFrame(predictions)
pred = pred.loc[pred["est"] == 5]
pred = pd.concat([pred, pred['details'].apply(pd.Series)], axis = 1).drop('details', axis = 1)
pred.sort_values(by=['actual_k'], inplace=True, ascending=False)

KeyError: 'actual_k'

In [34]:
pred

Unnamed: 0,uid,iid,r_ui,est


### List Top-n results

In [35]:
def list_result(predictions):
    pred = pd.DataFrame(predictions)
    pred.sort_values(by=['est'], inplace=True, ascending=False)
    
    recipe_list = pred.head(10)['iid'].to_list()
    
    print(recipe_list)
    
    # return df_recipes.loc[recipe_list]
    return df_recipes.loc[df_recipes["id"].isin(recipe_list)]

In [36]:
def list_result_knn(predictions):
    pred = pd.DataFrame(predictions)
    pred = pred.loc[pred["est"] == 5]
    pred = pd.concat([pred, pred['details'].apply(pd.Series)], axis = 1).drop('details', axis = 1)
    pred.sort_values(by=['actual_k'], inplace=True, ascending=False)
    recipe_list = pred.head(10)['iid'].to_list()
    
    for x, y in zip(recipe_list, pred.head(10)['actual_k']):
        print(x, y)
    # return df_recipes.loc[recipe_list]
    return df_recipes.loc[df_recipes["id"].isin(recipe_list)]

## KNN

In [37]:
df_results = list_result_knn(predictions)
df_results

KeyError: 'actual_k'

### Examination

---

# 4. Save model