### Initialize Modules and Data

##### Import the Needed Modules

In [None]:
import pandas as pd
import numpy as np
import heapq
from math import floor

##### Import Surprise
[Surprise](http://surpriselib.com/) is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.

In [None]:
from surprise import Reader, Dataset
from surprise import KNNWithMeans

##### Import Data

In [None]:
# Read the recipes, users and reviews CSV extracts in one pass; the order of
# the paths matches the order of the names on the left-hand side.
recipes_df, users_df, master_ratings_df = map(
    pd.read_csv,
    (
        'datasets/recipes-sub.csv',
        'datasets/users-sub.csv',
        'datasets/reviews-sub.csv',
    ),
)

In [None]:
# List the column names available in the recipes table.
recipes_df.columns

In [None]:
# Preview the first rows of the recipes table.
recipes_df.head()

In [None]:
# Preview the first rows of the users table.
users_df.head()

In [None]:
# Preview the first rows of the raw reviews table.
master_ratings_df.head()

### Data Cleaning

In [None]:
# Work on a separate frame without the 'date' and 'link' columns so the raw
# reviews stay untouched, then give the remaining three columns the
# user/item/rating names used throughout the rest of the notebook.
ratings_df = master_ratings_df.drop(columns=['date', 'link'])
ratings_df.columns = ['user', 'item', 'rating']

In [None]:
# Preview the trimmed ratings table (user, item, rating).
ratings_df.head()

##### Clean the ratings (Remove duplicates)

Tastes change, and making a recipe a few times can change a user's rating. This means that users can rate a recipe multiple times. You can see this in the ratings_df.head() above.

For a recommendation system it is best to use only one rating per user for each item.
For this system, only their most recent review on an item is kept. 

In [None]:
# Collapse repeated (user, item) reviews so that each pair keeps a single row.
#
# A dict keyed by the (user, item) pair replaces the earlier list scans, which
# made the loop quadratic in the number of reviews. Dicts preserve insertion
# order, so every pair stays at the position of its first review while its
# row is overwritten by each later review of the same pair.
# NOTE(review): "latest" means latest in file order -- this assumes the
# reviews CSV is sorted chronologically; TODO confirm.
values = ratings_df.values.tolist()

latest_by_pair = {}
for value in values:
    latest_by_pair[(value[0], value[1])] = value

# Kept under their original names in case later cells inspect them.
used_user_item_pairs = [list(pair) for pair in latest_by_pair]
clean_values = list(latest_by_pair.values())

clean_ratings_df = pd.DataFrame(clean_values, columns=['user', 'item', 'rating'])

The results after cleaning the ratings.

In [None]:
# Preview the de-duplicated ratings.
clean_ratings_df.head()

##### Define a Ratings scale
This scale is determined by the lowest and highest rating possible. 
In this case the lowest rating is 1, while the highest is 5.

In [None]:
# Tell Surprise the bounds of the rating scale (1 = lowest, 5 = highest).
reader = Reader(rating_scale=(1, 5))

# Hand Surprise the three columns in user / item / rating order.
data = Dataset.load_from_df(
    clean_ratings_df.loc[:, ['user', 'item', 'rating']],
    reader=reader,
)

### Build the model

##### KNN with Means - Surprise

[KNN with Means](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) has been chosen for the recommender, which is a basic collaborative filtering algorithm, taking into account the mean ratings of each user.

In [None]:
def build_recommender(user_based=False, sim_type='cosine'):
    """Create an unfitted KNNWithMeans model.

    Args:
        user_based: Forwarded as the ``user_based`` similarity option.
            Defaults to False.
        sim_type: Forwarded as the ``name`` similarity option.
            Defaults to 'cosine'.

    Returns:
        A new ``KNNWithMeans`` instance configured with those options.
    """
    return KNNWithMeans(sim_options=dict(name=sim_type, user_based=user_based))

##### Calculate the Similarity Matrix

Ignoring folds, this builds the *Trainset* using [build_full_trainset()](https://surprise.readthedocs.io/en/stable/dataset.html#surprise.dataset.DatasetAutoFolds.build_full_trainset).

The Trainset is built from the data, but it also holds additional information derived from it, such as the number of users and items and the global mean rating.

In [None]:
# Create the item-based model (build_recommender's defaults).
# The user-based variant stays disabled: fitting it seems to give a memory
# error, due to the much larger amount of users than recipes.
# user_based_recommender = build_recommender(user_based=True)
item_based_recommender = build_recommender()

# Train on every available rating; no folds are held out.
trainset = data.build_full_trainset()

# user_based_recommender.fit(trainset)
item_based_recommender.fit(trainset)

##### Prediction

Use this test to see how users might rate a specific recipe.

In [None]:
# Spot-check: print the estimated rating of item 167 for user ids 0-149.
# The loop variable comes straight from range(); the former manual counter
# (`i = 1` before the loop, `i = i + 1` inside it) was overwritten on every
# iteration and had no effect.
for i in range(150):
    prediction = item_based_recommender.predict(i, 167)
    print(round(prediction.est, 2), end=', ')

### Inference

Here is the whole recommendation step wrapped in a single function.

In [None]:
def get_r(user_id, n=200):
    """Recommend recipes the user has not rated yet.

    Every item in ``clean_ratings_df`` that ``user_id`` has not rated gets an
    estimated rating from the item-based recommender; the ``n`` highest
    estimates are looked up in ``recipes_df``.

    Args:
        user_id: Id of the user to recommend for, as found in the 'user'
            column of ``clean_ratings_df``.
        n: Maximum number of recipes to return. Defaults to 200.

    Returns:
        A DataFrame indexed by 'recipe_id' holding the recipe columns plus a
        'pred_rating' column, sorted by 'pred_rating' in descending order.
    """
    # Due to memory constraints, item based is the only viable option.
    recommender_system = item_based_recommender

    # Sets give constant-time membership tests while filtering below.
    user_rows = clean_ratings_df['user'] == user_id
    rated_items = set(clean_ratings_df.loc[user_rows, 'item'].tolist())
    all_item_ids = set(clean_ratings_df['item'].tolist())

    # Items the user has not rated yet.
    new_items = [item_id for item_id in all_item_ids if item_id not in rated_items]

    # Estimate a rating for every unrated item.
    predicted_ratings = {
        item_id: recommender_system.predict(user_id, item_id).est
        for item_id in new_items
    }

    # Ids of the n best estimates.
    recommended_ids = heapq.nlargest(n, predicted_ratings, key=predicted_ratings.get)

    recommended_df = recipes_df.loc[recipes_df['recipe_id'].isin(recommended_ids)].copy()
    recommended_df.set_index('recipe_id', inplace=True)

    # Look each estimate up by recipe id rather than by row position: the
    # filtered frame keeps recipes_df's row order, which need not match the
    # order of recommended_ids, and it may hold fewer or more rows than ids
    # when recipes are missing from or duplicated in recipes_df.
    pred_column = np.array(
        [predicted_ratings[recipe_id] for recipe_id in recommended_df.index],
        dtype=float,
    )
    recommended_df.insert(1, 'pred_rating', pred_column)

    # Sort before truncating so the cut always keeps the best-rated rows.
    return recommended_df.sort_values('pred_rating', ascending=False).head(n)

### Get a Recommendation Based on Ingredients

The final code that will be implemented in a cleaner fashion through the browser interface.

In [None]:
# Ask for a user id, the ingredients on hand and a minimum rating, then show
# the recommended recipes that contain every listed ingredient.
user_id = int(input('Enter user id: '))

ingredient_list = input('Enter the ingredients separated by commas that you have on hand: ')

# Trim the whitespace around each ingredient and drop empty entries, so that
# input such as "eggs, milk," searches for "milk" instead of " milk" or "".
items = [part.strip() for part in ingredient_list.split(',') if part.strip()]

# get the lowest rating
rating = int(input('Enter the lowest rating you\'ll accept: '))

# Row(s) of the users table for this id; empty when the id is unknown.
user_name = users_df.loc[users_df['user_id'] == user_id]

# print some details
if user_name.empty:
    # An unknown id used to raise IndexError on the positional lookup below.
    print('\nuser: ', 'unknown id', user_id)
else:
    # NOTE(review): the second column is presumably the user's name -- confirm.
    print('\nuser: ', user_name.iloc[0, 1])
print(ingredient_list)
print('\nHere are your recommendations.')
test = get_r(user_id)
for item in items:
    # Literal, case-insensitive match: regex characters in the input are not
    # interpreted, and recipes without ingredient text simply do not match.
    test = test[test['ingredients'].str.contains(item, case=False, regex=False, na=False)]
test = test[test['pred_rating'] >= rating]
test

In [None]:
# Reload the three CSV extracts under short names for ad-hoc inspection.
# The paths use the datasets/ folder, consistent with the loads in the
# "Import Data" cell of this notebook.
rec_df = pd.read_csv('datasets/recipes-sub.csv')
u_df = pd.read_csv('datasets/users-sub.csv')
r_df = pd.read_csv('datasets/reviews-sub.csv')

In [None]:
# Show every review whose 'reviewer_id' equals `a`.
# The mask is built from r_df itself, so the filter no longer depends on
# another frame having the same index.
a = 99
r_df.loc[r_df['reviewer_id'] == a]