In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pickle

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # statistical plotting

In [None]:
# Load the recipe data into the pandas DataFrame `recipes`.
# on_bad_lines='skip' makes the parser skip malformed lines instead of raising.
recipes = pd.read_csv(
    'recipes.csv',
    engine='python',
    on_bad_lines='skip',
)

In [None]:
# Load the review data into the pandas DataFrame `reviews`.
# on_bad_lines='skip' makes the parser skip malformed lines instead of raising.
reviews = pd.read_csv(
    'reviews.csv',
    engine='python',
    on_bad_lines='skip',
)

# ⚡ Data pre-processing

In [None]:
# Keep only the review columns used below; all other columns (the original note
# names DateSubmitted and DateModified) are dropped.
# .copy() gives `reviews` its own data, so later in-place edits do not operate on
# a slice of the loaded frame (which triggers SettingWithCopyWarning).
reviews = reviews[['ReviewId','RecipeId','AuthorId','AuthorName','Rating','Review']].copy()

In [None]:
# Function for counting Null values
def countNullValues(col):
  """Return the number of NaN entries found while iterating over ``col``.

  An entry is counted when it compares unequal to itself (``x != x``), which
  is how a floating-point NaN behaves.
  """
  return sum(1 for entry in col if entry != entry)

In [None]:
# Count the NaN entries of every column in the reviews dataframe
# (c1..c6 follow the column order listed below).
c1, c2, c3, c4, c5, c6 = (
    countNullValues(reviews[name])
    for name in ('ReviewId', 'RecipeId', 'AuthorId', 'AuthorName', 'Rating', 'Review')
)

In [None]:
# Function for replacing Null Values
def replaceNullValue(column):
  """Replace every NaN entry of ``column`` with the string 'NA', in place.

  An entry is treated as null when ``str(entry) == 'nan'``.

  Parameters
  ----------
  column : pandas Series or mutable sequence (e.g. list)
      The values to clean. The object itself is modified.

  Returns
  -------
  The same ``column`` object, after replacement.
  """
  # Collect the positions first so the container is not modified while it is
  # being iterated.
  nullPositions = [pos for pos, value in enumerate(column) if str(value) == 'nan']
  if hasattr(column, 'iloc'):
    # `pos` is a position, not an index label: writing through .iloc keeps the
    # replacement correct when the index is not the default 0..n-1 range
    # (label-based `column[pos] = ...` would append new labels instead).
    if nullPositions:
      column.iloc[nullPositions] = 'NA'
  else:
    for pos in nullPositions:
      column[pos] = 'NA'
  return column

In [None]:
# Replace missing review texts with 'NA' and store the result back explicitly.
# Relying on the helper's in-place edit reaching `reviews` through the
# `reviews.Review` accessor is fragile: it warns on a sliced frame and has no
# effect on `reviews` when pandas copy-on-write is active.
reviews = reviews.assign(Review=replaceNullValue(reviews['Review'].copy()))
reviews['Review']

In [None]:
# Inspect the column labels and the (rows, columns) shape of reviews
(reviews.columns, reviews.shape)

In [None]:
# Display summary statistics for the reviews data.
# .describe() reports statistics for the numerical columns by default.
reviews.describe()

In [None]:
# Plot how many reviews were given for each Rating value.
# seaborn (`sns`) is imported with the other libraries in the notebook's import
# cell rather than in the middle of the analysis.
with sns.axes_style('white'):
    g = sns.catplot(x="Rating", data=reviews, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")

In [None]:
# Summary counts: number of rating rows, distinct authors and distinct recipes
print("\nTotal no of ratings :", len(reviews))
print("Total No of authors   :", np.unique(reviews['AuthorId']).size)
print("Total No of recipes  :", np.unique(reviews['RecipeId']).size)

# ⚡ Item-based Collaborative Filtering Recommender

In [None]:
# Minimum number of ratings a recipe needs in order to be kept
threshold = 30
# Number of review rows per RecipeId
counts = reviews['RecipeId'].value_counts()
# Ids of the recipes whose count reaches the threshold
valid_recipes = counts.loc[counts.ge(threshold)].index
valid_recipes

In [None]:
# Keep only the reviews whose RecipeId is listed in valid_recipes
filtered_ratings = reviews.loc[reviews['RecipeId'].isin(valid_recipes)]
filtered_ratings.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index.
# Rebinding the name (instead of inplace=True) avoids mutating a filtered
# slice of `reviews` and keeps the cell safe to re-run.
filtered_ratings = filtered_ratings.reset_index(drop=True)

In [1]:
# Build the user-item rating matrix: one row per AuthorId, one column per
# RecipeId, cells holding the Rating.
# pivot_table averages repeated (AuthorId, RecipeId) pairs, whereas
# DataFrame.pivot raises ValueError as soon as a pair occurs more than once.
# Author/recipe combinations without a rating are filled with 0.
matrix = filtered_ratings.pivot_table(index = 'AuthorId', columns ='RecipeId', values = 'Rating', aggfunc = 'mean').fillna(0)
matrix

NameError: name 'filtered_ratings' is not defined

In [None]:
# Test input: the matrix column labelled 56 (a RecipeId), i.e. the value stored
# for that recipe in every AuthorId row
users_rating = matrix.loc[:, 56]
users_rating.sort_values(ascending=False).head(20)

In [None]:
# Correlate every recipe column of the matrix with the selected recipe's ratings.
# corrwith returns a Series; it is converted to a one-column DataFrame named
# 'correlation' because the following cells call sort_values(by='correlation')
# and .join on it, which a plain Series does not support.
similar_recipes = matrix.corrwith(users_rating).to_frame(name='correlation')
similar_recipes

In [None]:
# Show the 20 entries with the highest correlation.
# NOTE(review): sort_values(by='correlation') needs `similar_recipes` to be a
# DataFrame with a 'correlation' column; Series.sort_values does not accept `by`.
similar_recipes.sort_values(by='correlation', ascending=False).head(20)

# Weighted rating calculation

In [None]:
# Number of ratings per RecipeId, ordered from most-rated to least-rated
RatingCounts = (
    filtered_ratings
    .groupby('RecipeId')['Rating']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# m: the rating-count threshold, reused as the `m` term of the weighted-rating formula
m = threshold

In [None]:
# Attach each recipe's rating count (the RatingCounts series) to its row in
# `recipes`, matching on RecipeId; recipes without a count are not kept.
weightedPopularRecipes = pd.merge(recipes, RatingCounts, on='RecipeId')

In [None]:
# C: mean AggregatedRating over the recipes that have one.
# The values are kept as floats: casting to int before averaging would drop
# the fractional part of every rating and distort the mean.
RatingAverages = weightedPopularRecipes['AggregatedRating'].dropna().astype('float')
C = RatingAverages.mean()
C

In [None]:
# Shortlist the recipes whose rating count reaches the threshold and that have
# an AggregatedRating. Columns kept: RecipeId, Name, RecipeCategory,
# AggregatedRating, Rating (the rating count) and ReviewCount.
# .copy() makes the shortlist an independent frame so the column assignments
# below do not act on a slice of weightedPopularRecipes.
qualifiedRecipes = weightedPopularRecipes.loc[
    (weightedPopularRecipes['Rating'] >= threshold)
    & (weightedPopularRecipes['Rating'].notnull())
    & (weightedPopularRecipes['AggregatedRating'].notnull()),
    ['RecipeId', 'Name', 'RecipeCategory', 'AggregatedRating', 'Rating', 'ReviewCount'],
].copy()

# 'Rating' is a count, so it is stored as int; 'AggregatedRating' stays float so
# fractional ratings are not truncated before scoring.
qualifiedRecipes['Rating'] = qualifiedRecipes['Rating'].astype('int')
qualifiedRecipes['AggregatedRating'] = qualifiedRecipes['AggregatedRating'].astype('float')

In [None]:
# Function that defines the mathematical formula for the weighted rating method
def weightedRating(x, min_votes=None, mean_rating=None):
    """Return the weighted rating  v/(v+m) * R + m/(v+m) * C.

    Parameters
    ----------
    x : mapping, pandas Series (one row) or DataFrame
        Must provide 'Rating' (v, the number of ratings) and
        'AggregatedRating' (R, the recipe's rating).
    min_votes : number, optional
        The m term. Defaults to the notebook-level variable ``m``.
    mean_rating : number, optional
        The C term. Defaults to the notebook-level variable ``C``.

    Returns
    -------
    The weighted score: a scalar for a single row, a Series for a DataFrame.
    """
    v = x['Rating']
    R = x['AggregatedRating']
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing calls such as df.apply(weightedRating, axis=1) keep working.
    if min_votes is None:
        min_votes = m
    if mean_rating is None:
        mean_rating = C
    return (v/(v+min_votes) * R) + (min_votes/(min_votes+v) * mean_rating)

In [None]:
# Score every shortlisted recipe with the weighted-rating formula.
# weightedRating only performs column arithmetic on 'Rating' and
# 'AggregatedRating', so it is called once on the whole frame instead of once
# per row; assign() returns a new frame rather than mutating the existing one.
qualifiedRecipes = qualifiedRecipes.assign(score=weightedRating(qualifiedRecipes))
qualifiedRecipes.head()

In [None]:
# Mean score per RecipeId, as a one-column DataFrame indexed by RecipeId
df_rating = qualifiedRecipes.groupby('RecipeId')['score'].mean().to_frame()
df_rating.head(10)

In [None]:
# Combine each entry of similar_recipes with its weighted score and rank the
# result by correlation, highest first.
# NOTE(review): .join and sort_values(by='correlation') need `similar_recipes`
# to be a DataFrame with a 'correlation' column — confirm how it is built.
result_recipes = similar_recipes.join(df_rating['score']).sort_values(by='correlation', ascending=False)
result_recipes.head(10)