# **ALS WR based Food Recommender System**

In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 62kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.8MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=c0867b25e08a0b2629993c339140420f0159f71e97de68d114b19d178ee1380c
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


## **Import all necessary libraries**

In [2]:
from pyspark.sql import SparkSession

# $example on$
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS,ALSModel
from pyspark.sql import Row
from pyspark.sql.functions import *
import pickle
from sklearn.externals import joblib
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
import re
import os
from sklearn.preprocessing import MinMaxScaler




##  **Loading Data**

In [3]:
raw_data = pd.read_csv('https://docs.google.com/spreadsheets/d/1dqttLAVj3fSY0a5lEPyyJ8qGAXnISitx7Gc_6jo1IQY/export?format=csv')
raw_data = raw_data.loc[:,['custid','item','ratings']]
raw_data.columns = ['Customer', 'Item', 'Rating']

In [4]:
data = raw_data.dropna()
data = data.copy()

In [5]:
data['Customer'] = data['Customer'].astype("category")
data['Item'] = data['Item'].astype("category")
data['Cust_id'] = data['Customer'].cat.codes
data['Item_id'] = data['Item'].cat.codes

In [6]:
data

Unnamed: 0,Customer,Item,Rating,Cust_id,Item_id
0,91,biryani,5,0,0
1,91,kebab,5,0,3
2,91,chicken bharta,4,0,1
3,92,veg rice,5,1,6
4,92,ice-cream,4,1,2
5,92,paneer,5,1,5
6,93,biryani,5,2,0
7,93,kebab,5,2,3
8,93,mutton,5,2,4
9,94,paneer,5,3,5


## **Creating a categorical look up**

In [7]:
item_lookup = data[['Item_id', 'Item']].drop_duplicates()
item_lookup['Item_id'] = item_lookup.Item_id.astype(str)
item_lookup

Unnamed: 0,Item_id,Item
0,0,biryani
1,3,kebab
2,1,chicken bharta
3,6,veg rice
4,2,ice-cream
5,5,paneer
8,4,mutton


In [8]:
user_lookup = data[['Cust_id', 'Customer']].drop_duplicates()
user_lookup['Cust_id'] = user_lookup.Cust_id.astype(str)
user_lookup

Unnamed: 0,Cust_id,Customer
0,0,91
3,1,92
6,2,93
9,3,94
11,4,95


In [9]:
type(data)

pandas.core.frame.DataFrame

In [10]:
# Dropping records with no ratings
data = data.loc[data.Rating != 0]

## **Creating a sparse matrix for the customers who has given ratings on food items.**

In [11]:
customers = list(np.sort(data.Cust_id.unique()))
items = list(np.sort(data.Item_id.unique()))
ratings = list(data.Rating)
rows = data.Cust_id.astype(int)
cols = data.Item_id.astype(int)
data_sparse_new = sparse.csr_matrix((ratings, (rows, cols)), shape=(len(customers), len(items)))

print(data_sparse_new)
data_sparse_new

  (0, 0)	5
  (0, 1)	4
  (0, 3)	5
  (1, 2)	4
  (1, 5)	5
  (1, 6)	5
  (2, 0)	10
  (2, 3)	10
  (2, 4)	10
  (3, 3)	4
  (3, 5)	10
  (4, 0)	2


<5x7 sparse matrix of type '<class 'numpy.longlong'>'
	with 12 stored elements in Compressed Sparse Row format>

# **Defining the ALS algorithm**

In [12]:
def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):
    # Calculate the Confidence for each value in our data
    confidence = sparse_data * alpha_val
    
    # Get the size of customer rows and item columns using numpy array shape
    cust_size, item_size = sparse_data.shape
    
    # We create the customer vectors X of size customers x features, 
    # the item vectors Y of size items x features and randomly assign values to them using np.random.normal
    X = sparse.csr_matrix(np.random.normal(size = (cust_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size = (item_size, features)))
    
    #Identity matrix and lambda * I
    X_I = sparse.eye(cust_size)
    Y_I = sparse.eye(item_size)
    
    I = sparse.eye(features)
    lI = lambda_val * I
    
    for i in range(iterations):
        print('iteration %d of %d' % (i+1, iterations))
        
        # Precompute Y-transpose-Y and X-transpose-X
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)

        # Run in a loop for entire user data
        for u in range(cust_size):

            # Get the user row.
            u_row = confidence[u,:].toarray() 

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu and Cu - I
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T.dot(CuI).dot(Y)
            yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

    
        for i in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:,i].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci and Ci - I
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T.dot(CiI).dot(X)
            xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
            Y[i] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y

## **Customer and FoodItem vectors**

In [13]:
cust_vecs1, item_vecs1 = implicit_als(data_sparse_new, iterations=5, features=20, alpha_val=40)


iteration 1 of 5
iteration 2 of 5
iteration 3 of 5
iteration 4 of 5
iteration 5 of 5


In [14]:
print(cust_vecs1)

  (0, 0)	-0.6481904560780107
  (0, 1)	0.08577650076106114
  (0, 2)	-0.4573854067139783
  (0, 3)	-0.8339356664417787
  (0, 4)	0.8970446968041712
  (0, 5)	0.7267616949645123
  (0, 6)	2.150678198023378
  (0, 7)	-0.7620840714552968
  (0, 8)	-0.9973197725121524
  (0, 9)	-0.182443394396636
  (0, 10)	-0.20074023487488543
  (0, 11)	-0.058193176397202624
  (0, 12)	-0.47793611516163764
  (0, 13)	-0.6829497228671951
  (0, 14)	0.3861377199161134
  (0, 15)	-0.30001996735785946
  (0, 16)	1.009393599546401
  (0, 17)	-0.26967198521785535
  (0, 18)	-0.703212208066198
  (0, 19)	0.43523837130045673
  (1, 0)	0.10417230755059477
  (1, 1)	0.0019390532090803536
  (1, 2)	0.4242604306561099
  (1, 3)	0.3406096488741739
  (1, 4)	0.07339379002951306
  :	:
  (3, 15)	0.017066232313341263
  (3, 16)	0.10543472565324984
  (3, 17)	-0.10294505788471009
  (3, 18)	-0.013414289690726606
  (3, 19)	-0.11310219326459246
  (4, 0)	0.17722847123844282
  (4, 1)	0.1486230652138508
  (4, 2)	0.1767757670325118
  (4, 3)	-0.0767820793

In [15]:
print(item_vecs1)

  (0, 0)	1.1115965642654668
  (0, 1)	0.04902362470734524
  (0, 2)	1.2876913724254686
  (0, 3)	-0.9164620845766105
  (0, 4)	-0.25082914488092517
  (0, 5)	-0.2031739681376303
  (0, 6)	0.2561202418497235
  (0, 7)	-1.1992659403702743
  (0, 8)	0.7618479052169935
  (0, 9)	0.14682111099598397
  (0, 10)	0.6343282318653304
  (0, 11)	-1.3245747985029477
  (0, 12)	-1.1915110113412044
  (0, 13)	-1.6570442319971699
  (0, 14)	-0.49872456345412947
  (0, 15)	-0.844859615486777
  (0, 16)	-0.4471011069997945
  (0, 17)	0.19457322328154358
  (0, 18)	0.6852533958488102
  (0, 19)	1.1780171511373287
  (1, 0)	-0.057279347634515074
  (1, 1)	0.012835435604252035
  (1, 2)	-0.07886406927285318
  (1, 3)	-0.07463306283942545
  (1, 4)	0.050208961924542314
  :	:
  (5, 15)	0.2729517802880324
  (5, 16)	0.33145793761057696
  (5, 17)	-0.27166412953244484
  (5, 18)	-0.2016123115120569
  (5, 19)	-0.28300592056339885
  (6, 0)	0.1098437931817436
  (6, 1)	0.10856851947283679
  (6, 2)	0.17932723761531283
  (6, 3)	0.26074271019

## **Defining the method the recommendations of food items per customer based on collaborative filtering method on ratings**

In [16]:
def recommend(Cust_id, data_sparse_new, cust_vecs1, item_vecs1, item_lookup, num_items=10):

    # Get all interactions by the customers
    cust_interactions = data_sparse_new[Cust_id,:].toarray()

    # We don't want to recommend items the customer has consumed. So let's set them all to 0 and the unknowns to 1.
    cust_interactions = cust_interactions.reshape(-1) + 1 #Reshape to turn into 1D array
    cust_interactions[cust_interactions > 1] = 0

    # This is where we calculate the recommendation by taking the 
    # dot-product of the customer vectors with the item vectors.
    rec_vector = cust_vecs1[Cust_id,:].dot(item_vecs1.T).toarray()

    # Let's scale our scores between 0 and 1 to make it all easier to interpret.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = cust_interactions*rec_vector_scaled
   
    # Get all the items indices in order of recommendations (descending) and
    # select only the top items. 
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    items = []
    scores = []

    # Loop through our recommended mcat indicies and look up the actual mcat name
    for idx in item_idx:
        items.append(item_lookup.Item.loc[item_lookup.Item_id == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended mcat names and scores
    customer_name=data[(data.Cust_id == Cust_id)]["Customer"].iloc[0]
    recommendations = pd.DataFrame({'Customer': customer_name,'Item': items, 'score': scores})
    
    return recommendations


## **Adding all the recommendations to a pandas dataframe**

In [23]:
R1=pd.DataFrame(columns=['Customer','Item','score'])
Ids = list(data.Cust_id.unique())

for i in range(len(Ids)):
  Cust_id = Ids[i]
  recommendations = recommend(Cust_id, data_sparse_new, cust_vecs1, item_vecs1, item_lookup)
  R1 = R1.append(recommendations)

In [24]:
type(R1)

pandas.core.frame.DataFrame

In [25]:
def predict(R1,Cust_param):
    return R1[(R1.Customer == Cust_param)].sort_values(by='score',ascending=False)

In [26]:
#Unit testing
predict(R1,91)

Unnamed: 0,Customer,Item,score
0,91,mutton,0.342463
1,91,veg rice,0.244927
2,91,ice-cream,0.244346
3,91,paneer,0.0
4,91,kebab,0.0
5,91,chicken bharta,0.0
6,91,biryani,0.0


## **Saving the recommendations to excel for manual data testing**

In [27]:
R2 = R1[['Customer','Item','score']]
R2 = R2.sort_values(by='Customer')
R2.to_excel('ALS_Food_Recommendations.xlsx',index=False)

## **Saving the model to pickle file for further deployment**

In [28]:
saved_model = pickle.dumps(R2)  
# Save the model as a pickle in a file
joblib.dump(saved_model, 'ALSModel_file.pkl')

['ALSModel_file.pkl']

In [29]:
from flask import Flask, request, jsonify, render_template
import pickle
app = Flask(__name__)
model = pickle.load(open('ALSModel_file.pkl', 'rb'))

## **Little more testing **

In [30]:
ALSmodel_from_pickle = pickle.loads(model)
predict(ALSmodel_from_pickle,94)

Unnamed: 0,Customer,Item,score
0,94,ice-cream,0.388923
1,94,veg rice,0.388903
2,94,mutton,0.382035
3,94,chicken bharta,0.351075
6,94,biryani,0.0
4,94,paneer,0.0
5,94,kebab,0.0


In [31]:
ALSmodel_from_pickle[(ALSmodel_from_pickle.Customer==93)]

Unnamed: 0,Customer,Item,score
5,93,chicken bharta,0.0
6,93,biryani,0.0
4,93,kebab,0.0
3,93,mutton,0.0
2,93,veg rice,0.861857
1,93,ice-cream,0.862476
0,93,paneer,0.891403
