<a href="https://colab.research.google.com/github/kiwihero/icecream-recommendation-system/blob/simpleRNN/icecream_simple_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Icecream Recomendation system using RNN

    1.Import necessary Libraries
    2.Exploratory Data Analysis(Duplicates, Missing Data)
    3.Data Clensing & Feature Engineering
    4.Data preprocessing
    5.Model Building
    6.Model Evaluation

1. Import libraries

In [181]:
#imports
import numpy as np #Mathematical Operations
import pandas as pd #Data analysis
import matplotlib.pyplot as plt 
import seaborn as sns #Visulisation
from __future__ import print_function
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence #To preprocess sequence data
from keras.models import Sequential #Sequential Model
from keras.layers import Dense, Embedding #Fully connected layer
from keras.layers import SimpleRNN #Model
from keras import initializers #To initialize random weights of layers
from keras.preprocessing.text import Tokenizer

In [167]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = 'REPLACE_WITH_YOUR_FILE_ID'
downloaded = drive.CreateFile({'id': file_id})
print('Downloaded content "{}"'.format(downloaded.GetContentString()))

In [145]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [146]:
# processing data set and filtering out columns that have no information
products = pd.read_csv("/content/drive/My Drive/combined/products.csv")
products = products.drop(["subhead"],1)

reviews = pd.read_csv("/content/drive/My Drive/combined/reviews.csv")
reviews=reviews.drop(["ingredients","texture", "likes","taste"],axis=1)



2. Exploratory Data Analysis(Duplicates, Missing Data)

3. Data Cleansing

In [168]:
# Datasets size
print(products.shape)
print(reviews.shape)

(241, 7)
(21674, 9)


In [170]:
# Look for unique count of each feature
products.nunique()

brand             4
key             241
name            240
description     237
rating           29
rating_count    136
ingredients     240
dtype: int64

Take-away: 
1. By looking at unique count name and ingredients having one duplicate each let's see why it has a duplicate
2. Description feature having 4 duplicate records

In [172]:
# Find the duplicate in name columns
products[products['name'].duplicated()]

Unnamed: 0,brand,key,name,description,rating,rating_count,ingredients
221,breyers,49_breyers,Chocolate Chip Cookie Dough,With creamy vanilla and chunks of cookie dough...,1.2,89,"MILK, SUGAR, CORN SYRUP, CREAM, ENRICHED WHEAT..."


In [174]:
#Find the other record with same name
products[products['name'] == 'Chocolate Chip Cookie Dough']

Unnamed: 0,brand,key,name,description,rating,rating_count,ingredients
16,bj,16_bj,Chocolate Chip Cookie Dough,We knew we were onto something big when we mad...,4.6,983,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
221,breyers,49_breyers,Chocolate Chip Cookie Dough,With creamy vanilla and chunks of cookie dough...,1.2,89,"MILK, SUGAR, CORN SYRUP, CREAM, ENRICHED WHEAT..."


Take-away: Same name but different brand, so make sense to keep both

In [175]:
# Check of count of each column
products.count()

brand           241
key             241
name            241
description     237
rating          241
rating_count    241
ingredients     241
dtype: int64

In [176]:
# Look for count of nulls in products dataframe
products.isna().sum()

brand           0
key             0
name            0
description     4
rating          0
rating_count    0
ingredients     0
dtype: int64

In [177]:
# Explore the data where description is null
products[products['description'].isnull()]

Unnamed: 0,brand,key,name,description,rating,rating_count,ingredients
139,talenti,12_talenti,COCONUT CHOCOLATE COOKIE,,4.3,29,"WATER, SUGAR, DESICCATED COCONUT, COCONUT OIL,..."
158,talenti,31_talenti,PACIFIC COAST PISTACHIO GELATO,,4.5,201,"MILK, SUGAR, CREAM, PISTACHIOS, DEXTROSE, CARO..."
165,talenti,38_talenti,SALTED CARAMEL TRUFFLE,,4.7,160,"MILK, SUGAR, SKIM MILK, CREAM, WATER, COCONUT ..."
166,talenti,39_talenti,SEA SALT CARAMEL GELATO,,4.1,146,"MILK, SKIM MILK, SUGAR, CREAM, EGG YOLK AND WH..."


Take-away: As Description feature having nulls not even 2% of the dataset  so 
dropping those records won't impact.

In [178]:
# Drop records where null values in any column
products.dropna(inplace=True)

In [179]:
#Check count of each column
products.count()

brand           237
key             237
name            237
description     237
rating          237
rating_count    237
ingredients     237
dtype: int64

4.Data preprocessing

In [149]:
# Slicing the dependent & independent features
x = products['ingredients']
y = products['key']

In [150]:
# Check for max length of words in x, y
print(x.apply(lambda x:len(str(x).split())).max())
print(y.apply(lambda x:len(str(x).split())).max())

1

In [151]:
# Tokenise the ingridents columns
token =Tokenizer(num_words=150)
token.fit_on_texts(x)
sequences = token.texts_to_sequences(x)

In [152]:
# Lable encoding for ice cream keys
encoded = LabelEncoder()
y_encoded = encoded.fit_transform(y)

In [155]:
# Casting the independent feature to ndarray
x_array = np.array(sequences)

  """Entry point for launching an IPython kernel.


Preparing the data for training using train_test_split.
https://www.bitdegree.org/learn/train-test-split

In [159]:
## split the data 
x_train, x_test, y_train, y_test = train_test_split(x_array,y_encoded, test_size=0.2, random_state=42)
#train, test = train_test_split(products, test_size=0.2, random_state=42)

print("Train data shape:", train.shape)
print("Test shape:", test.shape)

Train data shape: (192, 7)
Test shape: (49, 7)


In [160]:
# variable for unique product key for ice cream brand and flavor
n_product = len(products.key.unique())

In [161]:
max_features = 30000
maxlen = 150
batch_size = 32

In [162]:
# building RNN model
rnn_hidden_dim = 10
word_embedding_dim = 50
model_rnn = Sequential()
model_rnn.add(Embedding(max_features, word_embedding_dim))
model_rnn.add(SimpleRNN(rnn_hidden_dim,
                    kernel_initializer=initializers.RandomNormal(stddev=0.001),
                    recurrent_initializer=initializers.Identity(gain=1.0),
                    activation='relu',
                    input_shape=train.shape[1:]))

model_rnn.add(Dense(1, activation='sigmoid'))

In [163]:
model_rnn.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 50)          1500000   
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 10)                610       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 1,500,621
Trainable params: 1,500,621
Non-trainable params: 0
_________________________________________________________________


In [164]:
rmsprop = keras.optimizers.RMSprop(lr = .0001)

model_rnn.compile(loss='SparseCategoricalCrossentropy',
              optimizer=rmsprop,
              metrics=['accuracy'])

In [181]:
model = model_rnn.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_data=(x_test, y_test))

In [None]:
score, acc = model_rnn.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Write a function to plot the train & test losses
def plot_loss_accuracy(model):
    fig = plt.figure(figsize=(12, 6))
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(history.history["loss"],'r-x', label="Train Loss")
    ax.plot(history.history["val_loss"],'b-x', label="Validation Loss")
    ax.legend()
    ax.set_title('cross_entropy loss')
    ax.grid(True)


    ax = fig.add_subplot(1, 2, 2)
    ax.plot(history.history["accuracy"],'r-x', label="Train Accuracy")
    ax.plot(history.history["val_accuracy"],'b-x', label="Validation Accuracy")
    ax.legend()
    ax.set_title('accuracy')
    ax.grid(True)  

# Calling the function
plot_loss_accuracy(model)