### Libraries

In [2]:
!pip3 install pyspark
!pip3 install lightfm
!pip3 install apyori

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 74.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=9a92d75b405ea1c035bf7ee4c97fbd6f3700530f20190da60f3d1a83fab27f82
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2
Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 15.0 MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
from IPython.display import Markdown,display
import gc
import time
from functools import partial
from os import path
from wordcloud import WordCloud
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec, NGram

import random
import numpy
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.evaluation import auc_score
from apyori import apriori
from datetime import datetime
from itertools import combinations

plt.rcParams["figure.figsize"] = (20,10)
warnings.simplefilter('ignore')

### Reading & Merging Data

In [5]:
# Read the data
order_products_train = pd.read_csv('/content/drive/MyDrive/Dissertation /order_products__train.csv')
order_products_prior = pd.read_csv('/content/drive/MyDrive/Dissertation /order_products__prior.csv')
orders = pd.read_csv('/content/drive/MyDrive/Dissertation /orders.csv')
products = pd.read_csv('/content/drive/MyDrive/Dissertation /products.csv')
aisles = pd.read_csv('/content/drive/MyDrive/Dissertation /aisles.csv')
departments = pd.read_csv('/content/drive/MyDrive/Dissertation /departments.csv')

# order_products_all = pd.concat([order_products_train, order_products_prior], axis=0)
# order_products_all = pd.merge(order_products_all, products, on='product_id', how='left')
# order_products_all = pd.merge(order_products_all, orders, on='order_id', how='left')
# order_products_all = pd.merge(order_products_all, aisles, on='aisle_id', how='left')
# order_products_all = pd.merge(order_products_all, departments, on='department_id', how='left')

# order_products_prior = pd.merge(order_products_prior, products, on='product_id', how='left')
# order_products_prior = pd.merge(order_products_prior, orders, on='order_id', how='left')
# order_products_prior = pd.merge(order_products_prior, aisles, on='aisle_id', how='left')
# order_products_prior = pd.merge(order_products_prior, departments, on='department_id', how='left')

# train_order_products = pd.merge(order_products_train, orders, on='order_id', how='left')
# train_order_products = pd.merge(train_order_products, products, on='product_id', how='left')

# order_products_train = pd.merge(order_products_train, orders, on='order_id', how='left')
# order_products_train = pd.merge(order_products_train, products, on='product_id', how='left')
# order_products_train = pd.merge(order_products_train, aisles, on='aisle_id', how='left')
# order_products_train = pd.merge(order_products_train, departments, on='department_id', how='left')

# # the product name is a string seperated with whitespace
# # we want to replace all whitespace with underscore, so that each product name is actually one word with no space in btw
# # And saving it in 'order_products_prior' table, column name 'product_name_with_no_space'

# products=order_products_prior['product_name']
# product_name_with_no_space=[]
# for product in products:
#     product=product.replace(" ","_")
#     product_name_with_no_space.append(product)
# order_products_prior['product_name_with_no_space']=product_name_with_no_space

# name_list=[]
# for p_name in order_products_prior.groupby('order_id')['product_name_with_no_space']:
#     name_list.append(' '.join(p_name[1]))

# order_id=order_products_prior.groupby('order_id')['product_name_with_no_space'].agg('count').index
# order_products=pd.DataFrame({'order_id':order_id,'products':name_list})

# order_products.head()

### Preprocessing Aisles and Departments dataset

In [6]:
aisles=aisles[aisles['aisle'].apply(lambda x:x != 'missing' and x != 'other')]
departments=departments[departments['department'].apply(lambda x: x != 'missing' and x != 'other')]

### Creating functions

In [7]:
def get_user_list(df,user_column):
    '''
    Creates the list of users with the user_column that comes from the inputted dataframe
    '''
    return np.sort(df[user_column].unique())

def get_item_list(df,item_name_column):
    '''
    Creates a list of items, using item_name_column which contains items form the given dataframeand then returns the item list
    '''
    item_list=df[item_name_column].unique()
    return item_list

def get_feature_list(aisle_df,department_df,aisle_name_column,department_name_column):
    aisle = aisle_df[aisle_name_column]
    department = department_df[department_name_column]
    
    return pd.concat([aisle, department], ignore_index = True).unique()

# creating user_id, item_id, and features_id

def id_mappings(user_list, item_list, feature_list):
    """
    
    converting userId, itemId and featureId by mapping ids
    
    """
    user_to_index_mapping = {}
    index_to_user_mapping = {}
    
    # loop through user list and assign it to either 
    # user-index mapping or index-user mapping
    for user_index, user_id in enumerate(user_list):
        user_to_index_mapping[user_id] = user_index
        index_to_user_mapping[user_index] = user_id
        
    item_to_index_mapping = {}
    index_to_item_mapping = {}
    
    # loop thorugh item list and assign it to either
    # item-index mapping or idex-item mapping
    for item_index, item_id in enumerate(item_list):
        item_to_index_mapping[item_id] = item_index
        index_to_item_mapping[item_index] = item_id
        
    feature_to_index_mapping = {}
    index_to_feature_mapping = {}
    
    # loop thorugh feature list and assign it to either
    # feature-index mapping or idex-feature mapping
    for feature_index, feature_id in enumerate(feature_list):
        feature_to_index_mapping[feature_id] = feature_index
        index_to_feature_mapping[feature_index] = feature_id
        
        
    #return the data that was mapped
    return user_to_index_mapping, index_to_user_mapping, \
           item_to_index_mapping, index_to_item_mapping, \
           feature_to_index_mapping, index_to_feature_mapping


def get_user_product_interaction(orders_df, order_products_train_df, order_products_test_df, products_df):
    
    # create user-product df by merging product and user dataset for the trainig data
    user_to_product_train_df = orders_df[orders_df["eval_set"] == "prior"][["user_id", "order_id"]].\
    merge(order_products_train_df[["order_id", "product_id"]]).merge(products_df[["product_id", "product_name"]])\
    [["user_id", "product_name"]].copy()
    
    # rate product as number purchases goes up
    user_to_product_train_df["product_count"] = 1
    user_to_product_rating_train = user_to_product_train_df.groupby(["user_id", "product_name"], as_index = False)["product_count"].sum()
    
    # create user-product df by merging product 
    # and user dataset for the testing data
    user_to_product_test_df = orders_df[orders_df["eval_set"] == "train"][["user_id", "order_id"]].\
    merge(order_products_test_df[["order_id", "product_id"]]).merge(products_df[["product_id", "product_name"]])\
    [["user_id", "product_name"]].copy()
    
    # giving rating as the number of product purchase count
    # (including the previous purchase in the training data)
    user_to_product_test_df["product_count"] = 1
    user_to_product_rating_test = user_to_product_test_df.groupby(["user_id", "product_name"], as_index = False)["product_count"].sum()
    
    # Merge first df user-product train with test 
    user_to_product_rating_test = user_to_product_rating_test.\
    merge(user_to_product_rating_train.rename(columns = {"product_count" : "previous_product_count"}), how = "left").fillna(0)
    user_to_product_rating_test["product_count"] = user_to_product_rating_test.apply(lambda x: x["previous_product_count"] + \
                                                                                    x["product_count"], axis = 1)
    user_to_product_rating_test.drop(columns = ["previous_product_count"], inplace = True)
    
    # return user-product rating train and test
    return user_to_product_rating_train, user_to_product_rating_test

# this function returns the interaction matrix
def get_interaction_matrix(df, df_column_as_row, df_column_as_col, df_column_as_value, row_indexing_map, 
                          col_indexing_map):
    
    row = df[df_column_as_row].apply(lambda x: row_indexing_map[x]).values
    col = df[df_column_as_col].apply(lambda x: col_indexing_map[x]).values
    value = df[df_column_as_value].values
    
    return coo_matrix((value, (row, col)), shape = (len(row_indexing_map), len(col_indexing_map)))

# this function returns the productFeature interaction dataframe
def get_product_feature_interaction(product_df, aisle_df, department_df, aisle_weight = 1, department_weight = 1):
    item_feature_df = product_df.merge(aisle_df).merge(department_df)[["product_name", "aisle", "department"]]
    
    item_feature_df["product_name"] = item_feature_df["product_name"]
    item_feature_df["aisle"] = item_feature_df["aisle"]
    item_feature_df["department"] = item_feature_df["department"]
    
    # fit aisle and departments under new column named feature
    product_aisle_df = item_feature_df[["product_name", "aisle"]].rename(columns = {"aisle" : "feature"})
    # adding weight to aisle feature
    product_aisle_df["feature_count"] = aisle_weight
    product_department_df = item_feature_df[["product_name", "department"]].rename(columns = {"department" : "feature"})
    product_department_df["feature_count"] = department_weight # adding weight to department feature
    
    # merge/concatinate product aisle and product department
    # while ignoring index
    product_feature_df = pd.concat([product_aisle_df, product_department_df], ignore_index=True)
    
    # This will allow the program to save memory and
    # not crash due to the amount of data been processed
    del item_feature_df
    del product_aisle_df
    del product_department_df
    
    # now we group the data and return the final result
    # grouping for summing over feature_count
    product_feature_df = product_feature_df.groupby(["product_name", "feature"], as_index = False)["feature_count"].sum()
    
    return product_feature_df

### Create the Lists for user,item and features.

In [8]:
users=get_user_list(orders,'user_id')
users

array([     1,      2,      3, ..., 206207, 206208, 206209])

In [9]:
items=get_item_list(products,'product_name')
items

array(['Chocolate Sandwich Cookies', 'All-Seasons Salt',
       'Robust Golden Unsweetened Oolong Tea', ..., 'Artisan Baguette',
       'Smartblend Healthy Metabolism Dry Cat Food',
       'Fresh Foaming Cleanser'], dtype=object)

In [10]:
features=get_feature_list(aisles,departments,'aisle','department')
features

array(['prepared soups salads', 'specialty cheeses',
       'energy granola bars', 'instant foods',
       'marinades meat preparation', 'packaged meat', 'bakery desserts',
       'pasta sauce', 'kitchen supplies', 'cold flu allergy',
       'fresh pasta', 'prepared meals', 'tofu meat alternatives',
       'packaged seafood', 'fresh herbs', 'baking ingredients',
       'bulk dried fruits vegetables', 'oils vinegars', 'oral hygiene',
       'packaged cheese', 'hair care', 'popcorn jerky', 'fresh fruits',
       'soap', 'coffee', 'beers coolers', 'red wines',
       'honeys syrups nectars', 'latino foods', 'refrigerated',
       'packaged produce', 'kosher foods', 'frozen meat seafood',
       'poultry counter', 'butter', 'ice cream ice', 'frozen meals',
       'seafood counter', 'dog food care', 'cat food care',
       'frozen vegan vegetarian', 'buns rolls', 'eye ear care',
       'candy chocolate', 'mint gum', 'vitamins supplements',
       'breakfast bars pastries', 'packaged poultry

### Map the features to index so we can use the LightFM library that requires integer index

In [11]:
# Generating the mapping through LightFM algorithm and it can only read integer indexes
user_to_index_mapping, index_to_user_mapping,item_to_index_mapping, index_to_item_mapping,feature_to_index_mapping, index_to_feature_mapping = id_mappings(users, items, features)


In [12]:
index_to_feature_mapping

{0: 'prepared soups salads',
 1: 'specialty cheeses',
 2: 'energy granola bars',
 3: 'instant foods',
 4: 'marinades meat preparation',
 5: 'packaged meat',
 6: 'bakery desserts',
 7: 'pasta sauce',
 8: 'kitchen supplies',
 9: 'cold flu allergy',
 10: 'fresh pasta',
 11: 'prepared meals',
 12: 'tofu meat alternatives',
 13: 'packaged seafood',
 14: 'fresh herbs',
 15: 'baking ingredients',
 16: 'bulk dried fruits vegetables',
 17: 'oils vinegars',
 18: 'oral hygiene',
 19: 'packaged cheese',
 20: 'hair care',
 21: 'popcorn jerky',
 22: 'fresh fruits',
 23: 'soap',
 24: 'coffee',
 25: 'beers coolers',
 26: 'red wines',
 27: 'honeys syrups nectars',
 28: 'latino foods',
 29: 'refrigerated',
 30: 'packaged produce',
 31: 'kosher foods',
 32: 'frozen meat seafood',
 33: 'poultry counter',
 34: 'butter',
 35: 'ice cream ice',
 36: 'frozen meals',
 37: 'seafood counter',
 38: 'dog food care',
 39: 'cat food care',
 40: 'frozen vegan vegetarian',
 41: 'buns rolls',
 42: 'eye ear care',
 43: '

### How many times a product was ordered by a user in train and test set

In [13]:
# convert all the lists into indexes in order to read the indexes of the matrices
user_to_product_rating_train, user_to_product_rating_test = get_user_product_interaction(orders, order_products_prior, order_products_train, products)

In [14]:
# Display the user-product rating train set
user_to_product_rating_train.head()

Unnamed: 0,user_id,product_name,product_count
0,1,0% Greek Strained Yogurt,1
1,1,Aged White Cheddar Popcorn,2
2,1,Bag of Organic Bananas,2
3,1,Bartlett Pears,1
4,1,Cinnamon Toast Crunch,3


In [15]:
# Display the user-product rating test set
user_to_product_rating_test.head()

Unnamed: 0,user_id,product_name,product_count
0,1,0% Greek Strained Yogurt,2.0
1,1,Aged White Cheddar Popcorn,3.0
2,1,Cinnamon Toast Crunch,4.0
3,1,Milk Chocolate Almonds,2.0
4,1,Organic Half & Half,3.0


### Create Different matrices
- Product-Feature matrix: this will allow to know how many products were ordered and what feature it is.
- User-Product matrix(train&test): Relationship between user and product in train and test sets.

In [16]:
# product feature matrix
product_to_feature=get_product_feature_interaction(product_df=products,aisle_df=aisles,department_df=departments,aisle_weight=1,department_weight=1)

In [17]:
product_to_feature.head()

Unnamed: 0,product_name,feature,feature_count
0,#2 Coffee Filters,beverages,1
1,#2 Coffee Filters,coffee,1
2,#2 Cone White Coffee Filters,beverages,1
3,#2 Cone White Coffee Filters,coffee,1
4,#2 Mechanical Pencils,household,1


In [18]:
# user-item matrix for training data
user_to_product_interaction_train = get_interaction_matrix(user_to_product_rating_train, "user_id", 
                                                    "product_name", "product_count", user_to_index_mapping, item_to_index_mapping)

# user-item matrix for testing data
user_to_product_interaction_test = get_interaction_matrix(user_to_product_rating_test, "user_id", 
                                                    "product_name", "product_count", user_to_index_mapping, item_to_index_mapping)

# create item-feature interaction
product_to_feature_interaction = get_interaction_matrix(product_to_feature, "product_name", "feature",  "feature_count", 
                                                        item_to_index_mapping, feature_to_index_mapping)

## LightFM Cross Validation

#### Using pure collaborative filtering, not adding item features

In [19]:
# initialization of model through the warp function
model_without_feature=LightFM(loss='warp')

In [21]:
# fit user-product matrix, this is only done through a pure collaborative filtering factor
start=time.time()

model_without_feature.fit(user_to_product_interaction_train,
                          user_features=None,
                          item_features=None,
                          sample_weight=None,
                          epochs=1,
                          num_threads=4,
                          verbose=False)

end=time.time()
print('Time taken = {0:.{1}f} seconds'.format(end-start,2))

Time taken = 13.46 seconds


In [23]:
# AUC metric score

start=time.time()

auc_without_features=auc_score(model=model_without_feature,
                               test_interactions=user_to_product_interaction_test,
                               num_threads=4,
                               check_intersections=False)
end=time.time()
print('Time taken = {0:.{1}f} seconds'.format(end-start,2))

Time taken = 195.52 seconds


In [24]:
print('Average AUC without adding item feature interaction = {0:{1}f}'.format(auc_without_features.mean(),2))

Average AUC without adding item feature interaction = 0.947792


After modeling the data through LightFM algorithm without features we found out the Average AUC is 0.94

#### Include item features

In [25]:
#model initialization
model_with_features = LightFM(loss = "warp")

In [26]:
# fit the model but this time using a hybrid collaborative filtering and content based (product + features)
start = time.time()
model_with_features.fit(user_to_product_interaction_train,
          user_features=None, 
          item_features=product_to_feature_interaction, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 17.10 seconds


In [27]:
start = time.time()
auc_with_features = auc_score(model = model_with_features, 
                        test_interactions = user_to_product_interaction_test,
                        train_interactions = user_to_product_interaction_train, 
                        item_features = product_to_feature_interaction,
                        num_threads = 4, check_intersections=False)
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))


time taken = 165.81 seconds


In [28]:
print("average AUC with adding item-feature interaction = {0:.{1}f}".format(auc_with_features.mean(), 2))

average AUC with adding item-feature interaction = 0.80


- After addind the features and training the model we found out that the AUC value is reduced to .80 when compared to the model without features

## Requesting Products / Items Recommendation

#### We will retrain the model and will be combining train and test sets

In [29]:
def combined_train_test(train, test):
    """
    
    Combine the follwoing sets
    training: contains previous number of order with rating by user
    testing: contains most recent number of order with rating by user
    
    """
    # initialize and asign train dictionary
    train_dict = {}
    for train_row, train_col, train_data in zip(train.row, train.col, train.data):
        train_dict[(train_row, train_col)] = train_data
        
    # replaces data with test data set
    
    for test_row, test_col, test_data in zip(test.row, test.col, test.data):
        train_dict[(test_row, test_col)] = max(test_data, train_dict.get((test_row, test_col), 0))
        
    
    # convert row, column and data elements to element array
    row_element = []
    col_element = []
    data_element = []
    for row, col in train_dict:
        row_element.append(row)
        col_element.append(col)
        data_element.append(train_dict[(row, col)])
        
    # convert elements to np array
    row_element = np.array(row_element)
    col_element = np.array(col_element)
    data_element = np.array(data_element)
    
    # return final data
    return coo_matrix((data_element, (row_element, col_element)), shape = (train.shape[0], train.shape[1]))

In [50]:
# combining train and test user-product interaction
user_to_product_interaction = combined_train_test(user_to_product_interaction_train, 
                                                 user_to_product_interaction_test)

In [51]:
# now we retrain the model

final_model = LightFM(loss = "warp")

start = time.time()
final_model.fit(user_to_product_interaction,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=1, 
          num_threads=4,
          verbose=False)
end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 13.54 seconds


### Creating a class that contains function in order to predict the recommendation based on user

In [56]:
class recommendation_sampling:
    
    def __init__(self, model, items = items, user_to_product_interaction_matrix = user_to_product_interaction, 
                user2index_map = user_to_index_mapping):
        
        self.user_to_product_interaction_matrix = user_to_product_interaction_matrix
        self.model = model
        self.items = items
        self.user2index_map = user2index_map
    
    def recommendation_for_user(self, user):
        
        # get user index
        
        userindex = self.user2index_map.get(user, None)
        
        if userindex == None:
            return None
        
        users = userindex
        
        # get purchased products
        
        known_positives = self.items[self.user_to_product_interaction_matrix.tocsr()[userindex].indices]
        
        # get score from predicted model
        scores = self.model.predict(user_ids = users, item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1]))
        
        # retrieve top scoring items
        
        top_items = self.items[np.argsort(-scores)]
        
        print("User ",user)
        print("     Known positives:")
        
        for x in known_positives[:10]:
            print("                  " ,x)
            
            
        print("     Recommended:")
        
        for x in top_items[:10]:
            print("                   ",x)

In [57]:
recom=recommendation_sampling(model=final_model)

#### Displaying the Recommended products for user 7 and user 20(sample)

In [58]:
recom.recommendation_for_user(7)

User  7
     Known positives:
                   Snack Bags
                   Antioxidant Infusions Brasilia Blueberry
                   Seedless Red Grapes
                   Large Pineapple Chunks
                   Soft Potato Bread
                   Apple Honeycrisp Organic
                   Organic Red Onion
                   Gogo Squeez Organic Apple Strawberry Applesauce on the Go
                   Yukon Gold Potatoes 5lb Bag
                   Mexican Finely Shredded Cheese
     Recommended:
                    Banana
                    Organic Strawberries
                    Organic Garlic
                    Organic Hass Avocado
                    Organic Baby Spinach
                    Organic Avocado
                    Bag of Organic Bananas
                    Limes
                    Large Lemon
                    Organic Red Onion


In [62]:
recom.recommendation_for_user(20)

User  20
     Known positives:
                   Clementines
                   Granny Smith Apples
                   Apples
                   Cheez-It Baked Snack Crackers
                   Original Rice Krispies Treats
                   Crunchy Oats 'n Honey Granola Bars
                   Popcorn
     Recommended:
                    Soda
                    Clementines
                    Real Mayonnaise
                    Roasted Pine Nut Hummus
                    Apples
                    Cereal
                    Organic Simply Naked Pita Chips
                    Sweet Kale Salad Mix
                    Baby Cucumbers
                    Crunchy Oats 'n Honey Granola Bars


## Conclusion
- From the above two tested predictions for user 7 and 20, we found out that some predicted products were bought by the customer.
- For instance, in user 20 we can see that the model predicted a list of products, out these few products a few have be bought by the customer.
- This concludes that atleast 1 out of 10 item are bought by the customers from the recommended list of products which are predicted by our model.
- Moreover, our model AUC is 95%, which is a good accuracy and very accurate but the performance is very slow due to large data set which is over 8GB.