## FP Growth

### Loading Libraries

In [2]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext,SparkConf
from pyspark.sql.functions import collect_set, col, count
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.session import SparkSession

In [1]:
!pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 55.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=68d00476751a05bf620ef27e226670ea4b551ab18511fe8c67e2f164f44e5666
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [3]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used below
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Remove pandas' display limits (row count, column count, overall width and
# per-cell width) so the frames shown below are not cut off by these settings.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

### Define SparkSession

In [5]:
# Build (or reuse) a local SparkSession. getOrCreate() keeps this cell safe to
# re-run: constructing SparkContext('local') a second time in the same kernel
# fails because only one SparkContext may be active at once.
spark = SparkSession.builder.master('local').getOrCreate()

# Keep the `sc` handle that this cell has always exposed.
sc = spark.sparkContext

### Creating Dataframes

In [6]:
# Folder on the mounted Drive that holds the CSV files. The trailing space is
# copied verbatim from the original paths -- presumably the Drive folder really
# is named "Dissertation "; verify before "fixing" it.
DATA_DIR = "/content/drive/MyDrive/Dissertation "


def read_csv_table(file_name):
    """Load one CSV from DATA_DIR, treating the first row as the header and inferring column types."""
    return spark.read.csv(DATA_DIR + "/" + file_name, header=True, inferSchema=True)


# Import Data
als = read_csv_table("aisles.csv")
depts = read_csv_table("departments.csv")
OrdProdPr = read_csv_table("order_products__prior.csv")
OrdProdTr = read_csv_table("order_products__train.csv")
ords = read_csv_table("orders.csv")
prds = read_csv_table("products.csv")

# Create Temporary Tables
# Each view is registered under the same name as its Python variable so the
# Spark SQL queries in later cells can reference the tables directly.
for view_name, frame in [
    ("als", als),
    ("depts", depts),
    ("OrdProdPr", OrdProdPr),
    ("OrdProdTr", OrdProdTr),
    ("ords", ords),
    ("prds", prds),
]:
    frame.createOrReplaceTempView(view_name)

### Creating a Basket dataframe before training a model or algorithm

In [9]:
# Build the basket DataFrame: one row per order_id holding the set of distinct
# product names bought in that order.
# The join condition is written as an explicit ON clause (instead of a bare
# INNER JOIN filtered in WHERE) so the query cannot be read as a cross join.
d = spark.sql(
    "select p.product_name, o.order_id "
    "from prds p inner join OrdProdTr o on o.product_id = p.product_id"
)

# collect_set de-duplicates product names within an order.
cart = d.groupBy('order_id').agg(collect_set('product_name').alias('products'))
cart.createOrReplaceTempView('cart')

# NOTE: toPandas() pulls every basket to the driver; only the first 10 are shown.
cart_pd = cart.toPandas()
cart_pd.head(10)

Unnamed: 0,order_id,products
0,1342,"[Raw Shrimp, Seedless Cucumbers, Versatile Stain Remover, Organic Strawberries, Organic Mandarins, Chicken Apple Sausage, Pink Lady Apples, Bag of Organic Bananas]"
1,1591,"[Cracked Wheat, Strawberry Rhubarb Yoghurt, Organic Bunny Fruit Snacks Berry Patch, Goodness Grapeness Organic Juice Drink, Honey Graham Snacks, Spinach, Granny Smith Apples, Oven Roasted Turkey Breast, Pure Vanilla Extract, Chewy 25% Low Sugar Chocolate Chip Granola, Banana, Original Turkey Burgers Smoke Flavor Added, Twisted Tropical Tango Organic Juice Drink, Navel Oranges, Lower Sugar Instant Oatmeal Variety, Ultra Thin Sliced Provolone Cheese, Natural Vanilla Ice Cream, Cinnamon Multigrain Cereal, Garlic, Goldfish Pretzel Baked Snack Crackers, Original Whole Grain Chips, Medium Scarlet Raspberries, Lemon Yogurt, Original Patties (100965) 12 Oz Breakfast, Nutty Bars, Strawberry Banana Smoothie, Green Machine Juice Smoothie, Coconut Dreams Cookies, Buttermilk Waffles, Uncured Genoa Salami, Organic Greek Whole Milk Blended Vanilla Bean Yogurt]"
2,4519,[Beet Apple Carrot Lemon Ginger Organic Cold Pressed Juice Beverage]
3,4935,[Vodka]
4,6357,"[Globe Eggplant, Panko Bread Crumbs, Fresh Mozzarella Ball, Grated Parmesan, Gala Apples, Italian Pasta Sauce Basilico Tomato, Basil & Garlic, Organic Basil, Banana, Provolone]"
5,10362,"[Organic Baby Spinach, Organic Spring Mix, Organic Leek, Slow Roasted Lightly Seasoned Chick'n, Organic Basil, Organic Shredded Mild Cheddar, Bag of Organic Bananas, Sliced Baby Bella Mushrooms, Organic Tapioca Flour, Organic Gala Apples, Lemons, Limes, Pitted Dates, Jalapeno Peppers, Original Tofurky Deli Slices, Organic Red Bell Pepper, Organic Shredded Carrots, Roma Tomato, Crinkle Cut French Fries, Large Greenhouse Tomato, Organic Pinto Beans, Organic Three Grain Tempeh, Organic Garnet Sweet Potato (Yam), Organic Coconut Milk, Organic Extra Firm Tofu, Ground Sausage Style Veggie Protein, Extra Virgin Olive Oil, Hass Avocados, Multigrain Tortilla Chips, The Ultimate Beefless Burger, Yellow Bell Pepper, Coconut Flour, Light Brown Sugar, Organic Harissa Seasoning, Crushed Garlic, Organic Whole Cashews]"
6,19204,"[Reduced Fat Crackers, Dishwasher Cleaner, Peanut Powder, Disinfecting Wipes Lemon & Fresh Scent, Lemon Lime Thirst Quencher, Fat Free & Lower Sodium Chicken Broth, American Blend Salad, Cinnamon Cereal, Extra Nasal Strips, Reduced Fat Creamy Peanut Butter Spread, Mozzarella String Cheese Sticks, Light Low-Moisture Part Skim, Electrolyte Enhanced Water, Original Petroleum Jelly, High Efficiency Complete Dual Formula]"
7,29601,"[Organic Red Onion, Small Batch Authentic Taqueria Tortilla Chips, Hummus, Hope, Original Recipe, Unsweetened Whole Milk Peach Greek Yogurt, Toasted Coconut Almondmilk Blend, Skillet Refried Red Beans Sautéed With Onion & Tomatillo, Almondmilk, Pure, Chocolate Protein, Organic Greek Lowfat Yogurt With Strawberries, Bag of Organic Bananas, California Orange Juice, Mini Whole Wheat Pita Bread, Coconut Almond Creamer Blend, Banana Chia Pod, Tomatillo Salsa, SALSA FRNTR CHPTL SALSA, Guacamole, Water]"
8,31035,"[Organic Cripps Pink Apples, Organic Golden Delicious Apple, Organic Navel Orange, Bag of Organic Bananas]"
9,40011,"[Organic Baby Spinach, Organic Blues Bread with Blue Cornmeal Crust, Sea Salt Macadamias, Natural Calm Magnesium Supplement Raspberry Lemon Flavor Powder, Chocolate Coconut Protein Bar, Sport Chocolate Mint Protein Bar]"


In [12]:
# Keep only the item lists from the basket view and display the first 5 rows.
# The query result already has a single column named 'products', so no
# renaming step is needed.
products = spark.sql('select products from cart')
products_pd = products.toPandas()
products_pd.head()

Unnamed: 0,products
0,"[Raw Shrimp, Seedless Cucumbers, Versatile Stain Remover, Organic Strawberries, Organic Mandarins, Chicken Apple Sausage, Pink Lady Apples, Bag of Organic Bananas]"
1,"[Cracked Wheat, Strawberry Rhubarb Yoghurt, Organic Bunny Fruit Snacks Berry Patch, Goodness Grapeness Organic Juice Drink, Honey Graham Snacks, Spinach, Granny Smith Apples, Oven Roasted Turkey Breast, Pure Vanilla Extract, Chewy 25% Low Sugar Chocolate Chip Granola, Banana, Original Turkey Burgers Smoke Flavor Added, Twisted Tropical Tango Organic Juice Drink, Navel Oranges, Lower Sugar Instant Oatmeal Variety, Ultra Thin Sliced Provolone Cheese, Natural Vanilla Ice Cream, Cinnamon Multigrain Cereal, Garlic, Goldfish Pretzel Baked Snack Crackers, Original Whole Grain Chips, Medium Scarlet Raspberries, Lemon Yogurt, Original Patties (100965) 12 Oz Breakfast, Nutty Bars, Strawberry Banana Smoothie, Green Machine Juice Smoothie, Coconut Dreams Cookies, Buttermilk Waffles, Uncured Genoa Salami, Organic Greek Whole Milk Blended Vanilla Bean Yogurt]"
2,[Beet Apple Carrot Lemon Ginger Organic Cold Pressed Juice Beverage]
3,[Vodka]
4,"[Globe Eggplant, Panko Bread Crumbs, Fresh Mozzarella Ball, Grated Parmesan, Gala Apples, Italian Pasta Sauce Basilico Tomato, Basil & Garlic, Organic Basil, Banana, Provolone]"


### Model training

In [37]:
# Configure FP-Growth over the 'products' array column of the basket frame.
#   minSupport=0.001 : minimum support an itemset needs to count as frequent
#   minConfidence=0  : no association rule is dropped for low confidence
#   numPartitions=20 : number of partitions used by the parallel FP-Growth run
fpg = FPGrowth(
    itemsCol='products',
    minSupport=0.001,
    minConfidence=0,
    numPartitions=20,
)

# Fit on the baskets built earlier.
fpg_model = fpg.fit(cart)

In [39]:
# Frequent itemsets found by the fitted model, exposed to Spark SQL as a view.
freq_products = fpg_model.freqItemsets
freq_products.createOrReplaceTempView('freq_products')

# Keep itemsets with more than one product, most frequent first.
freq_query = 'select items,freq from freq_products where size(items)>1 order by freq desc'
freq_products_1 = spark.sql(freq_query)

# Bring the result to pandas and show the first rows.
freq_products_pd = freq_products_1.toPandas()
freq_products_pd.head()

Unnamed: 0,items,freq
0,"[Organic Strawberries, Bag of Organic Bananas]",3074
1,"[Organic Hass Avocado, Bag of Organic Bananas]",2420
2,"[Organic Baby Spinach, Bag of Organic Bananas]",2236
3,"[Organic Avocado, Banana]",2216
4,"[Organic Strawberries, Banana]",2174


### Generating Association Rules

#### Given what a customer has already bought (antecedent), we can recommend what they are likely to purchase next (consequent)

In [40]:
# Association rules derived by the fitted model, exposed to Spark SQL as a view.
rules = fpg_model.associationRules
rules.createOrReplaceTempView('rules')

# Relabel the columns as "if"/"then" and sort by confidence, highest first.
rules_query = 'SELECT antecedent AS `antecedent (if)`,consequent AS `consequent (then)`,confidence, lift FROM rules ORDER BY confidence DESC'
rules_1 = spark.sql(rules_query)

# Bring the result to pandas and show the first rows.
rules_pd = rules_1.toPandas()
rules_pd.head()

Unnamed: 0,antecedent (if),consequent (then),confidence,lift
0,"[Organic Raspberries, Organic Hass Avocado, Organic Strawberries]",[Bag of Organic Bananas],0.598425,5.072272
1,"[Organic Cucumber, Organic Hass Avocado, Organic Strawberries]",[Bag of Organic Bananas],0.546875,4.635331
2,"[Organic Kiwi, Organic Hass Avocado]",[Bag of Organic Bananas],0.545977,4.627719
3,"[Organic Navel Orange, Organic Raspberries]",[Bag of Organic Bananas],0.541219,4.587387
4,"[Yellow Onions, Strawberries]",[Banana],0.535714,3.753633


- We can see that the rules shown above all have a confidence above 0.50. Based on them, the model will recommend `Bag of Organic Bananas` to anyone whose basket contains, for example, Organic Raspberries, Organic Hass Avocado and Organic Strawberries, or Organic Kiwi and Organic Hass Avocado, or Organic Navel Orange and Organic Raspberries; a basket containing Yellow Onions and Strawberries leads to a recommendation of `Banana`.

#### Display actual items purchased vs predicted items

In [42]:
# Apply the fitted model to every basket; transform() adds a 'prediction'
# column alongside the original columns.
pred = fpg_model.transform(cart)
pred.createOrReplaceTempView('pred')

# Select the order id, the items actually bought and the predicted items.
pred_query = 'SELECT order_id, products, prediction FROM pred'
pred_1 = spark.sql(pred_query)

# Bring the result to pandas and show the first rows.
pred_pd = pred_1.toPandas()
pred_pd.head()

Unnamed: 0,order_id,products,prediction
0,1342,"[Raw Shrimp, Seedless Cucumbers, Versatile Stain Remover, Organic Strawberries, Organic Mandarins, Chicken Apple Sausage, Pink Lady Apples, Bag of Organic Bananas]","[Strawberries, Organic Tomato Basil Pasta Sauce, Total 0% Greek Yogurt, Hass Avocado, Yellow Onions, Clementines, Organic Garnet Sweet Potato (Yam), Organic Diced Tomatoes, Organic Russet Potato, Extra Virgin Olive Oil, Organic Brown Rice, Organic Sticks Low Moisture Part Skim Mozzarella String Cheese, Organic Tomato Cluster, Organic Thompson Seedless Raisins, Uncured Hickory Smoked Sunday Bacon, Yellow Bell Pepper, Organic Ginger Root, Organic Large Grade A Brown Eggs, Organic Coconut Milk, Organic Black Beans, Organic Baby Carrots, Roasted Turkey Breast, Organic Egg Whites, Green Beans, Organic Whole String Cheese, Organic Leek, Organic Red Radish, Bunch, Honeycrisp Apple, Organic Cheese Frozen Pizza, Organic Hothouse Cucumbers, Green Bell Pepper, Organic Reduced Fat 2% Milk, Organic Cilantro, Organic Butternut Squash, Sweet Onion, Shredded Parmesan, Carrots, Organic Shredded Carrots, Organic Romaine Lettuce, Organic Navel Orange, Air Chilled Organic Boneless Skinless Chicken Breasts, Organic Baby Spinach, Shredded Mild Cheddar Cheese, Organic Baby Rainbow Carrots, 100% Raw Coconut Water, Organic Peeled Whole Baby Carrots, Celery Hearts, Organic Riced Cauliflower, Organic Romaine, Lime Sparkling Water, Large Lemon, Organic Honey Sweet Whole Wheat Bread, Organic Creamy Peanut Butter, Grape White/Green Seedless, Organic Lemon, Original Veggie Straws, Sustainably Soft Bath Tissue, Brussels Sprouts, Original Pure Creamy Almond Milk, Roma Tomato, Organic Avocado, Lemon Hummus, Sparkling Water Grapefruit, Fresh Ginger Root, Half & Half, Organic Shredded Mozzarella, Total 0% Nonfat Greek Yogurt, Red Raspberries, Raspberries, Organic Baby Spinach Salad, Oven Roasted Turkey Breast, Vanilla Almond Breeze Almond Milk, Cucumber Kirby, Baby Spinach, Garlic, Organic Hass Avocado, Trilogy Kombucha Drink, 
Marinara Sauce, Sharp Cheddar Cheese, Organic Italian Parsley Bunch, Crackers Cheddar Bunnies Snack Packs, Organic Fat Free Milk, Organic Garbanzo Beans, Bunched Cilantro, Super Greens Salad, Lightly Salted Baked Snap Pea Crisps, Clementines, Bag, Organic Fuji Apples, Organic Fuji Apple, Organic Large Brown Grade AA Cage Free Eggs, Jalapeno Peppers, Limes, Whole Milk Plain Yogurt, Organic Frozen Peas, Pure Irish Butter, Small Hass Avocado, Sea Salt Pita Chips, No Salt Added Black Beans, Organic Red Bell Pepper, Pretzel Crisps Original Deli Style Pretzel Crackers, ...]"
1,1591,"[Cracked Wheat, Strawberry Rhubarb Yoghurt, Organic Bunny Fruit Snacks Berry Patch, Goodness Grapeness Organic Juice Drink, Honey Graham Snacks, Spinach, Granny Smith Apples, Oven Roasted Turkey Breast, Pure Vanilla Extract, Chewy 25% Low Sugar Chocolate Chip Granola, Banana, Original Turkey Burgers Smoke Flavor Added, Twisted Tropical Tango Organic Juice Drink, Navel Oranges, Lower Sugar Instant Oatmeal Variety, Ultra Thin Sliced Provolone Cheese, Natural Vanilla Ice Cream, Cinnamon Multigrain Cereal, Garlic, Goldfish Pretzel Baked Snack Crackers, Original Whole Grain Chips, Medium Scarlet Raspberries, Lemon Yogurt, Original Patties (100965) 12 Oz Breakfast, Nutty Bars, Strawberry Banana Smoothie, Green Machine Juice Smoothie, Coconut Dreams Cookies, Buttermilk Waffles, Uncured Genoa Salami, Organic Greek Whole Milk Blended Vanilla Bean Yogurt]","[Blueberry Yoghurt, Organic Tomato Basil Pasta Sauce, Total 0% Greek Yogurt, Hass Avocado, Yellow Onions, Total 2% Lowfat Plain Greek Yogurt, Pure Coconut Water, Organic 2% Reduced Fat Milk, Raisin Bran Cereal, Organic Garnet Sweet Potato (Yam), Organic Russet Potato, Extra Virgin Olive Oil, Hint Of Sea Salt Almond Nut Thins, Total 2% Greek Strained Yogurt with Cherry 5.3 oz, Organic Sticks Low Moisture Part Skim Mozzarella String Cheese, Organic Tomato Cluster, Frozen Broccoli Florets, Uncured Hickory Smoked Sunday Bacon, Yellow Bell Pepper, Organic Ginger Root, Total 2% Lowfat Greek Strained Yogurt With Blueberry, Organic Large Grade A Brown Eggs, Organic Coconut Milk, Grade A Large Eggs Cage Free Omega 3, Organic Black Beans, Organic Baby Carrots, Diced Tomatoes, Iceberg Lettuce, Roasted Turkey Breast, Organic Egg Whites, Green Beans, Organic Whole String Cheese, Grade A Large White Eggs, Organic Strawberries, Organic Red Radish, Bunch, 2% Reduced Fat Milk, Honeycrisp Apple, Organic Hothouse Cucumbers, Marinara Pasta Sauce, Whole Milk, Green Bell Pepper, Organic Green Leaf Lettuce, 2% Reduced Fat Organic Milk, 
Organic Reduced Fat 2% Milk, Organic Cilantro, European Cucumber, Organic Butternut Squash, Shredded Parmesan, Carrots, Organic Shredded Carrots, Organic Romaine Lettuce, Organic Navel Orange, Organic Baby Spinach, Shredded Mild Cheddar Cheese, Organic Baby Rainbow Carrots, 100% Raw Coconut Water, Organic Peeled Whole Baby Carrots, Citrus Mandarins Organic, Organic Riced Cauliflower, Organic Romaine, Lime Sparkling Water, Large Lemon, Organic Creamy Peanut Butter, Cherubs Heavenly Salad Tomatoes, Grape White/Green Seedless, Organic Lemon, Sparkling Water Berry, Original Veggie Straws, Brussels Sprouts, Mandarin Oranges, Roma Tomato, Organic Avocado, Total 2% Lowfat Greek Strained Yogurt with Peach, Vanilla Skyr Nonfat Yogurt, Eggo Homestyle Waffles, Sour Cream, Sparkling Water Grapefruit, Macaroni & Cheese, Peach Pear Flavored Sparkling Water, Fresh Ginger Root, Half & Half, Total 0% Nonfat Greek Yogurt, Red Raspberries, Total 2% All Natural Low Fat 2% Milkfat Greek Strained Yogurt, Raspberries, Strawberry Preserves, Organic Baby Spinach Salad, Vanilla Almond Breeze Almond Milk, Cucumber Kirby, Original No Pulp 100% Florida Orange Juice, Baby Spinach, Mini Original Babybel Cheese, Organic Hass Avocado, Marinara Sauce, Vine Ripe Tomatoes, Sharp Cheddar Cheese, Organic Italian Parsley Bunch, Organic Fat Free Milk, Organic Garbanzo Beans, Bunched Cilantro, ...]"
2,4519,[Beet Apple Carrot Lemon Ginger Organic Cold Pressed Juice Beverage],[]
3,4935,[Vodka],[]
4,6357,"[Globe Eggplant, Panko Bread Crumbs, Fresh Mozzarella Ball, Grated Parmesan, Gala Apples, Italian Pasta Sauce Basilico Tomato, Basil & Garlic, Organic Basil, Banana, Provolone]","[Organic Tomato Basil Pasta Sauce, Total 0% Greek Yogurt, Hass Avocado, Yellow Onions, Total 2% Lowfat Plain Greek Yogurt, Pure Coconut Water, Organic 2% Reduced Fat Milk, Raisin Bran Cereal, Organic Garnet Sweet Potato (Yam), Organic Russet Potato, Extra Virgin Olive Oil, Hint Of Sea Salt Almond Nut Thins, Total 2% Greek Strained Yogurt with Cherry 5.3 oz, Organic Sticks Low Moisture Part Skim Mozzarella String Cheese, Organic Tomato Cluster, Frozen Broccoli Florets, Uncured Hickory Smoked Sunday Bacon, Yellow Bell Pepper, Organic Ginger Root, Total 2% Lowfat Greek Strained Yogurt With Blueberry, Organic Large Grade A Brown Eggs, Organic Coconut Milk, Grade A Large Eggs Cage Free Omega 3, Organic Black Beans, Organic Baby Carrots, Diced Tomatoes, Iceberg Lettuce, Roasted Turkey Breast, Organic Egg Whites, Green Beans, Organic Whole String Cheese, Grade A Large White Eggs, Organic Strawberries, Organic Red Radish, Bunch, 2% Reduced Fat Milk, Honeycrisp Apple, Organic Hothouse Cucumbers, Marinara Pasta Sauce, Whole Milk, Green Bell Pepper, Organic Green Leaf Lettuce, 2% Reduced Fat Organic Milk, Organic Reduced Fat 2% Milk, Organic Cilantro, European Cucumber, Organic Butternut Squash, Shredded Parmesan, Carrots, Organic Shredded Carrots, Organic Romaine Lettuce, Organic Navel Orange, Organic Baby Spinach, Shredded Mild Cheddar Cheese, Organic Baby Rainbow Carrots, 100% Raw Coconut Water, Organic Peeled Whole Baby Carrots, Citrus Mandarins Organic, Organic Riced Cauliflower, Organic Romaine, Lime Sparkling Water, Large Lemon, Organic Creamy Peanut Butter, Cherubs Heavenly Salad Tomatoes, Grape White/Green Seedless, Organic Lemon, Sparkling Water Berry, Original Veggie Straws, Brussels Sprouts, Mandarin Oranges, Roma Tomato, Organic Avocado, Total 2% Lowfat Greek Strained Yogurt with 
Peach, Vanilla Skyr Nonfat Yogurt, Eggo Homestyle Waffles, Sour Cream, Sparkling Water Grapefruit, Macaroni & Cheese, Peach Pear Flavored Sparkling Water, Fresh Ginger Root, Half & Half, Total 0% Nonfat Greek Yogurt, Red Raspberries, Total 2% All Natural Low Fat 2% Milkfat Greek Strained Yogurt, Raspberries, Strawberry Preserves, Organic Baby Spinach Salad, Oven Roasted Turkey Breast, Vanilla Almond Breeze Almond Milk, Cucumber Kirby, Original No Pulp 100% Florida Orange Juice, Baby Spinach, Mini Original Babybel Cheese, Garlic, Organic Hass Avocado, Marinara Sauce, Vine Ripe Tomatoes, Sharp Cheddar Cheese, Organic Italian Parsley Bunch, Organic Fat Free Milk, Organic Garbanzo Beans, ...]"


### Conclusion
- Based on the FP-Growth model, we found that customers tend to buy `Organic Bananas` when they have bought other organic items. This is evident from the confidence of the top association rules, which is above 50%.
- From the result above, which compares the items actually purchased with the predicted items, we can see that `Organic Bananas` are present.
- This prediction model is the best one so far, since it provides the customer with a full list of recommended items, but its accuracy may turn out to be lower than that of the other models when it is measured.