In [None]:
!pip install pyspark
!pip install pyngrok

In [None]:
import os
import time

import numpy as np
import pyspark
from pyngrok import ngrok
from pyspark.sql import SparkSession , Window
from pyspark.sql import functions as F


In [None]:
# Build (or reuse) the Spark session for this analysis, with the driver memory set to 25g.
spark = (
    SparkSession.builder
    .appName("instamart_analysis")
    .config("spark.driver.memory", "25g")
    .getOrCreate()
)


In [None]:
def show_time(start):
    """Return the number of seconds elapsed since ``start``.

    Args:
        start: A timestamp in seconds, as returned by ``time.time()``.

    Returns:
        ``time.time() - start``.
    """
    elapsed = time.time() - start
    return elapsed

In [44]:
# Folder holding the Instacart CSV files; change this one constant to relocate the data.
INSTACART_DIR = "/kaggle/input/instacart-market-basket-analysis"


def read_instacart_csv(file_name, num_partitions=None):
    """Load one Instacart CSV file as a Spark DataFrame.

    The file is read with a header row and an inferred schema.

    Args:
        file_name: File name inside ``INSTACART_DIR``, e.g. ``"orders.csv"``.
        num_partitions: When given, the frame is repartitioned into this many
            partitions; when ``None`` the reader's own partitioning is kept.

    Returns:
        The loaded (and optionally repartitioned) DataFrame.
    """
    frame = spark.read.options(header=True, inferSchema=True).csv(INSTACART_DIR + "/" + file_name)
    if num_partitions is None:
        return frame
    return frame.repartition(num_partitions)


departments_df = read_instacart_csv("departments.csv")
products_df = read_instacart_csv("products.csv")
prior_product_orders = read_instacart_csv("order_products__prior.csv", num_partitions=12)
train_product_orders = read_instacart_csv("order_products__train.csv", num_partitions=8)
orders_df = read_instacart_csv("orders.csv", num_partitions=8)
# The variable name `aisels_df` (sic) is kept so that any cell referring to it keeps working.
aisels_df = read_instacart_csv("aisles.csv")


[Stage 131:>                                                        (0 + 4) / 5]

KeyboardInterrupt: 



In [None]:
# Create a tunnel to the Spark UI on local port 4040.
# The ngrok auth token is a credential, so it is read from the NGROK_AUTH_TOKEN
# environment variable rather than stored in the notebook. Any token that was ever
# committed in a notebook should be revoked from the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTH_TOKEN")
if ngrok_token:  # Optional: only configured when a token is provided
    ngrok.set_auth_token(ngrok_token)
tunnel = ngrok.connect(4040)
print("Ngrok tunnel \"{}\" -> \"http://localhost:4040\"".format(tunnel.public_url))


In [None]:
# Print the column names and inferred types of the prior order/product rows.
prior_product_orders.printSchema()

In [None]:
# Print the column names and inferred types of the orders table.
orders_df.printSchema()

In [None]:
# Mark the orders table for caching: it is filtered twice (train / prior) in the next cell.
orders_df.cache()

In [None]:
# Split the orders table by evaluation set; the eval_set column is dropped afterwards.
eval_set = F.col("eval_set")
train_orders_df = orders_df.where(eval_set == "train").drop("eval_set")
prior_orders_df = orders_df.where(eval_set == "prior").drop("eval_set")

# Mark the four frames used by the feature cells for caching.
for frame in (train_orders_df, train_product_orders, prior_orders_df):
    frame.cache()
prior_product_orders.cache()  # kept as the last expression so the cell output is unchanged

In [None]:
# How often each user has reordered: the number of purchased items flagged as a reorder.
# `reordered` is assumed to be a 0/1 flag (confirm with printSchema), so it is summed.
# F.count() would count every non-null row, i.e. all items the user bought,
# regardless of whether they were reorders.
(
    prior_product_orders.select("reordered", "order_id")
    .join(prior_orders_df.select("user_id", "order_id"), on="order_id", how="left")
    .groupBy("user_id")
    .agg(F.sum("reordered").alias("frequency of reorder"))
)

In [None]:
# Time since the previous order, in hours, for every prior order:
#   days_since_prior_order * 24 + (hour of this order - hour of the previous order)
# The previous order's hour comes from lag() over the user's orders sorted by
# order_number; a user's first order has no predecessor, so its value is null.
# The new column is created and selected under the same lower-case name, so the
# cell does not depend on Spark's case-insensitive column resolution.
orders_by_user = Window.partitionBy("user_id").orderBy("order_number")
(
    prior_orders_df
    .select("user_id", "days_since_prior_order", "order_hour_of_day", "order_number", "order_id")
    .withColumn("previous_order_hour", F.lag("order_hour_of_day", 1).over(orders_by_user))
    .withColumn(
        "time_since_last_order",
        F.col("days_since_prior_order") * 24
        + F.col("order_hour_of_day")
        - F.col("previous_order_hour"),
    )
    .select("order_id", "time_since_last_order")
)


In [None]:
# Time of day the user visits: count orders per (user, hour of day), then keep
# each user's largest hourly count.
# NOTE(review): only the count survives the second aggregation, not the hour itself.
orders_per_user_hour = (
    prior_orders_df
    .select("user_id", "order_hour_of_day", "order_id")
    .groupBy("user_id", "order_hour_of_day")
    .agg(F.count("order_id").alias("frequency"))
)
orders_per_user_hour.groupBy("user_id").agg(F.max("frequency").alias("maximum_frquency"))

In [None]:
# Re-check the prior order/product schema before building the
# gluten-free / organic / Asian item feature.
prior_product_orders.printSchema()

In [None]:
# Print the column names and inferred types of the products table.
products_df.printSchema()

In [None]:
# Orders (with their user) containing at least one product whose lower-cased name
# mentions "organic", "asian" or "gluten free".
named_order_products = (
    prior_product_orders.select("order_id", "product_id")
    .join(products_df.select("product_id", "product_name"), on="product_id", how="left")
    .join(prior_orders_df.select("user_id", "order_id"), on="order_id", how="left")
)
product_names_per_order = (
    named_order_products
    .groupBy("user_id", "order_id")
    .agg(F.collect_list("product_name").alias("list_of_products"))
    .withColumn("normalized_list", F.expr("transform(list_of_products, x -> lower(x))"))
)

# One array predicate per keyword; an order qualifies when any of them holds.
has_organic = F.expr("exists(normalized_list,x -> x like '%organic%')")
has_asian = F.expr("exists(normalized_list, x -> x like '%asian%')")
has_gluten_free = F.expr("exists(normalized_list, x-> x like '%gluten free%')")
(
    product_names_per_order
    .withColumn("contains_or_not", has_organic | has_asian | has_gluten_free)
    .where(F.col("contains_or_not") == True)
    .select("user_id", "order_id")
)

In [None]:
# Order-size features: count the products in each order, then summarise those
# counts per user (max, min and mean).
products_per_order = (
    prior_product_orders.select("product_id", "order_id")
    .join(prior_orders_df.select("user_id", "order_id"), on="order_id", how="left")
    .groupBy("user_id", "order_id")
    .agg(F.count(F.col("product_id")).alias("count_of_product"))
)
order_size = F.col("count_of_product")
products_per_order.groupBy("user_id").agg(
    F.max(order_size).alias("max_count_of_products"),
    F.min(order_size).alias("min_count_of_products"),
    F.mean(order_size).alias("mean_count_of_products"),
)

In [None]:
# Flag, per order, whether it contained no previously purchased (reordered) items:
# doesnt_contains_reordered is 1 when no row of the order has reordered == 1, else 0.
# The grouping key is spelled "user_id" like everywhere else in the notebook, so the
# cell does not rely on Spark's case-insensitive column resolution and the output
# column is named consistently.
# NOTE(review): this yields one row per (user, order); to get "how many of the
# user's orders", sum doesnt_contains_reordered grouped by user_id.
(
    prior_product_orders.select("order_id", "reordered")
    .join(prior_orders_df.select("order_id", "user_id"), on="order_id", how="left")
    .groupBy("user_id", "order_id")
    .agg(F.collect_list(F.col("reordered")).alias("reordered_array"))
    .withColumn(
        "doesnt_contains_reordered",
        F.when(F.array_contains("reordered_array", 1), 0).otherwise(1),
    )
)

In [None]:
# How often each product has been purchased: number of prior order rows per product.
order_rows_per_product = F.count(F.col("order_id")).alias("product_count")
(
    prior_product_orders
    .select("product_id", "order_id")
    .groupBy("product_id")
    .agg(order_rows_per_product)
)

In [45]:
# Position of the product: mean add_to_cart_order per product across prior orders.
mean_cart_position = F.mean(F.col("add_to_cart_order")).alias("product_mean_of_position")
(
    prior_product_orders
    .select("product_id", "add_to_cart_order")
    .groupBy("product_id")
    .agg(mean_cart_position)
)

DataFrame[product_id: int, product_mean_of_position: double]

                                                                                

In [55]:
# How many users buy the product as a "one shot" item. Here an order counts as
# "one shot" when it contains exactly one product row.

# Step 1: flag each order as one-shot (1) or not (0).
single_item_flag = F.when(F.size(F.col("list_of_products")) == 1, 1).otherwise(0)
orders_flagged = (
    prior_product_orders.select("order_id", "product_id")
    .groupBy("order_id")
    .agg(F.collect_list("product_id").alias("list_of_products"))
    .withColumn("is_one_shot_order", single_item_flag)
)

# Step 2: back to one row per (order, product), attach the user, and gather the
# order flags seen for every (product, user) pair.
flags_per_product_user = (
    orders_flagged
    .withColumn("product_id", F.explode(F.col("list_of_products")))
    .join(prior_orders_df.select("user_id", "order_id"), on="order_id", how="left")
    .groupBy("product_id", "user_id")
    .agg(F.collect_list(F.col("is_one_shot_order")).alias("is_one_shot_order_list"))
)

# Step 3: a user counts once per product if any of their orders of it was one-shot.
user_bought_one_shot = F.when(F.array_contains("is_one_shot_order_list", 1), 1).otherwise(0)
(
    flags_per_product_user
    .withColumn("has_user_purchased_one_shot", user_bought_one_shot)
    .groupBy("product_id")
    .agg(F.sum(F.col("has_user_purchased_one_shot")).alias("number_of_user_purchased_item"))
)

DataFrame[product_id: int, number_of_user_purchased_item: bigint]

In [None]:
# Stats on the number of items that co-occur with this item
