In [2]:
!pip install pyspark
!pip install pyngrok

Collecting pyspark
  Using cached pyspark-3.5.3.tar.gz (317.3 MB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840629 sha256=46bcf3b6d1916c0ea21fd437388bdfb3689fe3c3232bcd5dc76c0b9aeaddee45
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [3]:
import time
import pyspark
import numpy as np
from pyngrok import ngrok
from pyspark.sql import SparkSession , Window
from pyspark.sql import functions as F


In [4]:
# Create a SparkSession with custom memory settings
spark = SparkSession.builder.appName("instamart_analysis") \
    .config("spark.driver.memory","25g") \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/10 11:08:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
def show_time(start):
    return time.time() - start

In [6]:
departments_df = spark.read.options(header=True,inferSchema=True).csv("/kaggle/input/instacart-market-basket-analysis/departments.csv")
products_df = spark.read.options(header=True,inferSchema=True).csv("/kaggle/input/instacart-market-basket-analysis/products.csv")
prior_product_orders = spark.read.options(header=True,inferSchema=True).csv("/kaggle/input/instacart-market-basket-analysis/order_products__prior.csv").repartition(12)
train_product_orders = spark.read.options(header=True,inferSchema=True).csv("/kaggle/input/instacart-market-basket-analysis/order_products__train.csv").repartition(8)
orders_df = spark.read.options(header=True,inferSchema=True).csv("/kaggle/input/instacart-market-basket-analysis/orders.csv").repartition(8)
aisels_df = spark.read.options(header=True,inferSchema=True).csv("/kaggle/input/instacart-market-basket-analysis/aisles.csv")


                                                                                

In [7]:
# Create a tunnel to the Spark UI
ngrok.set_auth_token('2kvaYw5ZiG5bL8iM8YJBVJPk1Ru_3C16mMgmpKEBYb28PPLUe')  # Optional: set your Ngrok auth token if you have one
tunnel = ngrok.connect(4040)
print("Ngrok tunnel \"{}\" -> \"http://localhost:4040\"".format(tunnel.public_url))


Ngrok tunnel "https://2027-34-147-67-56.ngrok-free.app" -> "http://localhost:4040"                  


In [8]:
prior_product_orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)



In [9]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)



In [10]:
orders_df.cache()

DataFrame[order_id: int, user_id: int, eval_set: string, order_number: int, order_dow: int, order_hour_of_day: int, days_since_prior_order: double]

In [11]:
train_orders_df = orders_df.filter(orders_df["eval_set"] =='train').drop("eval_set")
prior_orders_df = orders_df.filter(orders_df["eval_set"] == 'prior').drop("eval_set")
train_orders_df.cache()
train_product_orders.cache()
prior_orders_df.cache()
prior_product_orders.cache()

DataFrame[order_id: int, product_id: int, add_to_cart_order: int, reordered: int]

In [12]:
# how often user has reorderd
df_with_num_of_reord = (
    prior_product_orders.select("reordered","order_id").join(
        prior_orders_df.select("user_id","order_id"),how="left",on="order_id"
    ).select("user_id","reordered") 
     .groupBy("user_id").agg(
        F.count(F.col("reordered")).alias("frequency of reorder")
    )
)

In [13]:
# time since privious order

df_with_time_since_prev_ord = (
    prior_orders_df.select("user_id","days_since_prior_order","order_hour_of_day","order_number","order_id") 
                .withColumn("privious_order_hour",
                            F.lag("order_hour_of_day",1) 
                            .over(Window.partitionBy("user_id").orderBy("order_number"))) 
                .withColumn("time_since_Last_order",
                            F.col("days_since_prior_order") * 24 + 
                            F.col("order_hour_of_day") - 
                            F.col("privious_order_hour") 
                           ) 
                .select("order_id","time_since_last_order")
)


In [14]:
#time of the day user visits

df_with_time_of_day_usr_visits = (
    prior_orders_df.select("user_id" , "order_hour_of_day","order_id") 
                .groupBy("user_id","order_hour_of_day") 
                .agg(F.count("order_id").alias("frequency")) 
                .groupBy("user_id") 
                .agg(F.max("frequency").alias("maximum_frquency"))
)

In [44]:
# does the user have ordered asian , gluten free, or organic item 

df_with_does_usr_asian_gluten_orga_items_ord = (
    prior_product_orders.select("order_id","product_id") 
            .join(products_df.select("product_id","product_name"), on="product_id", how='left') 
            .join(prior_orders_df.select("user_id","order_id"), on="order_id", how='left') 
            .groupBy("user_id", "order_id") 
            .agg(F.collect_list("product_name").alias("list_of_products")) 
            .withColumn("normalized_list", F.expr("transform(list_of_products, x -> lower(x))")) 
            .withColumn("contains_or_not", 
                F.expr("exists(normalized_list,x -> x like '%organic%')")
              | F.expr("exists(normalized_list, x -> x like '%asian%')")
              | F.expr("exists(normalized_list, x-> x like '%gluten free%')")
            ) 
            .select("order_id","contains_or_not")
)

In [16]:
# feature based on order size 

df_with_fets_of_ord_size = (
    prior_product_orders.select("product_id","order_id") 
                    .join(prior_orders_df.select("user_id","order_id") , on="order_id", how="left") 
                    .groupBy("user_id",'order_id') 
                    .agg(
                            F.count(F.col("product_id")).alias("count_of_product")
                        ) 
                    .groupBy("user_id") 
                    .agg(
                            F.max(F.col("count_of_product")).alias("max_count_of_products"),
                            F.min(F.col("count_of_product")).alias("min_count_of_products"),
                            F.mean(F.col("count_of_product")).alias("mean_count_of_products")
                        ) 
)

In [46]:
# How many of the user’s orders contained no previously purchased items

df_with_freq_ord_that_hasnt_prev_purch_items = (
    prior_product_orders.select("order_id","reordered") 
                    .join(prior_orders_df.select("order_id","user_id") , on = 'order_id' , how = 'left') 
                    .groupBy("user_Id","order_id") 
                    .agg(
                            F.collect_list(F.col("reordered")).alias("reordered_array")
                        ) 
                    .withColumn("doesnt_contains_reordered" ,
                            F.when(F.array_contains("reordered_array",1),0).otherwise(1)
                        ) 
                    .select("order_id","doesnt_contains_reordered")
)

In [18]:
# how often the item has been purchaced 

df_with_freq_purch= (
    prior_product_orders.select("product_id","order_id") 
                     .groupBy("product_id") 
                     .agg(
                             F.count(F.col("order_id")).alias("product_count")
                        ) 
)

In [19]:
# position of product 

df_with_avg_position_of_prod = (
    prior_product_orders.select("product_id","add_to_cart_order") 
                    .groupBy("product_id") 
                    .agg(
                            F.mean(F.col("add_to_cart_order")).alias("product_mean_of_position")
                        ) 
)

In [20]:
# How many users buy it as "one shot" item

df_with_freq_one_shot_ord_prods = (
    prior_product_orders.select("order_id","product_id") 
                    .groupBy("order_id") 
                    .agg(F.collect_list("product_id").alias("list_of_products")) 
                    .withColumn("is_one_shot_order",
                                   F.when(F.size(F.col("list_of_products")) == 1,1).otherwise(0)
                               ) 
                    .withColumn("product_id",F.explode(F.col("list_of_products"))) 
                    .join(prior_orders_df.select("user_id","order_id"),on="order_id",how='left') 
                    .groupBy("product_id","user_id") 
                    .agg(F.collect_list(F.col("is_one_shot_order")).alias("is_one_shot_order_list")) 
                    .withColumn("has_user_purchased_one_shot",F.when(F.array_contains("is_one_shot_order_list",1),1).otherwise(0)) 
                    .groupBy("product_id") 
                    .agg(
                            F.sum(F.col("has_user_purchased_one_shot")).alias("number_of_user_purchased_item")
                        ) 
)

In [21]:
# Stats on the number of items that co-occur with this item

# 1. number of time that a item has co occured.

# Perform a self-join on prior_product_orders
df_with_freq_co_ocrd = (
    prior_product_orders
    .select("product_id", "order_id")
    .alias("df1")
    .join(
        prior_product_orders.select("product_id", "order_id")
        .withColumnRenamed("product_id", "product_id_1")
        .alias("df2"),
        (F.col("df1.order_id") == F.col("df2.order_id")) & (F.col("df1.product_id") != F.col("df2.product_id_1")),
        "left"
    )
    .groupBy("df1.product_id")
    .agg(F.count(F.col("df2.product_id_1")).alias("number_of_product_co_occurred"))
)

# 2 average number of items that is co ocuured with this item in single order

df_with_avg_num_item_co_ocrd_in_ord = (
                prior_product_orders.select("product_id","order_id").alias("ppo1") 
                .join(
                    prior_product_orders.select("product_id","order_id")
                    .alias("ppo2"),
                    (F.col("ppo1.order_id") == F.col("ppo2.order_id")) & 
                    (F.col("ppo1.product_id") != F.col("ppo2.product_id")),
                    how='left'
                ) 
                .groupBy("ppo1.product_id","ppo1.order_id")
                .agg(F.count(F.col("ppo2.product_id")).alias("count_of_co_ocuured_product_per_order"))
                .groupBy("ppo1.product_id")
                .agg(
                    F.mean(F.col("count_of_co_ocuured_product_per_order")).alias("mean_of_co_ocuured_product_per_order"),
                    F.min(F.col("count_of_co_ocuured_product_per_order")).alias("min_of_co_ocuured_product_per_order"),
                    F.max(F.col("count_of_co_ocuured_product_per_order")).alias("max_of_co_ocuured_product_per_order"),

                )
)


In [22]:
# Stats on the order streak

# 1. let's add the flag whether streak is continued or not

df_with_flag= (

    prior_product_orders.select("product_id","order_id")
                        .join(
                                prior_orders_df.select("user_id","order_number","order_id"),
                                how ='left',
                                on = 'order_id' 
                            )
                        .withColumn("next_order_number",
                            F.lead(F.col("order_number"),1).over(Window.partitionBy("user_id","product_id").orderBy("order_number"))
                        )
                        .withColumn("is_streak_continued_flag",
                               F.when(F.col("next_order_number") - F.col("order_number") == 1,1)
                                    .otherwise(0)
                            )
)
# 2. let's assign an unique id to each streak of a perticular user and product.

w1 = Window.partitionBy("user_id","product_id").orderBy("order_number")
w2 = Window.partitionBy("user_id","product_id","is_streak_continued_flag").orderBy("order_number")

# by using the above window we can create unique id for streak named grp then can find streak leangth.
df_with_streak_length = (
    df_with_flag.withColumn("grp",F.row_number().over(w1) - F.row_number().over(w2))
                .groupBy("user_id","product_id","grp")
                .agg(
                    F.count("order_number").alias("length_of_streaks")
                )
)

# finally , summarize it over each prodcut rather than per user per product.
df_with_stats_of_streaks = (
    df_with_streak_length.select("product_id","length_of_streaks","grp")
                         .groupBy("product_id")
                         .agg(
                             F.count('grp').alias("Total_streak_of_this_product"),
                             F.mean("length_of_streaks").alias("mean_of_streaks_of_this_product"),
                             F.min("length_of_streaks").alias("max_of_streaks_of_this_product"),
                             F.max("length_of_streaks").alias("min_of_streaks_of_this_product")
                         
                         )
)


In [23]:
# Probability of being reordered within N orders

# we have already counted the lenght of the streaks so if it is >= 5 then it will be added in probability.

df_with_prob_greater_5 = (
    df_with_streak_length.withColumn("is_streak_length_greater_than_5",
                                        F.when(F.col("length_of_streaks") >= 5,1).otherwise(0) 
                                    )
                         .groupBy("product_id")
                         .agg(
                             F.count("length_of_streaks").alias("total_streaks"),
                             F.sum("is_streak_length_greater_than_5").alias("total_streaks_greater_than_5")
                         )
                         .withColumn("prob_of_reordered_5",
                             ( F.col("total_streaks_greater_than_5") / F.col("total_streaks"))
                         )
                         .select("product_id","prob_of_reordered_5")
)

In [24]:
# we have already counted the lenght of the streaks so if it is >= 2 then it will be added in probability.

df_with_prob_greater_2 = (
    df_with_streak_length.withColumn("is_streak_length_greater_than_2",
                                        F.when(F.col("length_of_streaks") >= 2,1).otherwise(0) 
                                    )
                         .groupBy("product_id")
                         .agg(
                             F.count("length_of_streaks").alias("total_streaks"),
                             F.sum("is_streak_length_greater_than_2").alias("total_streaks_greater_than_2")
                         )
                         .withColumn("prob_of_reordered_2",
                             ( F.col("total_streaks_greater_than_2") / F.col("total_streaks"))
                         )
                         .select("product_id","prob_of_reordered_2")
)

In [25]:
# we have already counted the lenght of the streaks so if it is >= 3 then it will be added in probability.

df_with_prob_greater_3 = (
    df_with_streak_length.withColumn("is_streak_length_greater_than_3",
                                        F.when(F.col("length_of_streaks") >= 3,1).otherwise(0) 
                                    )
                         .groupBy("product_id")
                         .agg(
                             F.count("length_of_streaks").alias("total_streaks"),
                             F.sum("is_streak_length_greater_than_3").alias("total_streaks_greater_than_3")
                         )
                         .withColumn("prob_of_reordered_3",
                             ( F.col("total_streaks_greater_than_3") / F.col("total_streaks"))
                         )
                         .select("product_id","prob_of_reordered_3")
)



In [64]:
# Distribution of the day of week it is ordered
pivoted_prior_orders_df = (
    prior_orders_df.select("order_id","order_dow")
                    .groupBy("order_id")
                    .pivot("order_dow")
                    .agg(F.lit(1)).na.fill(0)
)
            
df_with_count_of_dow = (
    prior_product_orders.select("order_id","product_id")
                            .join(
                                pivoted_prior_orders_df , on = "order_id",how='left'
                            )
                            .groupBy("product_id")
                            .agg(
                                F.sum("0").alias("count_of_dow_0"),
                                F.sum("1").alias("count_of_dow_1"),
                                F.sum("2").alias("count_of_dow_2"),
                                F.sum("3").alias("count_of_dow_3"),
                                F.sum("4").alias("count_of_dow_4"),
                                F.sum("5").alias("count_of_dow_5"),
                                F.sum("6").alias("count_of_dow_6")
                            )
)


                                                                                

In [27]:
#  Probability it is reordered after the first order
total_orders = prior_orders_df.select("order_id").distinct().count()

df_with_prob_reord = (
    prior_orders_df.select("order_id","user_id")
                    .join(prior_product_orders.select("product_id","order_id"),on="order_id",how='left')
                    .groupBy("product_id","user_id")
                    .agg(
                        F.count("order_id").alias("order_count")
                    )
                    .groupBy("product_id")
                    .agg(
                        ( 
                            (F.sum("order_count") / total_orders).alias("prob_of_being_reordered") 
                        )
                    )
)

                                                                                

In [28]:
# Number of orders in which the user purchases the item

df_with_num_of_order_p_product = (
    
    prior_product_orders.select("order_id","product_id")
                        .join(
                            prior_orders_df.select("order_id","user_id")
                            , how = 'left' , on = 'order_id'
                        )
                        .groupBy("user_id","product_id")
                        .agg(
                            F.count("order_id").alias("num_of_ord_purch_p_prod")
                        )
)

In [29]:
# # Days since the user last purchased the item

# w1 = Window.partitionBy("user_id","product_id").orderBy("order_number")
# df_with_next_order_p_prod = (
#     prior_product_orders.select("product_id","order_id")
#                         .join(
#                             prior_orders_df.select("user_id","order_id","order_number","days_since_prior_order")
#                                             .groupBy("user_id")
#                                             .agg(
#                                                 F.collect_list("days_since_prior_order").alias("list_of_days_since_prior_ord"),
#                                                 F.collect_list("order_number").alias("list_of_order_number")
#                                             )
#                             ,
#                             how='left',on='order_id'
#                         )
#                         .withColumn("pre_order_number",
#                             F.lag(F.col("order_number")).over(w1)
                                
#                         ).na.fill({'pre_order_number':0})
#                         .sort("user_id","product_id","order_number")
#                         .show()
                        
# )

# w2 = Window.partitionBy("user_id").orderBy("order_number") 
#                         .rowsBetween(
#                             F.when(F.col("pre_order_number") == 0,Window.unboundedPreceding).otherwise(F.col("pre_order_number")) ,
#                             F.col("order_number")
#                         )

# df_with_days_since_last_ord_p_prod = (
#     df_with_next_order_p_prod.join(
#                                 prior_orders_df.select("user_id","days_since_prior_orders"),
#                                 how='left',on='user_id'
#                             ).sort("user_id","order")
#                             .groupBy("user_id")
#                             .withColum("sum_day_since_last_order",
#                                 F.sum("days_since_prior_order").over(w2)
#                             )
# )

In [74]:
# Position in the cart
df_with_position_cart_p_usr_p_prod = (
    prior_product_orders.select("product_id","add_to_cart_order","order_id") 
                    .join(
                        prior_orders_df.select("user_id","order_id"),
                        how = 'left' , on = 'order_id'
                    )
                    .groupBy("user_id","product_id") 
                    .agg(
                            F.mean(F.col("add_to_cart_order")).alias("prod_mean_of_position_p_user")
                        )
)

In [85]:
# Co-occurrence statistics

df_with_co_ocrd_stats_p_user_p_prod = (
    prior_product_orders
    .select("product_id", "order_id")
    .alias("df1")
    .join(prior_orders_df.select("user_id","order_id"),
         on = 'order_id',how='left'
         )
    .join(
        prior_product_orders.select("product_id", "order_id")
        .withColumnRenamed("product_id", "product_id_1")
        .alias("df2"),
        (F.col("df1.order_id") == F.col("df2.order_id")) & (F.col("df1.product_id") != F.col("df2.product_id_1")),
        "left"
    )
    .groupBy("user_id","df1.product_id")
    .agg(
        F.count(F.col("df2.product_id_1")).alias("num_of_prod_co_ocrd_p_usr_p_prod"),
    )
)


In [92]:
#Counts by day of wee

df_with_count_of_dow = (
        prior_orders_df.select("order_id","order_dow")
                        .groupBy("order_id")
                        .pivot("order_dow")
                        .agg(F.lit(1))
                        .na.fill(0)
                        .agg(*
                              [
                                F.sum(f"{i}").alias(f"count_dow_{i}") for i in range(7)
                                ]
                            )
)

#Counts by hour

df_with_count_of_ohod = (
        prior_orders_df.select("order_id","order_hour_of_day")
                        .groupBy("order_id")
                        .pivot("order_hour_of_day")
                        .agg(F.lit(1))
                        .na.fill(0)
                        .agg(*
                              [
                                F.sum(f"{i}").alias(f"count_ohod_{i}") for i in range(24)
                                ]
                            )
)

                                                                                

In [49]:
result_df = (
    prior_orders_df.join(
        df_with_num_of_reord , on = "user_id" ,how = 'left'
    )
    .join(
        df_with_time_since_prev_ord , on = "order_id" , how = "left"
    )
    .join(
        df_with_does_usr_asian_gluten_orga_items_ord , on = "order_id" , how = 'left'
    )
    .join(
        df_with_fets_of_ord_size , on = 'user_id' , how = 'left'
    )
    .join(
        df_with_freq_ord_that_hasnt_prev_purch_items , on = "order_id",how='left'
    )
)
result_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- frequency of reorder: long (nullable = true)
 |-- time_since_last_order: double (nullable = true)
 |-- contains_or_not: boolean (nullable = true)
 |-- max_count_of_products: long (nullable = true)
 |-- min_count_of_products: long (nullable = true)
 |-- mean_count_of_products: double (nullable = true)
 |-- doesnt_contains_reordered: integer (nullable = true)



In [52]:
result_df.cache()

DataFrame[order_id: int, user_id: int, order_number: int, order_dow: int, order_hour_of_day: int, days_since_prior_order: double, frequency of reorder: bigint, time_since_last_order: double, contains_or_not: boolean, max_count_of_products: bigint, min_count_of_products: bigint, mean_count_of_products: double, doesnt_contains_reordered: int]

In [53]:
result_df.count()

                                                                                

3214874

In [67]:
result_product_df = (
    df_with_avg_position_of_prod
    .join(
        df_with_freq_one_shot_ord_prods , on = 'product_id' , how = 'left'
    )
    .join(
        df_with_freq_co_ocrd , on = "product_id" , how = 'left'
    )
    .join(
        df_with_avg_num_item_co_ocrd_in_ord , df_with_avg_num_item_co_ocrd_in_ord["ppo1.product_id"] == prior_product_orders["product_id"] , how ="left"
    )
    .join(
        df_with_stats_of_streaks , on = 'product_id' , how = 'left'
    )
    .join(
        df_with_prob_greater_5 , on = 'product_id' , how = "left"
    )
    .join(
        df_with_prob_greater_3 , on = 'product_id' , how = "left"
    )
    .join(
        df_with_prob_greater_2 , on = 'product_id' , how = "left"
    )
    .join(
        df_with_count_of_dow , on = 'product_id', how = 'left'
    )
    .join(
        df_with_prob_reord , on = 'product_id' , how = 'left'
    )
    .join(
        df_with_num_of_order_p_product , on = 'product_id' , how = 'left'
    )
)

result_product_df.cache()
result_product_df.count()

                                                                                

13307953

In [87]:
result_usr_prod_df = (
    prior_product_orders
    .withColumnRenamed("product_id","product_id_p")
    .alias("ppo")
    .join(
        prior_orders_df.select("user_id","order_id").withColumnRenamed("user_id","user_id_p").alias("pod") , on = 'order_id' , how = 'left'
    )
    .join(
        df_with_num_of_order_p_product ,
            (F.col("pod.user_id_p") == df_with_num_of_order_p_product['user_id']) &
            (F.col("ppo.product_id_p") == df_with_num_of_order_p_product['product_id']) 
        , how = 'left'
    ).drop("user_id","product_id")
    .join(
        df_with_position_cart_p_usr_p_prod ,
        (df_with_position_cart_p_usr_p_prod["user_id"] == F.col("pod.user_id_p")) &
        (df_with_position_cart_p_usr_p_prod["product_id"] == F.col("ppo.product_id_p"))
        ,how = 'left'
    ).drop("user_id","product_id")
    .join(
        df_with_co_ocrd_stats_p_user_p_prod ,
        (df_with_co_ocrd_stats_p_user_p_prod["user_id"] == F.col("pod.user_id_p") ) &
        (df_with_co_ocrd_stats_p_user_p_prod["product_id"] == F.col("ppo.product_id_p"))
        , how = 'left'
    ).drop("user_id","product_id")
)

result_usr_prod_df.cache()
result_usr_prod_df.count()


                                                                                

32434489

In [94]:
result_df_with_time_df = (
    result_df.crossJoin(
        F.broadcast(df_with_count_of_dow)
    )
    .crossJoin(
        F.broadcast(df_with_count_of_ohod)
    )
)
result_df_with_time_df.cache()
result_df_with_time_df.count()

                                                                                

3214874

In [96]:
result_product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_mean_of_position: double (nullable = true)
 |-- number_of_user_purchased_item: long (nullable = true)
 |-- number_of_product_co_occurred: long (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- mean_of_co_ocuured_product_per_order: double (nullable = true)
 |-- min_of_co_ocuured_product_per_order: long (nullable = true)
 |-- max_of_co_ocuured_product_per_order: long (nullable = true)
 |-- Total_streak_of_this_product: long (nullable = true)
 |-- mean_of_streaks_of_this_product: double (nullable = true)
 |-- max_of_streaks_of_this_product: long (nullable = true)
 |-- min_of_streaks_of_this_product: long (nullable = true)
 |-- prob_of_reordered_5: double (nullable = true)
 |-- prob_of_reordered_3: double (nullable = true)
 |-- prob_of_reordered_2: double (nullable = true)
 |-- count_of_dow_0: long (nullable = true)
 |-- count_of_dow_1: long (nullable = true)
 |-- count_of_dow_2: long (nullable = true)
 |-- count_

In [97]:
result_usr_prod_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id_p: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)
 |-- user_id_p: integer (nullable = true)
 |-- num_of_ord_purch_p_prod: long (nullable = true)
 |-- prod_mean_of_position_p_user: double (nullable = true)
 |-- num_of_prod_co_ocrd_p_usr_p_prod: long (nullable = true)



In [109]:
final_prior_ord_train_df = (
    result_usr_prod_df.join(
        result_df_with_time_df.drop("user_id"),
        on = "order_id",how='left'
    )
    .join(
        result_product_df.drop("user_id") ,
        (F.col("product_id_p") == result_product_df['ppo1.product_id'])
        , how = 'left'
    ).drop("product_id")
)
final_prior_ord_train_df.cache()
final_prior_ord_train_df.count()

24/10/10 13:07:23 WARN CacheManager: Asked to cache already cached data.
24/10/10 13:08:42 WARN BlockManager: Putting block rdd_932_0 failed due to exception org.apache.spark.TaskKilledException.
24/10/10 13:08:42 WARN BlockManager: Block rdd_932_0 could not be removed as it was not found on disk or in memory
24/10/10 13:08:42 WARN TaskSetManager: Lost task 0.0 in stage 909.0 (TID 3237) (8da198fe9dfc executor driver): TaskKilled (Stage cancelled: Job 176 cancelled )
24/10/10 13:08:42 WARN BlockManager: Putting block rdd_932_3 failed due to exception org.apache.spark.TaskKilledException.
24/10/10 13:08:42 WARN BlockManager: Block rdd_932_3 could not be removed as it was not found on disk or in memory
24/10/10 13:08:42 WARN TaskSetManager: Lost task 3.0 in stage 909.0 (TID 3240) (8da198fe9dfc executor driver): TaskKilled (Stage cancelled: Job 176 cancelled )
24/10/10 13:08:42 WARN BlockManager: Putting block rdd_932_2 failed due to exception org.apache.spark.TaskKilledException.
24/10/10

Py4JJavaError: An error occurred while calling o2837.count.
: org.apache.spark.SparkException: Job 176 cancelled 
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3013)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)


In [None]:
final_prior_ord_train_df.printSchema()

In [None]:
final_prior_ord_train_df.write.csv('/kaggle/working/final_prior_ord_train_df.csv',header=True)

In [None]:
from IPython.display import FileLink

# Show the file link
FileLink("/kaggle/working/final_prior_ord_train_df.csv")