# Data Analysis on Brazilian E-Commerce Public Dataset by Olist

Dataset link: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce?select=olist_customers_dataset.csv

In [16]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("project").getOrCreate()

# Loading the dataset

In [17]:
# Defining path to the dataset
customer_data_path = "./Data/olist_customers_dataset.csv"  # Replace with the actual path
order_item_path = "./Data/olist_order_items_dataset.csv"
order_payment_path = "./Data/olist_order_payments_dataset.csv"
product_category_translation_path= "./Data/product_category_name_translation.csv"
product_path = './Data/olist_products_dataset.csv'
seller_path = './Data/olist_sellers_dataset.csv'
geolocation_path = './Data/olist_geolocation_dataset.csv'
orders_path = './Data/olist_orders_dataset.csv'

# Load the Chipotle dataset into a Spark DataFrame
customer_df = spark.read.csv(customer_data_path, header=True, inferSchema=True)
order_item_df = spark.read.csv(order_item_path, header=True, inferSchema=True)
order_payment_df = spark.read.csv(order_payment_path, header=True, inferSchema=True)
product_category_translation_df = spark.read.csv(product_category_translation_path, header=True, inferSchema=True)
seller_df_uncleaned = spark.read.csv(seller_path, header=True, inferSchema=True)
product_df_uncleaned = spark.read.csv(product_path, header=True, inferSchema=True)
geoloacation_df_uncleaned = spark.read.csv(geolocation_path, header=True, inferSchema= True)
orders_df_uncleaned = spark.read.csv(orders_path, header=True, inferSchema= True)

                                                                                

# Data Cleaning and pre-processing

In [18]:
from pyspark.sql.functions import col, trim,regexp_replace, when

### Removing whitespace  

In [19]:
# Remove leading and trailing whitespace from all columns
seller_df_uncleaned.select([trim(col(c)).alias(c) for c in seller_df_uncleaned.columns])

# Remove whitespace characters between words in all columns
seller_df = seller_df_uncleaned.select([regexp_replace(col(c), r'\s+', ' ').alias(c) for c in seller_df_uncleaned.columns])


In [20]:
# Remove leading and trailing whitespace from all columns
geoloacation_df_uncleaned.select([trim(col(c)).alias(c) for c in geoloacation_df_uncleaned.columns])

geoloacation_df_uncleaned.show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1037| -23.54562128115268| -46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535| -46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469| -46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681| -46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493| -46.64160722329613|       sao paulo|               SP|
|                       1012|-23.547762303364266| -46.63536053788448|       são paulo|               SP|
|                       1047|-23.546273112412678| -46.6

### Working with inconsistent data

In [21]:
# Replace "são paulo" with "sao paulo" in the geolocation dataframe
geolocation_df = geoloacation_df_uncleaned.replace("são paulo", "sao paulo")

# Show the DataFrame with the replaced values
geolocation_df.show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1037| -23.54562128115268| -46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535| -46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469| -46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681| -46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493| -46.64160722329613|       sao paulo|               SP|
|                       1012|-23.547762303364266| -46.63536053788448|       sao paulo|               SP|
|                       1047|-23.546273112412678| -46.6

### Drop null values

In [22]:
# Print the number of rows in the 'orders_df_uncleaned' DataFrame
print("No of rows in uncleaned dataset = ", orders_df_uncleaned.count())

# Drop rows with null values in the 'orders_df_uncleaned' DataFrame
orders_df = orders_df_uncleaned.dropna()

# Print the number of rows in the 'orders_df' DataFrame after dropping null values
print("No of rows of cleaned datset = ", orders_df.count())

No of rows in uncleaned dataset =  99441
No of rows of cleaned datset =  96461


### Replacing column on product dataset with content from product category translation dataset

In [23]:
# Perform a left join between the 'product_df_uncleaned' DataFrame and 'product_category_translation_df'
# based on the 'Product_category_name' column. This operation combines the two DataFrames .
product_joined_df= product_df_uncleaned.join(product_category_translation_df, "Product_category_name", "left")

# Drop "product_category_name" will be removed from the DataFrame.
product_df = product_joined_df.drop("product_category_name")

# Rename the "product_category_name_english" column to "product_category_name"
product_df = product_df.withColumnRenamed("product_category_name_english", "product_category_name")

# Show the 'product_df' DataFrame with the dropped and renamed columns.
product_df.show()

+--------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_category_name|
+--------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|                 40|                       287|                 1|             225|               16|               10|              14|            perfumery|
|3aa071139cb16b67c...|                 44|                       276|                 1|            1000|               30|               18|              20|                  art|
|96bd76ec8810374ed...|                 46|                       250|                 1|       

In [24]:
# Set payment_installment to 0 where payment_type is "not_defined"
order_payment_df = order_payment_df.withColumn("Payment_installments",
                                   when(col("Payment_type") == "not_defined", 0)
                                   .otherwise(col("Payment_installments")))


## Applying Transformation on the Dataframes 


### List of Dataframes:
    -customer_df 
    -order_item_df 
    -order_payment_df 
    -product_category_translation_df 
    -seller_df
    -product_df
    -geoloacation_df
    -orders_df

In [34]:
from pyspark.sql.functions import sum, avg, lag, count, desc, udf,round
from pyspark.sql.types import StringType
from pyspark.sql.window import Window

#### #Payment Analysis: For each payment type (payment_type), calculate the total payment value (sum of payment_value) and the average number of payment installments (payment_installments), rename the columns to 'Total Payment Value' and 'Avg Installments,' and order the result by payment type in ascending order

In [26]:
# Perform the analysis
payment_analysis = order_payment_df.groupBy("Payment_type") \
    .agg(
        sum("Payment_value").alias("Total Payment Value"),
        avg("Payment_installments").alias("Avg Installments")
    ) \
    .orderBy("Payment_type")

# Show the result
payment_analysis.show()

+------------+--------------------+-----------------+
|Payment_type| Total Payment Value| Avg Installments|
+------------+--------------------+-----------------+
|      boleto|  2869361.2700000196|              1.0|
| credit_card|1.2542084189999647E7|3.507155413763917|
|  debit_card|  217989.79000000015|              1.0|
| not_defined|                 0.0|              0.0|
|     voucher|   379436.8700000001|              1.0|
+------------+--------------------+-----------------+



#### #Growth analysis: Determine the month-over-month sales growth percentage, using a window function to compare the current month's total sales with the previous month's total sales for each seller (seller_id).

In [27]:
# Join the orders and order_items datasets
combined_df = orders_df.join(order_item_df, "Order_id")

# Calculate monthly sales for each seller
monthly_sales_window = Window.partitionBy("Seller_id").orderBy("YearMonth")
combined_df = combined_df.withColumn("YearMonth", combined_df["Order_purchase_timestamp"].substr(1, 7))
monthly_sales_df = combined_df.groupBy("Seller_id", "YearMonth").agg(sum("Price").alias("monthly_sales"))
monthly_sales_df = monthly_sales_df.withColumn("monthly_sales", monthly_sales_df["monthly_sales"].cast("float"))
monthly_sales_df = monthly_sales_df.withColumn("prev_month_sales", lag("monthly_sales").over(monthly_sales_window))

# Calculate month-over-month sales growth percentage
monthly_sales_df = monthly_sales_df.withColumn("sales_growth_percentage",
                                               ((monthly_sales_df["monthly_sales"] - monthly_sales_df["prev_month_sales"]) /
                                                monthly_sales_df["prev_month_sales"]) * 100)

# Show the result
monthly_sales_df.select("Seller_id", "YearMonth", "monthly_sales", "prev_month_sales", "sales_growth_percentage").show()

+--------------------+---------+-------------+----------------+-----------------------+
|           Seller_id|YearMonth|monthly_sales|prev_month_sales|sales_growth_percentage|
+--------------------+---------+-------------+----------------+-----------------------+
|0015a82c2db000af6...|  2017-09|        895.0|            null|                   null|
|0015a82c2db000af6...|  2017-10|       1790.0|           895.0|                  100.0|
|001cca7ae9ae17fb1...|  2017-02|       1098.9|            null|                   null|
|001cca7ae9ae17fb1...|  2017-03|       1676.7|          1098.9|     52.579844746649954|
|001cca7ae9ae17fb1...|  2017-04|       1708.2|          1676.7|     1.8786903391977854|
|001cca7ae9ae17fb1...|  2017-05|      2639.99|          1708.2|     54.548066133783976|
|001cca7ae9ae17fb1...|  2017-06|      2213.49|         2639.99|     -16.15536428462503|
|001cca7ae9ae17fb1...|  2017-07|      2483.95|         2213.49|       12.2187117236009|
|001cca7ae9ae17fb1...|  2017-08|

#### #Total number of orders placed by customers in each state (customer_state), rename the column to 'Order Count,' and order the result in ascending order of order count

In [28]:
# Join customers and orders datasets to identify unique customers in the orders dataset
unique_customers_df = customer_df.join(orders_df, "Customer_id")

# Group by customer_state and count the number of orders in each state
order_count_by_state = unique_customers_df.groupBy("customer_state").agg(count("Order_id").alias("Order Count"))

# Order the result in ascending order of order count
order_count_by_state = order_count_by_state.orderBy("Order Count")

# Show the result
order_count_by_state.show()




+--------------+-----------+
|customer_state|Order Count|
+--------------+-----------+
|            RR|         41|
|            AP|         67|
|            AC|         80|
|            AM|        145|
|            RO|        243|
|            TO|        274|
|            SE|        335|
|            AL|        397|
|            RN|        474|
|            PI|        476|
|            PB|        517|
|            MS|        701|
|            MA|        716|
|            MT|        886|
|            PA|        946|
|            CE|       1278|
|            PE|       1593|
|            GO|       1957|
|            ES|       1995|
|            DF|       2080|
+--------------+-----------+
only showing top 20 rows



In [29]:
# Join the Orders DataFrame and Order Payments DataFrame based on "Order_id"
order_payment_joined_df = orders_df.join(order_payment_df , "Order_id")

# Calculate order count per customer
order_count = orders_df.groupBy("Customer_id").agg(count("Order_id").alias("Order_Count"))


# Calculate average order value per customer
average_order_value = order_payment_joined_df.groupBy("Customer_id").agg(avg("Payment_value").alias("Average_Order_Value"))


# Create a customer and order analysis DataFrame
customer_behaviour_df = order_count.join(average_order_value, "Customer_id", "inner").orderBy(desc("Average_Order_Value"))

# Show the resulting DataFrame
customer_behaviour_df.show()
print("Total NUmber of Customer = ", customer_behaviour_df.count())


+--------------------+-----------+-------------------+
|         Customer_id|Order_Count|Average_Order_Value|
+--------------------+-----------+-------------------+
|1617b1357756262bf...|          1|           13664.08|
|ec5b2ba62e5743423...|          1|            7274.88|
|c6e2731c5b391845f...|          1|            6929.31|
|f48d464a0baaea338...|          1|            6922.21|
|3fd6777bbce08a352...|          1|            6726.66|
|05455dfa7cd02f13d...|          1|            6081.54|
|df55c14d1476a9a34...|          1|            4950.34|
|24bbf5fd2f2e1b359...|          1|            4764.34|
|3d979689f636322c6...|          1|            4681.78|
|1afc82cd60e303ef0...|          1|            4513.32|
|cc803a2c412833101...|          1|             4445.5|
|35a413c7ca3c69756...|          1|            4175.26|
|e9b0d0eb3015ef1c9...|          1|            4163.51|
|3be2c536886b2ea46...|          1|            4042.74|
|c6695e3b1e48680db...|          1|            4016.91|
|31e83c01f

### #Create a UDF to categorize products into different size categories based on their dimensions (length, width, and height) and weight? Then, calculate the average order value for each product size category.

In [36]:
# Define a UDF to categorize products into size categories based on dimensions and weight
def categorize_product_size(length, width, height, weight):
    if length is not None and width is not None and height is not None and weight is not None:
        if length <= 20 and width <= 20 and height <= 20 and weight <= 500:
            return "Small"
        elif length <= 40 and width <= 40 and height <= 40 and weight <= 2000:
            return "Medium"
        else:
            return "Large"
    else:
        return "Unknown"

# Register the UDF
categorize_udf = udf(categorize_product_size, StringType())

# Apply the UDF to create a new column "Product_Size_Category"
product_df = product_df.withColumn("Product_Size_Category", categorize_udf(
    product_df["Product_length_cm"],
    product_df["Product_width_cm"],
    product_df["Product_height_cm"],
    product_df["Product_weight_g"]
))

# Join the necessary datasets
joined_df = order_item_df.join(product_df, "Product_id")

# Calculate the average order value for each product size category
average_order_value_by_size = joined_df.groupBy("Product_Size_Category").agg(
    round(avg("Price"),2).alias("Average_Order_Value")
)

# Show the resulting DataFrame
average_order_value_by_size.show()

+---------------------+-------------------+
|Product_Size_Category|Average_Order_Value|
+---------------------+-------------------+
|              Unknown|             138.72|
|               Medium|             101.92|
|                Small|              82.06|
|                Large|             175.04|
+---------------------+-------------------+

