# Data Analysis on Brazilian E-Commerce Public Dataset by Olist

Dataset link: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce?select=olist_customers_dataset.csv

In [109]:
from pyspark.sql import SparkSession

# Configure a session builder with the application name "project", then
# fetch the already-running Spark session or start a new one.
session_builder = SparkSession.builder.appName("project")
spark = session_builder.getOrCreate()

# Loading the dataset

In [110]:
# Locations of the Olist CSV files, relative to the notebook's working directory
customer_data_path = "./Data/olist_customers_dataset.csv"
order_item_path = "./Data/olist_order_items_dataset.csv"
order_payment_path = "./Data/olist_order_payments_dataset.csv"
product_category_translation_path = "./Data/product_category_name_translation.csv"
product_path = './Data/olist_products_dataset.csv'
seller_path = './Data/olist_sellers_dataset.csv'
geolocation_path = './Data/olist_geolocation_dataset.csv'
orders_path = './Data/olist_orders_dataset.csv'

# Reader options shared by every file: the first row holds the column names
# and Spark infers each column's type from the data.
# NOTE(review): with inferSchema, identifier-like columns (e.g. zip-code
# prefixes) may be read as numbers -- confirm that is intended.
csv_read_options = {"header": True, "inferSchema": True}

# Load each Olist CSV file into its own Spark DataFrame.
# Frames that are cleaned in later cells carry the "_uncleaned" suffix.
customer_df = spark.read.csv(customer_data_path, **csv_read_options)
order_item_df = spark.read.csv(order_item_path, **csv_read_options)
order_payment_df = spark.read.csv(order_payment_path, **csv_read_options)
product_category_translation_df = spark.read.csv(
    product_category_translation_path, **csv_read_options
)
seller_df_uncleaned = spark.read.csv(seller_path, **csv_read_options)
product_df_uncleaned = spark.read.csv(product_path, **csv_read_options)
geolocation_df_uncleaned = spark.read.csv(geolocation_path, **csv_read_options)
orders_df_uncleaned = spark.read.csv(orders_path, **csv_read_options)

                                                                                

# Data Cleaning and pre-processing

In [111]:
from pyspark.sql.functions import col, trim,regexp_replace

### Removing whitespace  

In [112]:
# Normalise whitespace in every seller column:
#   1. collapse each run of whitespace characters into a single space, then
#   2. strip the leading and trailing spaces that remain.
# select() returns a new DataFrame instead of modifying the existing one, so
# both steps are applied in a single expression whose result is bound to
# seller_df.
# NOTE: regexp_replace/trim produce string columns, so every column of
# seller_df is a string, whatever type it had in seller_df_uncleaned.
seller_df = seller_df_uncleaned.select(
    [
        trim(regexp_replace(col(c), r'\s+', ' ')).alias(c)
        for c in seller_df_uncleaned.columns
    ]
)


In [113]:
# Strip leading and trailing spaces from the text columns of the geolocation
# data. select() returns a new DataFrame, so the result is bound back to the
# same name; re-running this cell is harmless because trimming an already
# trimmed value changes nothing.
# Only string-typed columns are trimmed: applying trim() to a numeric column
# would turn it into a string column.
geolocation_string_columns = {
    name for name, dtype in geolocation_df_uncleaned.dtypes if dtype == "string"
}
geolocation_df_uncleaned = geolocation_df_uncleaned.select(
    [
        trim(col(c)).alias(c) if c in geolocation_string_columns else col(c)
        for c in geolocation_df_uncleaned.columns
    ]
)

geolocation_df_uncleaned.show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1037| -23.54562128115268| -46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535| -46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469| -46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681| -46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493| -46.64160722329613|       sao paulo|               SP|
|                       1012|-23.547762303364266| -46.63536053788448|       são paulo|               SP|
|                       1047|-23.546273112412678| -46.6

### Working with inconsistent data

In [114]:
# Unify the spelling of the city: replace the accented "são paulo" with
# "sao paulo". No subset is passed to replace(), so the substitution is
# applied to every string column of the geolocation DataFrame.
geolocation_df = geolocation_df_uncleaned.replace("são paulo", "sao paulo")

# Show the DataFrame with the replaced values
geolocation_df.show()

+---------------------------+-------------------+-------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|    geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+-------------------+----------------+-----------------+
|                       1037| -23.54562128115268| -46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535| -46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469| -46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681| -46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493| -46.64160722329613|       sao paulo|               SP|
|                       1012|-23.547762303364266| -46.63536053788448|       sao paulo|               SP|
|                       1047|-23.546273112412678| -46.6

### Drop null values

In [115]:
# Row count before cleaning
raw_order_count = orders_df_uncleaned.count()
print("No of rows in uncleaned dataset = ", raw_order_count)

# Keep only the orders that have no null in any column
# (how="any" is dropna's default, spelled out here for clarity)
orders_df = orders_df_uncleaned.dropna(how="any")

# Row count after cleaning
clean_order_count = orders_df.count()
print("No of rows of cleaned datset = ", clean_order_count)

No of rows in uncleaned dataset =  99441
No of rows of cleaned datset =  96461


### Replacing the product category names with their English translations from the product category translation dataset

In [116]:
# Attach the English category name to every product by left-joining the
# products with the category translation table on "product_category_name".
# A left join keeps every product; a product whose category has no entry in
# the translation table gets a null English name.
# The key is written in the same lower-case form used by drop() below, so the
# join does not depend on Spark's case-insensitive column resolution.
product_joined_df = product_df_uncleaned.join(
    product_category_translation_df, "product_category_name", "left"
)

# Remove the original category column, then give the English column its name,
# so product_df ends up with a single "product_category_name" column holding
# the translated value.
product_df = (
    product_joined_df
    .drop("product_category_name")
    .withColumnRenamed("product_category_name_english", "product_category_name")
)

# Show the 'product_df' DataFrame with the dropped and renamed columns.
product_df.show()

+--------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_category_name|
+--------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|                 40|                       287|                 1|             225|               16|               10|              14|            perfumery|
|3aa071139cb16b67c...|                 44|                       276|                 1|            1000|               30|               18|              20|                  art|
|96bd76ec8810374ed...|                 46|                       250|                 1|       

## Applying Transformations to the DataFrames


### List of DataFrames:
- `customer_df`
- `order_item_df`
- `order_payment_df`
- `product_category_translation_df`
- `seller_df`
- `product_df`
- `geolocation_df`
- `orders_df`

In [117]:
from pyspark.sql.functions import col, sum, avg
from pyspark.sql.window import Window

In [119]:
# Per payment type: total amount paid and mean number of installments,
# listed in payment-type order.
# `sum` and `avg` here are the pyspark.sql.functions aggregates imported in
# this notebook, not Python's builtin sum.
payments_by_type = order_payment_df.groupBy("Payment_type")
payment_analysis = payments_by_type.agg(
    sum("Payment_value").alias("Total Payment Value"),
    avg("Payment_installments").alias("Avg Installments"),
).orderBy("Payment_type")

payment_analysis.show()

+------------+--------------------+-----------------+
|Payment_type| Total Payment Value| Avg Installments|
+------------+--------------------+-----------------+
|      boleto|  2869361.2700000196|              1.0|
| credit_card|1.2542084189999647E7|3.507155413763917|
|  debit_card|  217989.79000000015|              1.0|
| not_defined|                 0.0|              1.0|
|     voucher|   379436.8700000001|              1.0|
+------------+--------------------+-----------------+

