In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

# Create Spark session (Databricks does this automatically)
spark = SparkSession.builder.getOrCreate()

# Every input and output of this cell lives under the same mount, so the
# root is spelled once instead of being repeated in each path.
XSALES_ROOT = "dbfs:/mnt/xsales"


def read_xsales_csv(file_name):
    """Read a CSV file under XSALES_ROOT.

    The first line is treated as the header and column types are inferred
    by Spark.

    Args:
        file_name: File name relative to XSALES_ROOT.

    Returns:
        The loaded DataFrame.
    """
    return spark.read.csv(f"{XSALES_ROOT}/{file_name}", header=True, inferSchema=True)


def write_xsales_csv(df, dir_name):
    """Write df as CSV (with a header row) under XSALES_ROOT.

    Anything already present at the target location is overwritten.

    Args:
        df: DataFrame to export.
        dir_name: Output name relative to XSALES_ROOT.
    """
    df.write.csv(f"{XSALES_ROOT}/{dir_name}", mode="overwrite", header=True)


# Load datasets. The raw frames keep their own names so the unfiltered data
# stays reachable after cleaning.
sales_raw_df = read_xsales_csv("sales_data.csv")
segment_raw_df = read_xsales_csv("global_customer_segments_simple.csv")
profile_raw_df = read_xsales_csv("huge_customer_profile_with_geo.csv")

# Clean nulls: dropna() with its defaults removes every row that has a null
# in any column.
sales_df = sales_raw_df.dropna()
segment_df = segment_raw_df.dropna()
profile_df = profile_raw_df.dropna()

# Save cleaned versions
write_xsales_csv(sales_df, "cleaned_sales_data")
write_xsales_csv(segment_df, "cleaned_customer_segments")
write_xsales_csv(profile_df, "cleaned_customer_profiles")

In [0]:

# Persist each cleaned DataFrame in Delta format, replacing whatever a
# previous run left at the same location.
sales_df.write.save("/mnt/cleaned_data/sales_df", format="delta", mode="overwrite")
segment_df.write.save("/mnt/cleaned_data/segment_df", format="delta", mode="overwrite")
profile_df.write.save("/mnt/cleaned_data/profile_df", format="delta", mode="overwrite")

In [0]:
# Render the three cleaned DataFrames one after another, in the same order
# they were loaded: sales, segments, profiles.
for cleaned_frame in (sales_df, segment_df, profile_df):
    display(cleaned_frame)

Product_ID,Sale_Date,Sales_Rep,Region,Sales_Amount,Quantity_Sold,Product_Category,Unit_Cost,Unit_Price,Customer_Type,Discount,Payment_Method,Sales_Channel,Region_and_Sales_Rep
1052,2023-02-03,Bob,North,5053.97,18,Furniture,152.75,267.22,Returning,0.09,Cash,Online,North-Bob
1093,2023-04-21,Bob,West,4384.02,17,Furniture,3816.39,4209.44,Returning,0.11,Cash,Retail,West-Bob
1015,2023-09-21,David,South,4631.23,30,Food,261.56,371.4,Returning,0.2,Bank Transfer,Retail,South-David
1072,2023-08-24,Bob,South,2167.94,39,Clothing,4330.03,4467.75,New,0.02,Credit Card,Retail,South-Bob
1061,2023-03-24,Charlie,East,3750.2,13,Electronics,637.37,692.71,New,0.08,Credit Card,Online,East-Charlie
1021,2023-02-11,Charlie,West,3761.15,32,Food,900.79,1106.51,New,0.21,Cash,Online,West-Charlie
1083,2023-04-11,Bob,West,618.31,29,Furniture,2408.81,2624.09,Returning,0.14,Cash,Online,West-Bob
1087,2023-01-06,Eve,South,7698.92,46,Furniture,3702.51,3964.65,New,0.12,Bank Transfer,Online,South-Eve
1075,2023-06-29,David,South,4223.39,30,Furniture,738.06,1095.4499999999998,New,0.05,Bank Transfer,Online,South-David
1075,2023-10-09,Charlie,West,8239.58,18,Clothing,2228.35,2682.34,New,0.13,Bank Transfer,Online,West-Charlie


Customer_ID,Global_Segment,Is_Loyalty_Member
C000,Subscription Members,Yes
C001,Occasional Shoppers,No
C002,Mobile Shoppers,Yes
C003,Subscription Members,No
C004,Frequent Buyers,No
C005,Eco-conscious,Yes
C006,Mobile Shoppers,No
C007,Mobile Shoppers,Yes
C008,Subscription Members,No
C009,Quality Focused,Yes


Customer_ID,First_Purchase_Date,Last_Purchase_Date,Tenure_Months,Total_Orders,Avg_Order_Value,Total_Spend,Reward_Points,Purchase_Frequency,Avg_Discount_Used,Churn_Probability,Category_Loyalty,Preferred_Channel,City,State,Latitude,Longitude
C000,2023-09-27T02:56:56.788485Z,2024-02-26T02:56:56.788485Z,5,41,1794.66,73581.06,3679,8.2,0.22,0.88,Furniture,Retail,Mumbai,Maharashtra,19.076,72.8777
C001,2022-06-20T02:56:56.788485Z,2023-03-22T02:56:56.788485Z,9,83,1594.87,132374.21,6618,9.22,0.08,0.32,Clothing,Retail,Delhi,Delhi,28.6139,77.209
C002,2020-12-16T02:56:56.788485Z,2021-08-24T02:56:56.788485Z,8,82,1523.45,124922.9,6246,10.25,0.06,0.17,Food,Retail,Bengaluru,Karnataka,12.9716,77.5946
C003,2023-07-04T02:56:56.788485Z,2023-09-22T02:56:56.788485Z,3,16,1635.21,26163.36,1308,5.33,0.04,0.74,Electronics,Online,Hyderabad,Telangana,17.385,78.4867
C004,2021-04-22T02:56:56.788485Z,2021-07-31T02:56:56.788485Z,3,4,684.95,2739.8,136,1.33,0.16,0.38,Food,Retail,Chennai,Tamil Nadu,13.0827,80.2707
C005,2023-02-23T02:56:56.788485Z,2023-05-11T02:56:56.788485Z,3,12,1797.4,21568.8,1078,4.0,0.12,0.47,Food,Online,Ahmedabad,Gujarat,23.0225,72.5714
C006,2020-08-08T02:56:56.788485Z,2020-09-25T02:56:56.788485Z,2,29,447.15,12967.35,648,14.5,0.06,0.63,Food,Online,Pune,Maharashtra,18.5204,73.8567
C007,2023-02-03T02:56:56.788485Z,2023-11-21T02:56:56.788485Z,10,7,805.42,5637.94,281,0.7,0.17,0.9,Furniture,Retail,Kolkata,West Bengal,22.5726,88.3639
C008,2020-10-05T02:56:56.788485Z,2021-01-26T02:56:56.788485Z,4,14,1256.15,17586.1,879,3.5,0.05,0.42,Clothing,Retail,Jaipur,Rajasthan,26.9124,75.7873
C009,2023-01-03T02:56:56.788485Z,2023-07-06T02:56:56.788485Z,6,12,385.23,4622.76,231,2.0,0.14,0.78,Furniture,Retail,Lucknow,Uttar Pradesh,26.8467,80.9462
