In [1]:
import pandas as pd
import numpy as np
import warnings
# Notebook-wide display setup: silence Python warnings and let pandas render
# every column of a frame instead of eliding the middle ones.
# NOTE(review): the blanket "ignore" filter hides all warning categories —
# consider narrowing it once the notebook is stable.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

Generating PySpark Session

In [2]:
from pyspark.sql import SparkSession

In [5]:
# Create (or reuse, if one already exists) a Spark session running in local
# mode under the application name "Sales Analysis".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Sales Analysis")
    .getOrCreate()
)

Import Libraries

In [6]:
from pyspark.sql.functions import col, when, count, lag, avg, stddev
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window

Load Data

In [7]:
# Load the BMW sales CSV: the first row is treated as the header and column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./BMW sales data (2010-2024).csv")
)

In [8]:
# Print the column names, inferred types and nullability of the loaded frame.
df.printSchema()

root
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Engine_Size_L: double (nullable = true)
 |-- Mileage_KM: integer (nullable = true)
 |-- Price_USD: integer (nullable = true)
 |-- Sales_Volume: integer (nullable = true)
 |-- Sales_Classification: string (nullable = true)



Exploratory Data Analysis (EDA)

In [9]:
# Show a 5-row preview followed by the summary statistics.
# A notebook cell renders only its last bare expression, so the preview used to
# be computed (a full Spark job) and then silently discarded. Passing each frame
# to display() — available as a builtin in Jupyter/IPython kernels — renders both.
display(df.limit(5).toPandas())
display(df.describe().toPandas())

Unnamed: 0,summary,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,count,50000,50000.0,50000,50000,50000,50000,49999.0,50000.0,50000.0,50000.0,50000
1,mean,,2017.0157,,,,,3.247174943498869,100307.20314,75034.6009,5067.51468,
2,stddev,,4.324459218093149,,,,,1.0090878552174578,57941.509343524616,25998.248881722797,2856.767125229608,
3,min,3 Series,2010.0,Africa,Black,Diesel,Automatic,1.5,3.0,30000.0,100.0,High
4,max,i8,2024.0,South America,White,Petrol,Manual,5.0,199996.0,119998.0,9999.0,Low


In [10]:
# Per-column null count: when() yields a non-null value only for rows where the
# column is null, and count() counts those non-null values.
nulls = df.select(
    [
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).toPandas()
nulls

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,0,0,0,0,0,0,1,0,0,0,0


In [11]:
# Print the first 20 rows with long cell values truncated (show()'s defaults,
# spelled out explicitly).
df.show(n=20, truncate=True)

+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|   Model|Year|       Region| Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|
+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|5 Series|2016|         Asia|   Red|   Petrol|      Manual|         NULL|    151748|    98740|        8300|                High|
|      i8|2013|North America|   Red|   Hybrid|   Automatic|          1.6|    121671|    79219|        3428|                 Low|
|5 Series|2022|North America|  Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|
|      X3|2024|  Middle East|  Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|
|7 Series|2020|South America| Black|   Diesel|      Manual|          2.1|    122131|    49898|   

In [12]:
# Print only the first five rows.
df.show(n=5)

+--------+----+-------------+-----+---------+------------+-------------+----------+---------+------------+--------------------+
|   Model|Year|       Region|Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|
+--------+----+-------------+-----+---------+------------+-------------+----------+---------+------------+--------------------+
|5 Series|2016|         Asia|  Red|   Petrol|      Manual|         NULL|    151748|    98740|        8300|                High|
|      i8|2013|North America|  Red|   Hybrid|   Automatic|          1.6|    121671|    79219|        3428|                 Low|
|5 Series|2022|North America| Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|
|      X3|2024|  Middle East| Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|
|7 Series|2020|South America|Black|   Diesel|      Manual|          2.1|    122131|    49898|        308

In [13]:
# Project just the Model and Year columns and print the first 20 rows.
df.select(col("Model"), col("Year")).show()

+--------+----+
|   Model|Year|
+--------+----+
|5 Series|2016|
|      i8|2013|
|5 Series|2022|
|      X3|2024|
|7 Series|2020|
|5 Series|2017|
|      i8|2022|
|      M5|2014|
|      X3|2016|
|      i8|2019|
|3 Series|2012|
|      i8|2016|
|7 Series|2020|
|5 Series|2020|
|      X1|2017|
|      M3|2014|
|      X5|2013|
|      M5|2017|
|5 Series|2017|
|      X5|2012|
+--------+----+
only showing top 20 rows


In [14]:
# Keep rows whose Year is strictly greater than 2020 (i.e. 2021 onwards).
df.where(col("Year") > 2020).show()

+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|   Model|Year|       Region| Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|
+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|5 Series|2022|North America|  Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|
|      X3|2024|  Middle East|  Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|
|      i8|2022|       Europe| White|   Diesel|      Manual|          1.8|    196741|    55064|        7949|                High|
|      X5|2021|South America|   Red|   Diesel|      Manual|          2.2|    184981|    47527|        6273|                 Low|
|      X3|2023|  Middle East| White| Electric|   Automatic|          4.1|    194398|    85370|   

In [15]:
# Add Vehicle_age = 2024 - Year.
# NOTE(review): 2024 looks like the last year covered by the dataset (see the
# file name) — confirm, and consider hoisting it to a named constant.
df2 = df.withColumn("Vehicle_age", F.lit(2024) - F.col("Year"))

In [16]:
# Print the first 20 rows of the frame that carries the new Vehicle_age column.
df2.show(n=20, truncate=True)

+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+-----------+
|   Model|Year|       Region| Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|Vehicle_age|
+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+-----------+
|5 Series|2016|         Asia|   Red|   Petrol|      Manual|         NULL|    151748|    98740|        8300|                High|          8|
|      i8|2013|North America|   Red|   Hybrid|   Automatic|          1.6|    121671|    79219|        3428|                 Low|         11|
|5 Series|2022|North America|  Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|          2|
|      X3|2024|  Middle East|  Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|          0|
|7 Series|202

In [17]:
# Total Sales_Volume per Region, using the GroupedData.sum shortcut; the result
# column is named sum(Sales_Volume).
df.groupBy("Region").sum("Sales_Volume").show()

+-------------+-----------------+
|       Region|sum(Sales_Volume)|
+-------------+-----------------+
|       Europe|         42555138|
|       Africa|         41565252|
|North America|         42402629|
|South America|         41551818|
|  Middle East|         42326620|
|         Asia|         42974277|
+-------------+-----------------+



In [22]:
# Total Sales_Volume per Region, renamed to Total_Sales and returned as a
# pandas frame for rich rendering.
df.groupBy("Region").agg(F.sum(F.col("Sales_Volume")).alias("Total_Sales")).toPandas()

Unnamed: 0,Region,Total_Sales
0,Europe,42555138
1,Africa,41565252
2,North America,42402629
3,South America,41551818
4,Middle East,42326620
5,Asia,42974277


✔️ SUM

In [26]:
# SUM example: total Sales_Volume per Region, written with the dict form of
# agg(); the result column is named sum(Sales_Volume).
df.groupBy("Region").agg({"Sales_Volume": "sum"}).show()

+-------------+-----------------+
|       Region|sum(Sales_Volume)|
+-------------+-----------------+
|       Europe|         42555138|
|       Africa|         41565252|
|North America|         42402629|
|South America|         41551818|
|  Middle East|         42326620|
|         Asia|         42974277|
+-------------+-----------------+



✔️ AVG (mean)

In [34]:
# AVG example: mean Price_USD for each Model.
df.groupBy("Model").avg("Price_USD").show()

+--------+-----------------+
|   Model|   avg(Price_USD)|
+--------+-----------------+
|      i3|74800.26808142052|
|3 Series| 75566.2339499456|
|      X6|74434.60049129075|
|      X1|75262.21903719913|
|7 Series|75570.19674239177|
|      X3|75016.61685568157|
|5 Series|75287.84407665505|
|      M5|74474.93099598035|
|      M3|74841.58871515976|
|      i8|75366.27095093357|
|      X5|74708.11678181413|
+--------+-----------------+



✔️ MAX

In [33]:
# MAX example: highest Mileage_KM recorded for each Fuel_Type.
df.groupBy("Fuel_Type").max("Mileage_KM").show()


+---------+---------------+
|Fuel_Type|max(Mileage_KM)|
+---------+---------------+
|   Diesel|         199995|
|   Hybrid|         199996|
| Electric|         199991|
|   Petrol|         199987|
+---------+---------------+



✔️ MIN

In [36]:
# MIN example: lowest Mileage_KM recorded for each Fuel_Type.
df.groupBy("Fuel_Type").min("Mileage_KM").show()

+---------+---------------+
|Fuel_Type|min(Mileage_KM)|
+---------+---------------+
|   Diesel|             42|
|   Hybrid|             23|
| Electric|             48|
|   Petrol|              3|
+---------+---------------+



✔️ COUNT

In [38]:
# COUNT example: number of rows for each Transmission value.
(
    df.groupBy("Transmission")
    .agg(F.count("*"))
    .show()
)

+------------+--------+
|Transmission|count(1)|
+------------+--------+
|   Automatic|   24846|
|      Manual|   25154|
+------------+--------+



In [45]:
# Number of distinct Model values sold in each Region.
# groupBy is used instead of its alias groupby to match the rest of the notebook.
(
    df.groupBy("Region")
    .agg(F.countDistinct("Model").alias("Total distinct model"))
    .toPandas()
)

Unnamed: 0,Region,Total distinct model
0,Europe,11
1,Africa,11
2,North America,11
3,South America,11
4,Middle East,11
5,Asia,11


Multiple Aggregations at Once

In [51]:
# Several aggregates in one pass per Region: total sales volume, mean price and
# the number of distinct models.
(
    df.groupBy("Region")
    .agg(
        F.sum(F.col("Sales_Volume")).alias("Total_Sales"),
        F.avg(F.col("Price_USD")).alias("Avg_Price"),
        F.countDistinct(F.col("Model")).alias("Unique_Models"),
    )
    .toPandas()
)

Unnamed: 0,Region,Total_Sales,Avg_Price,Unique_Models
0,Europe,42555138,74988.356851,11
1,Africa,41565252,74885.771598,11
2,North America,42402629,75070.054709,11
3,South America,41551818,74973.598837,11
4,Middle East,42326620,74726.788487,11
5,Asia,42974277,75554.925006,11


groupBy on multiple columns

In [61]:
# Total Sales_Volume for every (Region, Fuel_Type) combination.
# The grouping key is spelled exactly as in the schema ("Fuel_Type"). The
# previous "Fuel_type" only resolved because Spark matches column names
# case-insensitively by default; it mislabelled the output column and would
# fail with spark.sql.caseSensitive=true.
df.groupBy("Region", "Fuel_Type").agg(
    F.sum("Sales_Volume").alias("Total_Sales")
).toPandas()

Unnamed: 0,Region,Fuel_type,Total_Sales
0,Europe,Hybrid,10825662
1,South America,Hybrid,10362003
2,North America,Hybrid,10808682
3,South America,Petrol,10304421
4,Asia,Hybrid,11422396
5,Africa,Hybrid,10486034
6,North America,Petrol,10472845
7,Asia,Petrol,10462998
8,Europe,Diesel,10406133
9,Europe,Electric,10590064


PIVOT (very powerful): turn each Fuel_Type value into its own column

In [64]:
# Pivot: one row per Region, one column per distinct Fuel_Type value, each cell
# holding the summed Sales_Volume.
(
    df.groupBy("Region")
    .pivot("Fuel_Type")
    .agg(F.sum(F.col("Sales_Volume")))
    .toPandas()
)

Unnamed: 0,Region,Diesel,Electric,Hybrid,Petrol
0,Europe,10406133,10590064,10825662,10733279
1,Africa,10211670,10329085,10486034,10538463
2,North America,10259904,10861198,10808682,10472845
3,South America,10500121,10385273,10362003,10304421
4,Middle East,10491957,10395195,10627320,10812148
5,Asia,10492033,10596850,11422396,10462998


ORDER Results

Sort results by total sales:

In [72]:
# Regional totals ranked from highest to lowest Total_Sales.
(
    df.groupBy("Region")
    .agg(F.sum("Sales_Volume").alias("Total_Sales"))
    .orderBy(F.col("Total_Sales").desc())
    .toPandas()
)

Unnamed: 0,Region,Total_Sales
0,Asia,42974277
1,Europe,42555138
2,North America,42402629
3,Middle East,42326620
4,Africa,41565252
5,South America,41551818


Filtering Before Aggregation

In [73]:
# Restrict to rows with Year > 2020 first, then total Sales_Volume per Region.
(
    df.where(F.col("Year") > 2020)
    .groupBy("Region")
    .agg(F.sum("Sales_Volume").alias("Sales_After_2020"))
    .show()
)

+-------------+----------------+
|       Region|Sales_After_2020|
+-------------+----------------+
|       Europe|        11754801|
|       Africa|        11257467|
|North America|        11519426|
|South America|        11018071|
|  Middle East|        11594599|
|         Asia|        11457756|
+-------------+----------------+

