In [23]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

Generating PySpark Session

In [11]:
from pyspark.sql import SparkSession

In [12]:
spark = SparkSession.builder.master("local[*]").appName("").getOrCreate()

Import Libraries

In [42]:
from pyspark.sql.functions import col, when, count, lag, avg, stddev
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql.window import Window

Load Data

In [43]:
df = spark.read.csv("./BMW sales data (2010-2024).csv", header=True, inferSchema=True)

In [16]:
df.printSchema()

root
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Engine_Size_L: double (nullable = true)
 |-- Mileage_KM: integer (nullable = true)
 |-- Price_USD: integer (nullable = true)
 |-- Sales_Volume: integer (nullable = true)
 |-- Sales_Classification: string (nullable = true)



EDA

In [29]:
df.limit(5).toPandas()
df.describe().toPandas()

Unnamed: 0,summary,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,count,50000,50000.0,50000,50000,50000,50000,49999.0,50000.0,50000.0,50000.0,50000
1,mean,,2017.0157,,,,,3.247174943498869,100307.20314,75034.6009,5067.51468,
2,stddev,,4.324459218093149,,,,,1.0090878552174578,57941.509343524616,25998.248881722797,2856.767125229608,
3,min,3 Series,2010.0,Africa,Black,Diesel,Automatic,1.5,3.0,30000.0,100.0,High
4,max,i8,2024.0,South America,White,Petrol,Manual,5.0,199996.0,119998.0,9999.0,Low


In [30]:
#null count
nulls = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas()
nulls

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,0,0,0,0,0,0,1,0,0,0,0


In [None]:
df.show()

+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|   Model|Year|       Region| Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|
+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|5 Series|2016|         Asia|   Red|   Petrol|      Manual|          3.5|    151748|    98740|        8300|                High|
|      i8|2013|North America|   Red|   Hybrid|   Automatic|          1.6|    121671|    79219|        3428|                 Low|
|5 Series|2022|North America|  Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|
|      X3|2024|  Middle East|  Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|
|7 Series|2020|South America| Black|   Diesel|      Manual|          2.1|    122131|    49898|   

In [31]:
df.show(5)

+--------+----+-------------+-----+---------+------------+-------------+----------+---------+------------+--------------------+
|   Model|Year|       Region|Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|
+--------+----+-------------+-----+---------+------------+-------------+----------+---------+------------+--------------------+
|5 Series|2016|         Asia|  Red|   Petrol|      Manual|         NULL|    151748|    98740|        8300|                High|
|      i8|2013|North America|  Red|   Hybrid|   Automatic|          1.6|    121671|    79219|        3428|                 Low|
|5 Series|2022|North America| Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|
|      X3|2024|  Middle East| Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|
|7 Series|2020|South America|Black|   Diesel|      Manual|          2.1|    122131|    49898|        308

In [32]:
df.select("Model", "Year").show()

+--------+----+
|   Model|Year|
+--------+----+
|5 Series|2016|
|      i8|2013|
|5 Series|2022|
|      X3|2024|
|7 Series|2020|
|5 Series|2017|
|      i8|2022|
|      M5|2014|
|      X3|2016|
|      i8|2019|
|3 Series|2012|
|      i8|2016|
|7 Series|2020|
|5 Series|2020|
|      X1|2017|
|      M3|2014|
|      X5|2013|
|      M5|2017|
|5 Series|2017|
|      X5|2012|
+--------+----+
only showing top 20 rows


In [35]:
df.filter(df.Year > 2020).show()

+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|   Model|Year|       Region| Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|
+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+
|5 Series|2022|North America|  Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|
|      X3|2024|  Middle East|  Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|
|      i8|2022|       Europe| White|   Diesel|      Manual|          1.8|    196741|    55064|        7949|                High|
|      X5|2021|South America|   Red|   Diesel|      Manual|          2.2|    184981|    47527|        6273|                 Low|
|      X3|2023|  Middle East| White| Electric|   Automatic|          4.1|    194398|    85370|   

In [36]:
df2 = df.withColumn("Vehicle_age", 2024 - col("Year"))

In [40]:
df2.show()

+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+-----------+
|   Model|Year|       Region| Color|Fuel_Type|Transmission|Engine_Size_L|Mileage_KM|Price_USD|Sales_Volume|Sales_Classification|Vehicle_age|
+--------+----+-------------+------+---------+------------+-------------+----------+---------+------------+--------------------+-----------+
|5 Series|2016|         Asia|   Red|   Petrol|      Manual|         NULL|    151748|    98740|        8300|                High|          8|
|      i8|2013|North America|   Red|   Hybrid|   Automatic|          1.6|    121671|    79219|        3428|                 Low|         11|
|5 Series|2022|North America|  Blue|   Petrol|   Automatic|          4.5|     10991|   113265|        6994|                 Low|          2|
|      X3|2024|  Middle East|  Blue|   Petrol|   Automatic|          1.7|     27255|    60971|        4047|                 Low|          0|
|7 Series|202

In [47]:
df.groupBy("Region").agg(F.sum("Sales_Volume")).show()

+-------------+-----------------+
|       Region|sum(Sales_Volume)|
+-------------+-----------------+
|       Europe|         42555138|
|       Africa|         41565252|
|North America|         42402629|
|South America|         41551818|
|  Middle East|         42326620|
|         Asia|         42974277|
+-------------+-----------------+

