In [1]:
# findspark.init() runs before the pyspark imports in the next cell; keep it
# first. NOTE(review): presumably it makes the local Spark installation
# importable (adds it to sys.path) — confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, round, max, avg, to_timestamp, dayofweek, date_format, expr
from pyspark.sql.types import FloatType
import requests
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from haversine import haversine, Unit

In [3]:
# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("projectFirst")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
23/09/05 16:26:47 WARN Utils: Your hostname, SUSHAN resolves to a loopback address: 127.0.1.1; using 172.31.76.59 instead (on interface eth0)
23/09/05 16:26:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/05 16:26:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Both raw files have a header row; column types are inferred by Spark.
csv_options = {"header": True, "inferSchema": True}

stationInfo_df = spark.read.csv("data/Fuel_Station_Information.csv", **csv_options)
hourlyPrices_df = spark.read.csv("data/Hourly_Gasoline_Prices.csv", **csv_options)

                                                                                

In [8]:
# printSchema() writes the schema to stdout itself and returns None, so
# wrapping the calls in print() only appended a stray "None None" line.
hourlyPrices_df.printSchema()
stationInfo_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- isSelf: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- Date: timestamp (nullable = true)

root
 |-- Id: string (nullable = true)
 |-- Fuel_station_manager: string (nullable = true)
 |-- Petrol_company: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Station_name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitudine: string (nullable = true)

None None


In [5]:
# Discard rows that are exact copies of another row (all columns compared).
hourlyPrices_df, stationInfo_df = (
    hourlyPrices_df.dropDuplicates(),
    stationInfo_df.dropDuplicates(),
)

In [6]:
# Keep only fully populated rows: a null in any column removes the row.
hourlyPrices_df = hourlyPrices_df.dropna(how="any")
stationInfo_df = stationInfo_df.dropna(how="any")

In [9]:
# Columns removed from the station table before it is persisted.
unused_station_columns = ["Fuel_station_manager"]
stationInfo_df = stationInfo_df.drop(*unused_station_columns)

In [10]:
# Make sure "Date" is a timestamp parsed as "yyyy-MM-dd HH:mm:ss".
# NOTE(review): the printed schema already shows Date as timestamp (from
# inferSchema), so this looks like a safeguard rather than a real conversion.
hourlyPrices_df = hourlyPrices_df.withColumn(
    "Date",
    F.to_timestamp(F.col("Date"), "yyyy-MM-dd HH:mm:ss"),
)

In [11]:
# Persist the cleaned tables as Parquet.
# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write raises once the path exists, so the notebook could not
# be re-executed from a fresh kernel.
hourlyPrices_df.write.mode("overwrite").parquet("data/cleaned_fuel_prices.parquet")
stationInfo_df.write.mode("overwrite").parquet("data/cleaned_station_info.parquet")

23/09/05 16:28:00 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [12]:
# Reload the cleaned snapshots written by the previous cell.
fuel_prices_df, station_info_df = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "data/cleaned_fuel_prices.parquet",
        "data/cleaned_station_info.parquet",
    )
)

In [13]:
def convert_to_euro(usd_price):
    """Convert a USD amount to euros using a freshly fetched exchange rate.

    Every call performs one HTTP GET and multiplies ``usd_price`` by the
    ``"eur"`` entry of the JSON response. Because the result is simply
    ``usd_price * rate``, the argument can be a plain number or any object
    that supports multiplication by the rate (this notebook passes a Spark
    ``Column``, so the request is made once on the driver, not once per row).

    Args:
        usd_price: Amount in USD, or an object supporting ``* rate``.

    Returns:
        ``usd_price`` multiplied by the fetched USD -> EUR rate.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError: If the response body has no ``"eur"`` entry.
    """
    # API URL to fetch the exchange rate
    api_url = "https://cdn.jsdelivr.net/gh/fawazahmed0/currency-api@1/latest/currencies/usd/eur.json"

    # requests has no default timeout; without one a stalled connection would
    # hang the notebook cell indefinitely. 10 seconds is generous for one
    # small JSON document.
    response = requests.get(api_url, timeout=10)

    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    exchange_rate = response.json()["eur"]
    return usd_price * exchange_rate


In [13]:
# convert_to_euro is an ordinary Python function here, not a registered Spark
# UDF: called with a Column it returns the Column expression Price * rate.
price_in_euro = convert_to_euro(col("Price"))

# Store the converted price rounded to three decimals; the unrounded value is
# never kept as a column.
fuel_prices_df = fuel_prices_df.withColumn(
    "Price_Euro_Rounded",
    F.round(price_in_euro, 3),
)

# Show the updated DataFrame
fuel_prices_df.show()

+-----+------+-----+-------------------+------------------+
|   Id|isSelf|Price|               Date|Price_Euro_Rounded|
+-----+------+-----+-------------------+------------------+
| 5079|     1|1.809|2022-01-04 06:39:09|             1.678|
|38752|     1|1.804|2022-01-04 07:18:19|             1.674|
|51635|     0|1.974|2022-01-04 07:39:23|             1.831|
| 6810|     0|2.009|2022-01-04 07:50:04|             1.864|
| 4983|     0|1.758|2022-01-04 07:55:36|             1.631|
|51790|     1|1.819|2022-01-04 08:50:47|             1.687|
|42708|     0|2.064|2022-01-04 08:51:50|             1.915|
|48545|     1|1.799|2022-01-04 09:26:35|             1.669|
|46455|     0|1.954|2022-01-04 12:14:28|             1.813|
|47555|     0|2.048|2022-01-04 12:58:37|               1.9|
| 6711|     1|1.819|2022-01-04 13:15:06|             1.687|
|31878|     1|1.784|2022-01-04 13:15:25|             1.655|
|52822|     1|1.769|2022-01-04 13:19:01|             1.641|
|51870|     0|2.054|2022-01-04 13:40:13|

In [14]:
# Highest and mean price computed in a single aggregation pass
# (the original ran two separate Spark jobs for them).
price_stats = fuel_prices_df.agg(
    F.max("Price").alias("max_price"),
    F.avg("Price").alias("average_price"),
).first()
max_price = price_stats["max_price"]
average_price = price_stats["average_price"]

# Id of a record carrying the highest price. If several records tie, this is
# whichever one Spark returns first. An empty table yields None instead of
# crashing on first()[0].
top_price_row = fuel_prices_df.filter(col("Price") == max_price).select("Id").first()
highest_price_id = top_price_row[0] if top_price_row is not None else None

# Order the data by price in descending order
sorted_prices_df = fuel_prices_df.orderBy(col("Price").desc())

# Display the results. The second value is the overall average price, so it
# is labelled as such (it was previously printed as "Maximum Average Price").
print("ID with Highest Price:", highest_price_id)
print("Average Price:", average_price)

ID with Highest Price: 54771
Maximum Average Price: 1.835294686861716


In [15]:
# Tag every price record with the abbreviated weekday name of its timestamp.
fuel_prices_df = fuel_prices_df.withColumn(
    "Day_of_Week",
    F.date_format(F.col("Date"), "E"),
)

# Number of records per weekday.
day_sales_df = (
    fuel_prices_df
    .groupBy("Day_of_Week")
    .agg(F.count("*").alias("Sales_Count"))
)

# Rank the weekdays from the highest count to the lowest. The window has no
# partitioning, which is what triggers Spark's single-partition warning.
window_spec = Window.orderBy(F.col("Sales_Count").desc())
day_sales_ranked_df = day_sales_df.withColumn(
    "Rank",
    F.rank().over(window_spec),
)

day_sales_ranked_df.show()

23/09/05 16:28:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/09/05 16:28:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/09/05 16:28:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/09/05 16:28:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/09/05 16:28:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/09/05 16:28:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+-----------+----+
|Day_of_Week|Sales_Count|Rank|
+-----------+-----------+----+
|        Mon|     410910|   1|
|        Thu|     385288|   2|
|        Wed|     375737|   3|
|        Tue|     363014|   4|
|        Fri|     362752|   5|
|        Sat|     308091|   6|
|        Sun|     238545|   7|
+-----------+-----------+----+



                                                                                

In [16]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two coordinates.

    Args:
        lat1: Latitude of the first point.
        lon1: Longitude of the first point.
        lat2: Latitude of the second point.
        lon2: Longitude of the second point.

    Returns:
        Whatever ``haversine`` returns for the two ``(lat, lon)`` pairs with
        ``unit=Unit.KILOMETERS``.
    """
    origin = (lat1, lon1)
    destination = (lat2, lon2)
    return haversine(origin, destination, unit=Unit.KILOMETERS)

In [16]:
# Extract the day of the week from the 'Date' column and create a new column 'Day_of_Week'
fuel_prices_df = fuel_prices_df.withColumn("Day_of_Week", date_format(col("Date"), "E"))

# Pivot the table to get counts for each day of the week
pivot_df = fuel_prices_df.groupBy().pivot("Day_of_Week").agg(F.count("*"))

# Display the result
pivot_df.show()

                                                                                

+------+------+------+------+------+------+------+
|   Fri|   Mon|   Sat|   Sun|   Thu|   Tue|   Wed|
+------+------+------+------+------+------+------+
|362752|410910|308091|238545|385288|363014|375737|
+------+------+------+------+------+------+------+

