# Spark Project


<center>
Gas Consumption in France
2023-2024
</center>

In [48]:
import pyspark
import yaml
import glob
import pyspark.sql.functions as F
from pyspark.sql.functions import year,month,weekofyear
from pyspark.ml.feature import StringIndexer,OneHotEncoder, Imputer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier


Read some configurations from the yaml file

In [49]:
# Parse the project settings from config.yaml; later cells read the per-file
# delimiter and schema from this dictionary.
# yaml.safe_load is used so the file can only produce plain Python objects.
with open('config.yaml') as config_file:
    configuration = yaml.safe_load(config_file)


In [50]:
# Inspect one configuration section to check its layout: a list whose first
# item carries the 'delimiter' and whose second item carries the 'schema'.
configuration['service_file_parameters']

[{'delimiter': '|'},
 {'schema': 'id_pdv int,cp int,pop string,latitude double,longitude  double,services string'}]

## 1. Create a Spark Session

In [51]:
# Build the Spark session used by every later cell; getOrCreate() hands back
# the already-running session, if there is one, instead of starting a new one.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("Gas")
    .getOrCreate()
)

In [52]:
# Display the session object to confirm that it was created.
spark

## 2. Data preparation

2.1 Read the data from the csv files

Prepare the schema for the different datasets using the yaml configuration file

In [53]:
def _file_parameters(section):
    """Return the (delimiter, schema) pair stored under `section` in the configuration.

    Each section is indexed as a list: item 0 holds the 'delimiter' key and
    item 1 holds the 'schema' key.
    """
    parameters = configuration[section]
    return parameters[0]['delimiter'], parameters[1]['schema']


# Delimiter and schema of each dataset, taken from the configuration file.
gas_delimiter, gas_schema = _file_parameters('gas_file_parameters')
station_delimiter, station_schema = _file_parameters('station_file_parameters')
service_delimiter, service_schema = _file_parameters('service_file_parameters')

# Paths of the CSV files of each dataset, found by filename pattern under data/
# (the patterns are hard-coded here, they do not come from the configuration file).
gas_files = glob.glob("data/Prix*.csv")
station_files = glob.glob("data/Station*.csv")
service_files = glob.glob("data/Service*.csv")



In [54]:
# Load each dataset from its list of CSV files, applying the delimiter and the
# schema read from the configuration instead of letting Spark infer them.
gas_ddf = spark.read.csv(
    path=gas_files,
    sep=gas_delimiter,
    schema=gas_schema,
)
station_ddf = spark.read.csv(
    path=station_files,
    sep=station_delimiter,
    schema=station_schema,
)
service_ddf = spark.read.csv(
    path=service_files,
    sep=service_delimiter,
    schema=service_schema,
)

2.2 Preprocessing the Gas data <br>
We will be doing the following:<br>
* a) Sort the data by the date column
* b) Split the date into year, month and week of the year
* c) Prepare latitude & longitude for mapping (divide by the right power of 10)
* d) Create a Table associated with gas data

To make it easier we will use a preprocessing pipeline


2.2.a) Sort the data by the date column

In [56]:
# Sort the gas readings chronologically with the DataFrame API.
# The 'gas' temporary view is only registered in step 2.2.d, so querying it
# through spark.sql() at this point fails when the notebook is run top to
# bottom on a fresh kernel.
# The sort is lazy and its result is only displayed, not assigned: gas_ddf
# itself stays unsorted, as it did with the SQL query this replaces.
gas_ddf.orderBy(F.col("date").asc())

DataFrame[id_pdv: int, cp: int, pop: string, latitude: double, longitude: double, date: timestamp, id_carburant: int, nom_carburant: string, prix: double]

2.2.b) Split the date into year, month and week of the year

In [57]:
# Derive the calendar features from the `date` timestamp in one chained expression.
# NOTE(review): weekofyear() uses ISO 8601 week numbering while year() is the
# calendar year, so days around New Year can pair a year with a week number of
# the adjacent ISO year - confirm this is acceptable before grouping on both.
reading_date = F.col("date")
gas_ddf = (
    gas_ddf
    .withColumn("year", year(reading_date))
    .withColumn("month", month(reading_date))
    .withColumn("weekofyear", weekofyear(reading_date))
)
gas_ddf.show(5)

+-------+----+---+---------+---------+-------------------+------------+-------------+------+----+-----+----------+
| id_pdv|  cp|pop| latitude|longitude|               date|id_carburant|nom_carburant|  prix|year|month|weekofyear|
+-------+----+---+---------+---------+-------------------+------------+-------------+------+----+-----+----------+
|1000001|1000|  R|4620114.0| 519791.0|2017-01-02 09:37:03|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|4620114.0| 519791.0|2017-01-03 09:54:58|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|4620114.0| 519791.0|2017-01-06 12:33:57|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|4620114.0| 519791.0|2017-01-09 08:59:53|           1|       Gazole|1258.0|2017|    1|         2|
|1000001|1000|  R|4620114.0| 519791.0|2017-01-10 10:38:39|           1|       Gazole|1258.0|2017|    1|         2|
+-------+----+---+---------+---------+-------------------+------------+---------

2.2.c) Prepare latitude & longitude for mapping (divide by the right power of 10)

In [58]:
# The raw coordinates are scaled up (e.g. 4620114.0); dividing by 100 000 gives
# values such as 46.20114 - presumably decimal degrees, to be confirmed against
# the data source documentation.
# Caution: the columns are overwritten in place, so running this cell a second
# time divides the coordinates again.
coordinate_scale = 100_000
for coordinate in ("latitude", "longitude"):
    gas_ddf = gas_ddf.withColumn(coordinate, F.col(coordinate) / coordinate_scale)
gas_ddf.show(5)

+-------+----+---+--------+---------+-------------------+------------+-------------+------+----+-----+----------+
| id_pdv|  cp|pop|latitude|longitude|               date|id_carburant|nom_carburant|  prix|year|month|weekofyear|
+-------+----+---+--------+---------+-------------------+------------+-------------+------+----+-----+----------+
|1000001|1000|  R|46.20114|  5.19791|2017-01-02 09:37:03|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|46.20114|  5.19791|2017-01-03 09:54:58|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|46.20114|  5.19791|2017-01-06 12:33:57|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|46.20114|  5.19791|2017-01-09 08:59:53|           1|       Gazole|1258.0|2017|    1|         2|
|1000001|1000|  R|46.20114|  5.19791|2017-01-10 10:38:39|           1|       Gazole|1258.0|2017|    1|         2|
+-------+----+---+--------+---------+-------------------+------------+-------------+----

2.2.d) Create a Table associated with gas data

In [59]:
# Expose the prepared DataFrame to Spark SQL under the name 'gas' (replacing
# any view already registered with that name), then preview a few rows
# through SQL to check that the view is queryable.
gas_ddf.createOrReplaceTempView('gas')
gas_preview = spark.sql("SELECT * FROM gas")
gas_preview.show(5)

+-------+----+---+--------+---------+-------------------+------------+-------------+------+----+-----+----------+
| id_pdv|  cp|pop|latitude|longitude|               date|id_carburant|nom_carburant|  prix|year|month|weekofyear|
+-------+----+---+--------+---------+-------------------+------------+-------------+------+----+-----+----------+
|1000001|1000|  R|46.20114|  5.19791|2017-01-02 09:37:03|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|46.20114|  5.19791|2017-01-03 09:54:58|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|46.20114|  5.19791|2017-01-06 12:33:57|           1|       Gazole|1258.0|2017|    1|         1|
|1000001|1000|  R|46.20114|  5.19791|2017-01-09 08:59:53|           1|       Gazole|1258.0|2017|    1|         2|
|1000001|1000|  R|46.20114|  5.19791|2017-01-10 10:38:39|           1|       Gazole|1258.0|2017|    1|         2|
+-------+----+---+--------+---------+-------------------+------------+-------------+----

Which gas types are of interest for the rest of the project?

In [77]:
# Number of rows per fuel name and its share of all rows in the 'gas' view,
# expressed as a percentage rounded to 2 decimals, most frequent first.
# Rows whose nom_carburant is NULL form their own group and are also counted
# in the total used as the denominator.
fuel_share_query = """
    SELECT nom_carburant,
    count(*) as count,
    round(100 * count(*) / (SELECT count(*) FROM gas),2) as ratio
    FROM gas 
    GROUP BY  nom_carburant
    ORDER BY count DESC
    """
fuel_share = spark.sql(fuel_share_query)
fuel_share.show()



+-------------+--------+--------------------+
|nom_carburant|   count|               ratio|
+-------------+--------+--------------------+
|       Gazole|16637600|  0.3404250025361638|
|          E10|10517151| 0.21519336658221244|
|         SP98|10199139|  0.2086864643903981|
|         SP95| 7125074| 0.14578745338993335|
|         GPLc| 2182050|0.044647327546282894|
|          E85| 2173148|0.044465182082238985|
|         NULL|   38864|7.952034727704398E-4|
+-------------+--------+--------------------+



                                                                                