# Importing useful libraries


In [1]:
import pandas as pd

# Loading the datasets

The datasets are large, so some preprocessing is needed. We merge both datasets on their date and time attributes and keep only the 'Consommation' and 'Température' columns.

* the `weather` dataset can be downloaded on the internet via [Meteo-France](https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32) 
* the `electricity` dataset can be downloaded on the internet via [RTE](https://odre.opendatasoft.com/explore/dataset/eco2mix-national-cons-def/table/?disjunctive.nature&sort=date_heure&dataChart=eyJxdWVyaWVzIjpbeyJjaGFydHMiOlt7InR5cGUiOiJsaW5lIiwiZnVuYyI6IlNVTSIsInlBeGlzIjoiY29uc29tbWF0aW9uIiwiY29sb3IiOiIjZWE1MjU0Iiwic2NpZW50aWZpY0Rpc3BsYXkiOnRydWV9XSwieEF4aXMiOiJkYXRlX2hldXJlIiwibWF4cG9pbnRzIjoyMDAsInRpbWVzY2FsZSI6Im1pbnV0ZSIsInNvcnQiOiIiLCJjb25maWciOnsiZGF0YXNldCI6ImVjbzJtaXgtbmF0aW9uYWwtY29ucy1kZWYiLCJvcHRpb25zIjp7ImRpc2p1bmN0aXZlLm5hdHVyZSI6dHJ1ZSwic29ydCI6Ii1kYXRlX2hldXJlIn19fV0sInRpbWVzY2FsZSI6IiIsImRpc3BsYXlMZWdlbmQiOnRydWUsImFsaWduTW9udGgiOnRydWV9)

In [2]:
# Both files are semicolon-separated. low_memory=False disables chunked parsing,
# so column dtypes are inferred from the whole file at once.
read_options = dict(sep=";", low_memory=False)
weather = pd.read_csv("data/weather.csv", **read_options)
electricity = pd.read_csv("data/electricity.csv", **read_options)

In [3]:
# Preview the first five rows of the raw weather data
weather.head(5)

Unnamed: 0,ID OMM station,Date,Pression au niveau mer,Variation de pression en 3 heures,Type de tendance barométrique,Direction du vent moyen 10 mn,Vitesse du vent moyen 10 mn,Température,Point de rosée,Humidité,...,Altitude,communes (name),communes (code),EPCI (name),EPCI (code),department (name),department (code),region (name),region (code),mois_de_l_annee
0,81405,2013-04-03T17:00:00+02:00,101460.0,,,40.0,7.2,302.75,293.85,59.0,...,4,Matoury,97307,CA du Centre Littoral,249730045.0,Guyane,973,Guyane,3.0,4
1,7027,2013-04-03T20:00:00+02:00,101180.0,-40.0,7.0,20.0,5.7,277.15,272.75,73.0,...,67,Carpiquet,14137,CU Caen la Mer,200065597.0,Calvados,14,Normandie,28.0,4
2,7181,2013-04-03T20:00:00+02:00,100890.0,20.0,3.0,70.0,7.7,278.25,271.15,60.0,...,336,Thuilley-aux-Groseilles,54523,CC du Pays de Colombey et du Sud Toulois,245400510.0,Meurthe-et-Moselle,54,Grand Est,44.0,4
3,7747,2013-04-03T20:00:00+02:00,100350.0,-100.0,6.0,170.0,1.5,286.75,281.95,73.0,...,42,Perpignan,66136,CU Perpignan Méditerranée Métropole,200027183.0,Pyrénées-Orientales,66,Occitanie,76.0,4
4,7207,2013-04-03T23:00:00+02:00,100950.0,100.0,1.0,30.0,10.3,276.95,271.75,69.0,...,34,Bangor,56009,CC de Belle Ile en Mer,245600465.0,Morbihan,56,Bretagne,53.0,4


In [4]:
# Parse dates
def parse_date(date):
    """
    Split a timestamp string into its calendar date and its "HH:MM" time.

    The text before the first "T" is returned unchanged as the date. From the
    segment that follows that "T", anything after a "+" is discarded and the
    first five characters are kept as the time.

    ex: parse_date("2013-04-03T17:00:00+02:00") = "2013-04-03", "17:00"

    Raises IndexError if the string contains no "T" separator.
    """
    pieces = date.split("T")
    day, clock = pieces[0], pieces[1]
    hour_minute = clock.partition("+")[0][:5]
    return day, hour_minute

# Split each raw timestamp into a calendar date and an "HH:MM" time of day.
# The parsed values go into a new frame instead of being written back into
# `weather`: the raw "Date" column stays intact, so this cell can be re-run
# safely (re-parsing an already-split date would raise IndexError in parse_date).
parsed_dates, parsed_hours = zip(*weather["Date"].apply(parse_date))
# Keep only the temperatures and the date
weather2 = weather[["Température"]].assign(Date=parsed_dates, Heure=parsed_hours)[
    ["Date", "Heure", "Température"]
]
# Keep only the consumption and the date
electricity2 = electricity[["Date", "Heure", "Consommation (MW)"]]
# Merge weather and electricity (default inner join: only the Date/Heure pairs
# present in both datasets are kept)
df = pd.merge(electricity2, weather2, on=["Date", "Heure"])

In [10]:
# Convert temperatures from Kelvin to Celsius
def convert_from_kelvin_to_celcius(temperature):
    """
    Convert a temperature expressed in Kelvin to degrees Celsius.

    Returns the input minus 273.15.
    """
    kelvin_at_zero_celsius = 273.15
    return temperature - kelvin_at_zero_celsius

# Convert the whole column in one vectorised subtraction instead of calling the
# helper once per row through .apply; the resulting values are the same.
# NOTE: this overwrites df["Température"] in place. Re-running this cell without
# first re-running the merge cell would subtract 273.15 a second time.
df["Température"] = convert_from_kelvin_to_celcius(df["Température"])
# Expose the consumption under a shorter column name
df['Consommation'] = df['Consommation (MW)']

In [15]:
# Average temperature and consumption over all rows sharing the same day and hour
df2 = (
    df
    .groupby(by=["Date", "Heure"])
    .mean()
    .reset_index()
)

In [5]:
# Inspect the averaged frame (one row per Date/Heure pair)
df2

Unnamed: 0,Date,Heure,Consommation (MW),Température,Consommation
0,2012-01-01,01:00,56231.0,13.443860,56231.0
1,2012-01-01,04:00,49161.0,13.370175,49161.0
2,2012-01-01,07:00,45580.0,13.743103,45580.0
3,2012-01-01,10:00,47421.0,13.669643,47421.0
4,2012-01-01,13:00,53651.0,15.675439,53651.0
...,...,...,...,...,...
26289,2021-01-01,10:00,62385.0,8.260000,62385.0
26290,2021-01-01,13:00,67987.0,9.055172,67987.0
26291,2021-01-01,16:00,61426.0,9.322414,61426.0
26292,2021-01-01,19:00,69683.0,7.722414,69683.0


In [10]:
# Drop the Consommation (MW) column; its values are already available under
# "Consommation". errors="ignore" keeps this cell re-runnable: a second run
# no longer raises KeyError once the column is gone.
df2 = df2.drop(columns=["Consommation (MW)"], errors="ignore")

We want to train and evaluate the models on the same periods as those used in [this paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9382417):

* the training set ranges from the beginning of 2012 to the end of August 2019;
* the test set is divided into two parts: the first ranges from March 16th to April 15th, 2020, and the second from April 16th to June 7th, 2020.

In [11]:
# Dates are "YYYY-MM-DD" strings, so plain string comparison orders them
# chronologically.
date_col = df2["Date"]

# Training set: from the beginning of 2012 to the end of August 2019.
df_train = df2[date_col < "2019-09-01"]

# The crisis test data is divided into two periods.
# First period: March 16th to April 15th 2020 (upper bound excluded).
in_first_period = (date_col >= "2020-03-16") & (date_col < "2020-04-16")
df_test1 = df2[in_first_period]

# Second period: April 16th to June 7th 2020 (upper bound excluded).
in_second_period = (date_col >= "2020-04-16") & (date_col < "2020-06-08")
df_test2 = df2[in_second_period]

# Write each split to its own CSV file, without the index column
exports = {
    "data/train.csv": df_train,
    "data/test1.csv": df_test1,
    "data/test2.csv": df_test2,
}
for csv_path, split in exports.items():
    split.to_csv(csv_path, index=False)

In [16]:
# Inspect the training split
df_train

Unnamed: 0,Date,Heure,Température,Consommation
0,2012-01-01,01:00,13.443860,56231.0
1,2012-01-01,04:00,13.370175,49161.0
2,2012-01-01,07:00,13.743103,45580.0
3,2012-01-01,10:00,13.669643,47421.0
4,2012-01-01,13:00,15.675439,53651.0
...,...,...,...,...
22379,2019-08-31,11:00,22.783051,44961.0
22380,2019-08-31,14:00,25.558333,46554.0
22381,2019-08-31,17:00,25.354237,43636.0
22382,2019-08-31,20:00,22.828814,43340.0


In [17]:
# Inspect the first test split
df_test1

Unnamed: 0,Date,Heure,Température,Consommation
23960,2020-03-16,01:00,11.970000,51033.0
23961,2020-03-16,04:00,11.249153,46641.0
23962,2020-03-16,07:00,11.118644,55611.0
23963,2020-03-16,10:00,13.215254,60495.0
23964,2020-03-16,13:00,15.458333,61722.0
...,...,...,...,...
24203,2020-04-15,11:00,15.710169,47011.0
24204,2020-04-15,14:00,19.149153,46330.0
24205,2020-04-15,17:00,21.025862,40661.0
24206,2020-04-15,20:00,18.818644,43993.0


In [18]:
# Inspect the second test split
df_test2

Unnamed: 0,Date,Heure,Température,Consommation
24208,2020-04-16,02:00,13.659322,41417.0
24209,2020-04-16,05:00,12.849153,36650.0
24210,2020-04-16,08:00,13.732759,41879.0
24211,2020-04-16,11:00,18.184483,45358.0
24212,2020-04-16,14:00,20.433898,45927.0
...,...,...,...,...
24626,2020-06-07,11:00,16.124138,39907.0
24627,2020-06-07,14:00,17.827586,39615.0
24628,2020-06-07,17:00,18.359322,35689.0
24629,2020-06-07,20:00,18.110526,38968.0
