# Importing useful libraries


In [1]:
import pandas as pd

# Loading the datasets

The datasets are large, so some preprocessing steps are needed. We merge both datasets on their date and time attributes and keep only the 'Consommation' and 'Température' columns.

* the `weather` dataset can be downloaded on the internet via [Meteo-France](https://donneespubliques.meteofrance.fr/?fond=produit&id_produit=90&id_rubrique=32) 
* the `electricity` dataset can be downloaded on the internet via [RTE](https://odre.opendatasoft.com/explore/dataset/eco2mix-national-cons-def/table/?disjunctive.nature&sort=date_heure&dataChart=eyJxdWVyaWVzIjpbeyJjaGFydHMiOlt7InR5cGUiOiJsaW5lIiwiZnVuYyI6IlNVTSIsInlBeGlzIjoiY29uc29tbWF0aW9uIiwiY29sb3IiOiIjZWE1MjU0Iiwic2NpZW50aWZpY0Rpc3BsYXkiOnRydWV9XSwieEF4aXMiOiJkYXRlX2hldXJlIiwibWF4cG9pbnRzIjoyMDAsInRpbWVzY2FsZSI6Im1pbnV0ZSIsInNvcnQiOiIiLCJjb25maWciOnsiZGF0YXNldCI6ImVjbzJtaXgtbmF0aW9uYWwtY29ucy1kZWYiLCJvcHRpb25zIjp7ImRpc2p1bmN0aXZlLm5hdHVyZSI6dHJ1ZSwic29ydCI6Ii1kYXRlX2hldXJlIn19fV0sInRpbWVzY2FsZSI6IiIsImRpc3BsYXlMZWdlbmQiOnRydWUsImFsaWduTW9udGgiOnRydWV9)

In [2]:
# Both exports are semicolon-separated. low_memory=False turns off pandas'
# chunked parsing, so each column's dtype is inferred from the whole file.
weather, electricity = (
    pd.read_csv(csv_path, sep=";", low_memory=False)
    for csv_path in ("data/weather.csv", "data/electricity.csv")
)

In [3]:
# Preview the first five rows of the raw weather table (head() defaults to n=5)
weather.head()

Unnamed: 0,ID OMM station,Date,Pression au niveau mer,Variation de pression en 3 heures,Type de tendance barométrique,Direction du vent moyen 10 mn,Vitesse du vent moyen 10 mn,Température,Point de rosée,Humidité,...,Altitude,communes (name),communes (code),EPCI (name),EPCI (code),department (name),department (code),region (name),region (code),mois_de_l_annee
0,81405,2013-04-03T17:00:00+02:00,101460.0,,,40.0,7.2,302.75,293.85,59.0,...,4,Matoury,97307,CA du Centre Littoral,249730045.0,Guyane,973,Guyane,3.0,4
1,7027,2013-04-03T20:00:00+02:00,101180.0,-40.0,7.0,20.0,5.7,277.15,272.75,73.0,...,67,Carpiquet,14137,CU Caen la Mer,200065597.0,Calvados,14,Normandie,28.0,4
2,7181,2013-04-03T20:00:00+02:00,100890.0,20.0,3.0,70.0,7.7,278.25,271.15,60.0,...,336,Thuilley-aux-Groseilles,54523,CC du Pays de Colombey et du Sud Toulois,245400510.0,Meurthe-et-Moselle,54,Grand Est,44.0,4
3,7747,2013-04-03T20:00:00+02:00,100350.0,-100.0,6.0,170.0,1.5,286.75,281.95,73.0,...,42,Perpignan,66136,CU Perpignan Méditerranée Métropole,200027183.0,Pyrénées-Orientales,66,Occitanie,76.0,4
4,7207,2013-04-03T23:00:00+02:00,100950.0,100.0,1.0,30.0,10.3,276.95,271.75,69.0,...,34,Bangor,56009,CC de Belle Ile en Mer,245600465.0,Morbihan,56,Bretagne,53.0,4


In [4]:
# Timestamp parsing helper
def parse_date(date):
    """
    Split an ISO-like timestamp string into its day and its "HH:MM" time.

    The string is cut at "T"; the first piece is returned unchanged as the
    day. From the second piece, anything following a "+" is discarded and
    the first five characters of what remains are returned as the time.

    ex: parse_date("2013-04-03T17:00:00+02:00") = "2013-04-03", "17:00"
    """
    pieces = date.split("T")
    day = pieces[0]
    clock = pieces[1]
    # partition("+")[0] is the text before the first "+", or the whole
    # string when there is no "+"
    hour_minute = clock.partition("+")[0][:5]
    return day, hour_minute

# Split each raw timestamp into a day string and an "HH:MM" string.
# The parsed values are deliberately NOT written back into `weather`:
# overwriting weather["Date"] would make this cell fail when it is run a
# second time, because parse_date expects the raw "<day>T<time>" format
# and would no longer find the "T" separator.
parsed_days, parsed_hours = zip(*weather["Date"].apply(parse_date))
# Keep only the temperatures and the date
weather2 = weather[["Température"]].assign(Date=parsed_days, Heure=parsed_hours)
weather2 = weather2[["Date", "Heure", "Température"]]
# Keep only the consumption and the date
electricity2 = electricity[["Date", "Heure", "Consommation (MW)"]]

In [5]:
# Order both frames chronologically: by day first, then by time of day
weather3 = weather2.sort_values(["Date", "Heure"])
electricity3 = electricity2.sort_values(["Date", "Heure"])

In [6]:
# Collapse all readings that share the same day and time into their mean,
# giving one row per (Date, Heure) pair.
# NOTE(review): the mean is taken over every station present in the file,
# including overseas ones such as the Guyane row in the preview above;
# confirm this is the intended temperature signal.
weather4 = (
    weather3
    .groupby(["Date", "Heure"])
    .mean()
    .reset_index()
)

In [7]:
# Outer-join consumption and averaged temperature on the (Date, Heure) key;
# timestamps that have no temperature reading get NaN in "Température"
df = electricity3.merge(weather4, how="outer", on=["Date", "Heure"])

# Keep only the rows that actually carry a consumption value
df = df[df["Consommation (MW)"].notna()]

# Kelvin -> Celsius conversion helper
def convert_from_kelvin_to_celcius(temperature):
    """
    Convert a temperature expressed in Kelvin to degrees Celsius.
    ex: convert_from_kelvin_to_celcius(273.15) = 0.0
    """
    kelvin_offset = 273.15
    return temperature - kelvin_offset

# Convert the whole temperature column from Kelvin to Celsius in a single
# vectorised call (the helper is plain arithmetic, so it also accepts a Series)
df["Température"] = convert_from_kelvin_to_celcius(df["Température"])

# Set the date and hour as index
df = df.set_index(pd.to_datetime(df["Date"] + " " + df["Heure"]))

# Interpolate the missing temperatures, weighting by the time elapsed
# between the surrounding readings.
# NOTE(review): gaps located after the last available reading cannot be
# interpolated and are filled with that last value instead (see the constant
# temperatures at the end of the displayed frame); confirm this is acceptable.
df["Température"] = df["Température"].interpolate(method='time')

# Remove the rows with missing values
df = df.dropna()

# Rename the consumption column. pop() followed by assignment moves the
# column to the last position, which keeps the column order of the
# exported file unchanged (Date, Heure, Température, Consommation).
df["Consommation"] = df.pop("Consommation (MW)")

# Add column for the ISO week number.
# DatetimeIndex.week is deprecated and was removed in pandas 2.0;
# isocalendar().week is its replacement. The result is cast back to int64
# and assigned positionally so the column keeps the dtype it had before.
df["Week"] = df.index.isocalendar().week.astype("int64").to_numpy()

# Add column that gives the row number (0, 1, 2, ...)
df["Index"] = range(len(df))

# Export the data (the timestamp index is written as the first column)
df.to_csv("data/interpolated.csv")

  df['Week'] = df.index.week


We want to train the models on the same periods as those used in [this paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9382417):

* the train set ranges from the beginning of 2012 to the end of August 2019;
* the test set is divided into two parts: the first ranges from March 16th to April 15th, 2020, and the second from April 16th to June 7th, 2020.

In [8]:
# Display the preprocessed frame (pandas abbreviates the middle rows in the output)
df

Unnamed: 0,Date,Heure,Température,Consommation,Week,Index
2012-01-01 01:00:00,2012-01-01,01:00,13.443860,56231.0,52,0
2012-01-01 01:30:00,2012-01-01,01:30,13.431579,56075.0,52,1
2012-01-01 02:00:00,2012-01-01,02:00,13.419298,55532.0,52,2
2012-01-01 02:30:00,2012-01-01,02:30,13.407018,54911.0,52,3
2012-01-01 03:00:00,2012-01-01,03:00,13.394737,52496.0,52,4
...,...,...,...,...,...,...
2022-05-31 21:30:00,2022-05-31,21:30,6.887931,44043.0,22,182585
2022-05-31 22:00:00,2022-05-31,22:00,6.887931,44019.0,22,182586
2022-05-31 22:30:00,2022-05-31,22:30,6.887931,45067.0,22,182587
2022-05-31 23:00:00,2022-05-31,23:00,6.887931,46703.0,22,182588


In [9]:
# All bounds below are compared against the "Date" column, which holds
# ISO-formatted day strings.

# Training period: every row dated before 1 September 2019, i.e. from the
# beginning of the data up to the end of August 2019.
df_train = df.loc[df["Date"].lt("2019-09-01")]

# The crisis test data is cut into two consecutive periods.
# First period: 16 March 2020 up to and including 15 April 2020.
df_test1 = df.loc[df["Date"].ge("2020-03-16") & df["Date"].lt("2020-04-16")]

# Second period: 16 April 2020 up to and including 7 June 2020.
df_test2 = df.loc[df["Date"].ge("2020-04-16") & df["Date"].lt("2020-06-08")]

# Write the three subsets to disk; index=False leaves the timestamp index
# out of the files (the Date and Heure columns are still exported).
df_train.to_csv("data/train.csv", index=False)
df_test1.to_csv("data/test1.csv", index=False)
df_test2.to_csv("data/test2.csv", index=False)