Our goal is to align both weather and air quality data to local Milan time, for this we will implement daylight savings time and add 1 hour to original data(which is in GMT+00), therefore we will be in GMT+01 while in CET (DST Off) and GMT+02 while in CEST (DST On)

In [2]:
import pandas as pd

In [3]:
aq_data = pd.read_csv("./AQ_APIdata.csv")

In [4]:
display(aq_data) #Datetime is in GMT+00

Unnamed: 0.1,Unnamed: 0,time,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone
0,0,2019-12-31 00:00:00,85.9,76.6,38.5,4.5,2.0
1,1,2019-12-31 01:00:00,84.5,76.9,35.2,4.9,4.0
2,2,2019-12-31 02:00:00,83.0,76.4,35.1,5.1,5.0
3,3,2019-12-31 03:00:00,80.7,74.9,33.5,4.8,2.0
4,4,2019-12-31 04:00:00,77.6,73.1,29.6,3.9,3.0
...,...,...,...,...,...,...,...
35083,35083,2023-12-31 19:00:00,38.1,31.8,27.2,2.0,2.0
35084,35084,2023-12-31 20:00:00,32.1,29.4,31.0,2.0,1.0
35085,35085,2023-12-31 21:00:00,34.6,30.6,32.2,2.2,1.0
35086,35086,2023-12-31 22:00:00,35.1,27.8,28.4,2.2,1.0


In [5]:
#Verify if DST is applied in data
aq_data['datetime'] = pd.to_datetime(aq_data['time'])

As we can see below, GMT+00 doesn't account for DST, but we want data to represent days in Milan accurately, and to be aligned with people's perception.

In [6]:
#show data from march 29th 2020 to march 30th 2020(start of DST for 2020), datetime is already a pd.dt
aq_data[(aq_data['datetime'] >= '2020-03-29') & (aq_data['datetime'] < '2020-03-30')]

Unnamed: 0.1,Unnamed: 0,time,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,datetime
2136,2136,2020-03-29 00:00:00,65.9,56.4,55.0,3.7,9.0,2020-03-29 00:00:00
2137,2137,2020-03-29 01:00:00,67.5,55.0,52.4,3.4,9.0,2020-03-29 01:00:00
2138,2138,2020-03-29 02:00:00,68.0,52.7,46.7,3.3,4.0,2020-03-29 02:00:00
2139,2139,2020-03-29 03:00:00,56.6,47.6,42.8,2.9,13.0,2020-03-29 03:00:00
2140,2140,2020-03-29 04:00:00,63.6,48.3,36.9,2.8,10.0,2020-03-29 04:00:00
2141,2141,2020-03-29 05:00:00,58.3,49.1,35.0,3.1,10.0,2020-03-29 05:00:00
2142,2142,2020-03-29 06:00:00,54.5,51.2,35.0,3.4,10.0,2020-03-29 06:00:00
2143,2143,2020-03-29 07:00:00,61.3,49.1,32.3,3.3,17.0,2020-03-29 07:00:00
2144,2144,2020-03-29 08:00:00,61.7,50.4,29.7,3.3,24.0,2020-03-29 08:00:00
2145,2145,2020-03-29 09:00:00,62.8,53.2,26.6,3.5,41.0,2020-03-29 09:00:00


In [7]:
#As we can see there is no switch, data stays GMT+00
aq_data = aq_data.drop(columns=['time'])
display(aq_data)

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,datetime
0,0,85.9,76.6,38.5,4.5,2.0,2019-12-31 00:00:00
1,1,84.5,76.9,35.2,4.9,4.0,2019-12-31 01:00:00
2,2,83.0,76.4,35.1,5.1,5.0,2019-12-31 02:00:00
3,3,80.7,74.9,33.5,4.8,2.0,2019-12-31 03:00:00
4,4,77.6,73.1,29.6,3.9,3.0,2019-12-31 04:00:00
...,...,...,...,...,...,...,...
35083,35083,38.1,31.8,27.2,2.0,2.0,2023-12-31 19:00:00
35084,35084,32.1,29.4,31.0,2.0,1.0,2023-12-31 20:00:00
35085,35085,34.6,30.6,32.2,2.2,1.0,2023-12-31 21:00:00
35086,35086,35.1,27.8,28.4,2.2,1.0,2023-12-31 22:00:00


In [8]:
from datetime import datetime, timedelta

def find_last_sundays(start_year, end_year):
    """
    This function finds the points at which DST begins and ends for all years given as input,
    that is to say, the last Sundays of March and October, and pinpoints the exact time at which
    the clock is shifted, in both cases it occurs at 1 am GMT+00.
    """
    days = {}
    for year in range(start_year, end_year + 1):
        # Find the last Sunday in March
        march_last_day = datetime(year, 3, 31)
        # By checking what day the last day of March is, we can determine how many days we need to go back to reach the last Sunday
        # note: in datetime.weekday() Monday is 0 and Sunday is 6
        march_last_sunday = march_last_day - timedelta(days=march_last_day.weekday() + 1 if march_last_day.weekday() < 6 else 0)
        # Set the time to 1 am
        march_last_sunday = march_last_sunday.replace(hour=1, minute=0, second=0)

        # Find the last Sunday in October(same procedure)
        october_last_day = datetime(year, 10, 31)
        october_last_sunday = october_last_day - timedelta(days=october_last_day.weekday() + 1 if october_last_day.weekday() < 6 else 0)
        october_last_sunday = october_last_sunday.replace(hour=1, minute=0, second=0)

        # Store in dictionary
        days[year] = {'start': march_last_sunday, 'stop': october_last_sunday}
    
    return days

# Generate the dictionary for the last Sundays of March and October between 2020 and 2023
days = find_last_sundays(2020, 2023)

# Print the results
for year, timestamps in days.items():
    print(f"{year}: {timestamps}")



2020: {'start': datetime.datetime(2020, 3, 29, 1, 0), 'stop': datetime.datetime(2020, 10, 25, 1, 0)}
2021: {'start': datetime.datetime(2021, 3, 28, 1, 0), 'stop': datetime.datetime(2021, 10, 31, 1, 0)}
2022: {'start': datetime.datetime(2022, 3, 27, 1, 0), 'stop': datetime.datetime(2022, 10, 30, 1, 0)}
2023: {'start': datetime.datetime(2023, 3, 26, 1, 0), 'stop': datetime.datetime(2023, 10, 29, 1, 0)}


As you can ee above, for the first row:
- Switch from CET to CEST happend on March 29th at 1:00 AM GMT+00 for the year 2020
- For the same year, clock went back to CET on October 25th at 1:00 AM GMT+00

In [9]:
#Let's set up a flag to indicate if DST is applied
aq_data["DST"] = False

In [10]:
def adjust_for_daylight_saving(df, datetime_column, days):
    def adjust_time(row):
        #get datetime from row
        dt = row[datetime_column]
        #get year
        year = dt.year
        if year in days:
            start, stop = days[year]['start'], days[year]['stop']
            #if row lies inside the DST period, add 1 hour and set DST flag to True
            if start <= dt <= stop:
                row[datetime_column] = dt + timedelta(hours=1)
                row["DST"] = True
        return row

    # Apply the adjustment function to the dataframe
    df = df.apply(adjust_time, axis=1)
    return df

In [11]:
aq_data = adjust_for_daylight_saving(aq_data, 'datetime', days)
#show march 29th 2020
aq_data[(aq_data['datetime'] >= '2020-03-29') & (aq_data['datetime'] < '2020-03-30')]

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,datetime,DST
2136,2136,65.9,56.4,55.0,3.7,9.0,2020-03-29 00:00:00,False
2137,2137,67.5,55.0,52.4,3.4,9.0,2020-03-29 02:00:00,True
2138,2138,68.0,52.7,46.7,3.3,4.0,2020-03-29 03:00:00,True
2139,2139,56.6,47.6,42.8,2.9,13.0,2020-03-29 04:00:00,True
2140,2140,63.6,48.3,36.9,2.8,10.0,2020-03-29 05:00:00,True
2141,2141,58.3,49.1,35.0,3.1,10.0,2020-03-29 06:00:00,True
2142,2142,54.5,51.2,35.0,3.4,10.0,2020-03-29 07:00:00,True
2143,2143,61.3,49.1,32.3,3.3,17.0,2020-03-29 08:00:00,True
2144,2144,61.7,50.4,29.7,3.3,24.0,2020-03-29 09:00:00,True
2145,2145,62.8,53.2,26.6,3.5,41.0,2020-03-29 10:00:00,True


In [12]:
# Show october 25th 2020
aq_data[(aq_data['datetime'] >= '2020-10-25') & (aq_data['datetime'] < '2020-10-26')]

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,datetime,DST
7175,7175,30.0,28.8,39.8,3.8,1.0,2020-10-25 00:00:00,True
7176,7176,31.0,29.4,36.6,3.6,0.0,2020-10-25 01:00:00,True
7177,7177,32.1,30.6,33.4,3.5,1.0,2020-10-25 02:00:00,True
7178,7178,33.6,32.2,31.9,3.5,1.0,2020-10-25 02:00:00,False
7179,7179,36.8,33.9,28.8,2.8,2.0,2020-10-25 03:00:00,False
7180,7180,39.1,31.4,24.7,2.4,1.0,2020-10-25 04:00:00,False
7181,7181,35.3,30.3,23.5,2.4,1.0,2020-10-25 05:00:00,False
7182,7182,36.6,35.0,21.2,2.4,1.0,2020-10-25 06:00:00,False
7183,7183,35.8,34.2,24.5,2.9,2.0,2020-10-25 07:00:00,False
7184,7184,36.4,32.3,25.4,3.1,6.0,2020-10-25 08:00:00,False


As it stands now, the data is still in GMT+00 with DST.

In GMT time DST starts and stops at 1am, in Milan its at 2am and 3am

We still need to add 1 hour to entire data to represent local Milan time

In [13]:
#Turn into GMT+01/02 with time offset
aq_data['datetime'] = aq_data['datetime'] + timedelta(hours=1)
aq_data[(aq_data['datetime'] >= '2020-03-29') & (aq_data['datetime'] < '2020-03-30')]

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,datetime,DST
2135,2135,70.3,53.2,53.5,3.0,5.0,2020-03-29 00:00:00,False
2136,2136,65.9,56.4,55.0,3.7,9.0,2020-03-29 01:00:00,False
2137,2137,67.5,55.0,52.4,3.4,9.0,2020-03-29 03:00:00,True
2138,2138,68.0,52.7,46.7,3.3,4.0,2020-03-29 04:00:00,True
2139,2139,56.6,47.6,42.8,2.9,13.0,2020-03-29 05:00:00,True
2140,2140,63.6,48.3,36.9,2.8,10.0,2020-03-29 06:00:00,True
2141,2141,58.3,49.1,35.0,3.1,10.0,2020-03-29 07:00:00,True
2142,2142,54.5,51.2,35.0,3.4,10.0,2020-03-29 08:00:00,True
2143,2143,61.3,49.1,32.3,3.3,17.0,2020-03-29 09:00:00,True
2144,2144,61.7,50.4,29.7,3.3,24.0,2020-03-29 10:00:00,True


In [14]:
aq_data[(aq_data['datetime'] >= '2020-10-25') & (aq_data['datetime'] < '2020-10-26')]

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,datetime,DST
7174,7174,29.3,28.9,41.5,3.9,0.0,2020-10-25 00:00:00,True
7175,7175,30.0,28.8,39.8,3.8,1.0,2020-10-25 01:00:00,True
7176,7176,31.0,29.4,36.6,3.6,0.0,2020-10-25 02:00:00,True
7177,7177,32.1,30.6,33.4,3.5,1.0,2020-10-25 03:00:00,True
7178,7178,33.6,32.2,31.9,3.5,1.0,2020-10-25 03:00:00,False
7179,7179,36.8,33.9,28.8,2.8,2.0,2020-10-25 04:00:00,False
7180,7180,39.1,31.4,24.7,2.4,1.0,2020-10-25 05:00:00,False
7181,7181,35.3,30.3,23.5,2.4,1.0,2020-10-25 06:00:00,False
7182,7182,36.6,35.0,21.2,2.4,1.0,2020-10-25 07:00:00,False
7183,7183,35.8,34.2,24.5,2.9,2.0,2020-10-25 08:00:00,False


In [15]:
aq_data.rename(columns={'datetime': 'localtime'}, inplace=True)

In [16]:
display(aq_data)

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,localtime,DST
0,0,85.9,76.6,38.5,4.5,2.0,2019-12-31 01:00:00,False
1,1,84.5,76.9,35.2,4.9,4.0,2019-12-31 02:00:00,False
2,2,83.0,76.4,35.1,5.1,5.0,2019-12-31 03:00:00,False
3,3,80.7,74.9,33.5,4.8,2.0,2019-12-31 04:00:00,False
4,4,77.6,73.1,29.6,3.9,3.0,2019-12-31 05:00:00,False
...,...,...,...,...,...,...,...,...
35083,35083,38.1,31.8,27.2,2.0,2.0,2023-12-31 20:00:00,False
35084,35084,32.1,29.4,31.0,2.0,1.0,2023-12-31 21:00:00,False
35085,35085,34.6,30.6,32.2,2.2,1.0,2023-12-31 22:00:00,False
35086,35086,35.1,27.8,28.4,2.2,1.0,2023-12-31 23:00:00,False


In [17]:
aq_data.drop(columns=['DST'], inplace=True)

In [18]:
aq_data.tail()

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,localtime
35083,35083,38.1,31.8,27.2,2.0,2.0,2023-12-31 20:00:00
35084,35084,32.1,29.4,31.0,2.0,1.0,2023-12-31 21:00:00
35085,35085,34.6,30.6,32.2,2.2,1.0,2023-12-31 22:00:00
35086,35086,35.1,27.8,28.4,2.2,1.0,2023-12-31 23:00:00
35087,35087,35.8,33.1,29.1,2.1,0.0,2024-01-01 00:00:00


In [19]:
#let's clip last row
aq_data = aq_data[:-1]

In [20]:
aq_data.tail()

Unnamed: 0.1,Unnamed: 0,pm10,pm2_5,nitrogen_dioxide,sulphur_dioxide,ozone,localtime
35082,35082,35.9,33.6,26.4,1.8,7.0,2023-12-31 19:00:00
35083,35083,38.1,31.8,27.2,2.0,2.0,2023-12-31 20:00:00
35084,35084,32.1,29.4,31.0,2.0,1.0,2023-12-31 21:00:00
35085,35085,34.6,30.6,32.2,2.2,1.0,2023-12-31 22:00:00
35086,35086,35.1,27.8,28.4,2.2,1.0,2023-12-31 23:00:00


We're not clipping data for the last day of 2019 as we will need it to properly aggregate the data.

In [21]:
#aq_data.to_csv("./AQ_APIdata_localtime.csv", index=False)