In [1]:
# Libraries Imports
#-----------------------------------------------------------------------------------------------------
from configparser import ConfigParser
from datetime import date, timedelta
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine


In [2]:
# Load connection configuration file
#-----------------------------------------------------------------------------------------------------
config_filename = "./GetWeather.ini"

# Read the .ini file.
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so that list is what has to be checked. Testing the
# parser object itself is useless: it always holds the DEFAULT section and is
# therefore always truthy.
config_object = ConfigParser()
loaded_files = config_object.read(config_filename)
if not loaded_files:
    raise FileNotFoundError(
        "Error while loading configuration !!! Could not read '{}'".format(config_filename)
    )
if not config_object.has_section('INFO'):
    raise KeyError(
        "Error while loading configuration !!! Section [INFO] missing in '{}'".format(config_filename)
    )

# Settings of the [INFO] section; later cells read 'user', 'password' and 'host' from it
cfg = config_object['INFO']

In [3]:
# Establish connection with DB (using sqlalchemy)
#-----------------------------------------------------------------------------------------------------
# The credentials are percent-encoded so that characters such as '@', ':' or '/'
# inside the user name or password cannot be mistaken for URL delimiters.
# The host is left untouched because it may carry a ':port' suffix.
db_connection_str = "mysql+pymysql://{}:{}@{}/WeatherDB".format(
    quote_plus(cfg['user']),
    quote_plus(cfg['password']),
    cfg['host'],
)
db_connection = create_engine(db_connection_str)

In [10]:
# Number of records stored for each location, largest count first
#-----------------------------------------------------------------------------------------------------
records_per_location_sql = """SELECT Locations.name as name, COUNT(*) as nbRecords
FROM Records
INNER JOIN Locations ON Records.id_location = Locations.id
GROUP BY id_location
ORDER BY nbRecords DESC;"""

raw_data = pd.read_sql(sql=records_per_location_sql, con=db_connection)
pdata = pd.DataFrame(data=raw_data)

# Show the first rows as a quick sanity check that the query returned data
pdata.head(n=10)


Unnamed: 0,name,nbRecords
0,Hérémence,1883
1,Geneva,1709
2,Lausanne,1708
3,Sion,1561
4,Evolène,1186


In [None]:
# Get average values on all records per locations
#-----------------------------------------------------------------------------------------------------
# NOTE(review): the temperature offset was 271.15, which is not a Kelvin -> Celsius
# offset; 273.15 is used here on the assumption that Records.temp is stored in
# Kelvin -- confirm against the data source.
raw_data = pd.read_sql(
"""SELECT
        id_location,
        Locations.name,
        ROUND(AVG(pressure),2) AS avgPress,
        ROUND(AVG(temp - 273.15),2) AS avgTemp,
        ROUND(AVG(humidity),2) AS avgHumidity
    FROM Records
    INNER JOIN Locations ON id_location = Locations.id
    GROUP BY id_location;""", con=db_connection)
pdata = pd.DataFrame(raw_data)

# Display header just to be sure we have something
pdata.head(10)


In [None]:
# Get minimum values per locations
#-----------------------------------------------------------------------------------------------------
# NOTE(review): the temperature offset was 271.15, which is not a Kelvin -> Celsius
# offset; 273.15 is used here on the assumption that Records.temp is stored in
# Kelvin -- confirm against the data source.
# The statement now ends with a single ';' (the doubled ';;' left an empty
# second statement behind the query).
raw_data = pd.read_sql(
"""SELECT
        id_location,
        Locations.name,
        ROUND(MIN(pressure),2) AS MINPress,
        ROUND(MIN(temp - 273.15),2) AS MINTemp,
        ROUND(MIN(humidity),2) AS MINHumidity
    FROM Records
    INNER JOIN Locations ON id_location = Locations.id
    GROUP BY id_location;""", con=db_connection)
pdata = pd.DataFrame(raw_data)

# Display header just to be sure we have something
pdata.head(10)

In [4]:
# Hourly statistics for one day at one location, via the Get_Hourly_Stats stored procedure
#-----------------------------------------------------------------------------------------------------
my_date = '2021-08-10'
loc = 6

hourly_stats_call = """ call Get_Hourly_Stats('{}',{});""".format(my_date, loc)
raw_data = pd.read_sql(hourly_stats_call, con=db_connection)
pdata = pd.DataFrame(data=raw_data)

# Show up to 30 rows as a quick sanity check that the call returned data
pdata.head(n=30)

Unnamed: 0,date_timestamp,Hour,nbRec,minTemp,maxTemp,avgTemp,minFeelsLike,maxFeelsLike,avgFeelsLike,minPressure,...,avgPressure,minHumidity,maxHumidity,avgHumidity,avgWindSpeed,gust,avgWindDir,avgCloudsCover,sumRain,sumSnow
0,2021-08-10 00:37:09,0,15,13.4,14.24,13.66,12.38,13.2,12.72,1021.0,...,1021.67,63.0,72.0,69.13,1.36,2.57,67.07,12.27,0.0,0.0
1,2021-08-10 02:01:39,2,10,12.82,13.52,13.18,11.87,12.51,12.21,1022.0,...,1022.0,67.0,72.0,69.5,1.02,1.54,0.0,0.0,0.0,0.0
2,2021-08-10 03:00:14,3,10,12.72,13.27,13.03,11.84,12.45,12.16,1022.0,...,1022.0,72.0,75.0,74.3,1.23,1.54,27.0,0.0,0.0,0.0
3,2021-08-10 04:03:31,4,10,12.4,12.95,12.68,11.54,12.07,11.81,1022.0,...,1022.0,74.0,77.0,75.5,1.54,1.54,70.0,0.0,0.0,0.0
4,2021-08-10 05:02:43,5,10,11.24,12.4,11.76,10.32,11.57,10.89,1022.0,...,1022.0,78.0,81.0,79.8,1.85,2.06,80.0,0.0,0.0,0.0
5,2021-08-10 06:04:30,6,10,11.09,11.35,11.25,10.14,10.92,10.61,1022.0,...,1022.0,81.0,82.0,81.7,1.75,2.06,88.0,0.0,0.0,0.0
6,2021-08-10 07:04:09,7,9,11.11,13.09,12.05,9.79,12.35,11.06,1022.0,...,1022.0,79.0,84.0,81.89,2.57,2.57,71.11,20.0,0.0,0.0
7,2021-08-10 08:02:45,8,10,13.34,15.5,14.45,12.58,14.82,13.7,1021.0,...,1021.5,72.0,77.0,73.3,2.57,2.57,75.0,20.0,0.0,0.0
8,2021-08-10 09:04:04,9,5,15.7,17.33,16.44,14.91,16.49,15.64,1021.0,...,1021.0,59.0,67.0,63.8,1.85,2.06,90.0,12.0,0.0,0.0
9,2021-08-10 10:30:56,10,6,19.09,21.27,19.99,18.35,20.31,19.16,1020.0,...,1020.67,39.0,56.0,49.0,1.46,1.54,91.0,33.33,0.0,0.0


In [57]:
# Get hourly stats for a given range of day(s) and a single location
#-----------------------------------------------------------------------------------------------------
def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Days are produced one ``timedelta`` day apart, starting at ``start_date``.
    Nothing is yielded when the day difference between the two is zero or negative.
    """
    span_in_days = (end_date - start_date).days
    yield from (start_date + timedelta(days=offset) for offset in range(span_in_days))

# work variables for aggregation
start_date = date(2021,1,1)
end_date = date(2022,1,1)   # exclusive upper bound (daterange stops the day before)
loc = 6

# Collect one result frame per day and concatenate once at the end.
# Growing a DataFrame inside the loop copies all previous rows on every iteration,
# and DataFrame.append no longer exists in pandas 2.0.
daily_frames = []
for my_date in daterange(start_date, end_date):
    query = "call Get_Hourly_Stats('{}',{});".format(my_date, loc)
    daily_frames.append(pd.read_sql(query, con=db_connection))

# pd.concat raises on an empty list, so fall back to an empty frame for an empty date range.
# Row labels of the daily frames are kept as returned (no re-indexing here).
pdata = pd.concat(daily_frames) if daily_frames else pd.DataFrame([])

nb_days = len(daily_frames)
print("{} aggregate(s) created for a period of {} day(s).".format(pdata.shape, nb_days))

(316, 21) aggregate(s) created for a period of 365 day(s).


In [None]:
# reset_index returns a new frame; assign it back so that pdata really gets a
# fresh 0..n-1 index (the bare call only displayed the re-indexed copy).
pdata = pdata.reset_index(drop=True)
pdata

In [5]:
# Monthly statistics for the given date and location, via the Get_Monthly_Stats stored procedure
my_date = '2021-08-23'
loc = 6
query = "call Get_Monthly_Stats('{}',{});".format(my_date, loc)

# Read the result and expose the 'date_timestamp' column under the name 'timestamp'
pdata = pd.read_sql(query, con=db_connection).rename(columns={'date_timestamp': 'timestamp'})

# Add a leading 'id' column filled with 0
pdata.insert(loc=0, column='id', value=0)
pdata.head(n=5)



Unnamed: 0,id,timestamp,month,nbRec,minTemp,maxTemp,avgTemp,minFeelsLike,maxFeelsLike,avgFeelsLike,...,avgPressure,minHumidity,maxHumidity,avgHumidity,avgWindSpeed,maxWindSpeed,avgWindDir,avgCloudsCover,sumRain,sumSnow
0,0,2021-08-01 00:09:49,8,1112,8.36,29.28,18.23,8.36,29.06,17.67,...,1020.12,36.0,97.0,63.72,2.1,8.23,97.35,19.06,28.48,0.0


In [91]:
# Append the rows currently held in pdata to the RecordsByHour table.
# The DataFrame index is not written. Because of if_exists='append', running
# this cell again inserts the rows again.
# NOTE(review): the cell just above fills pdata from Get_Monthly_Stats -- confirm
# that RecordsByHour is the intended target table.
pdata.to_sql(
    name='RecordsByHour',
    con=db_connection,
    if_exists='append',
    index=False,
)

In [None]:
# Print every time zone name exposed by pytz, one per line
from pytz import all_timezones
print(*all_timezones, sep="\n")