In [1]:
# Libraries Imports
#-----------------------------------------------------------------------------------------------------
from configparser import ConfigParser
from sqlalchemy import create_engine
import pandas as pd
from datetime import date, timedelta


In [2]:
# Load connexion configuration file
#-----------------------------------------------------------------------------------------------------
config_filename = "./GetWeather.ini"

# Read config.ini file
config_object = ConfigParser()
config_object.read(config_filename)
if  not config_object:
    print("Error while loading configuration !!!")

# Log initialization
cfg = config_object['INFO']

In [3]:
# Establish connection with DB (using sqlalchemy)
#-----------------------------------------------------------------------------------------------------
db_connection_str = "mysql+pymysql://{}:{}@{}/WeatherDB".format(cfg['user'],cfg['password'],cfg['host'])
db_connection = create_engine(db_connection_str)

In [5]:
# Count sorted number of records per locations
#-----------------------------------------------------------------------------------------------------
raw_data = pd.read_sql(
"""SELECT Locations.name as name, COUNT(*) as nbRecords
FROM Records
INNER JOIN Locations ON RawRecords.id_location = Locations.id
GROUP BY id_location
ORDER BY nbRecords DESC;""", con=db_connection)
pdata = pd.DataFrame(raw_data)

# Display header just to be sure we have something
pdata.head(10)


ProgrammingError: (pymysql.err.ProgrammingError) (1146, "Table 'WeatherDB.Records' doesn't exist")
[SQL: SELECT Locations.name as name, COUNT(*) as nbRecords
FROM Records
INNER JOIN Locations ON RawRecords.id_location = Locations.id
GROUP BY id_location
ORDER BY nbRecords DESC;]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [None]:
# Get average values on all records per locations
#-----------------------------------------------------------------------------------------------------
raw_data = pd.read_sql(
"""SELECT 
	    id_location, 
        Locations.name,
	    ROUND(AVG(pressure),2) AS avgPress,
	    ROUND(AVG(temp - 271.15),2) AS avgTemp,
	    ROUND(AVG(humidity),2) AS avgHumidity
    FROM Records
    INNER JOIN Locations on id_location = Locations.id
    GROUP BY id_location;""", con=db_connection)
pdata = pd.DataFrame(raw_data)

# Display header just to be sure we have something
pdata.head(10)


In [None]:
# Get minimum values per locations
#-----------------------------------------------------------------------------------------------------
raw_data = pd.read_sql(
""" SELECT 
	    id_location, 
        Locations.name,
	    ROUND(MIN(pressure),2) AS MINPress,
	    ROUND(MIN(temp - 271.15),2) AS MINTemp,
	    ROUND(MIN(humidity),2) AS MINHumidity
    FROM Records
    Inner join Locations on id_location = Locations.id
    group by id_location;;""", con=db_connection)
pdata = pd.DataFrame(raw_data)

# Display header just to be sure we have something
pdata.head(10)

In [None]:
# Hourly stats for a given day at a given location
#-----------------------------------------------------------------------------------------------------
my_date='2021-08-10'
loc = 6
raw_data = pd.read_sql(""" call Get_Hourly_Stats('{}',{});""".format(my_date, loc), con=db_connection)
pdata = pd.DataFrame(raw_data)

# Display header just to be sure we have something
pdata.head(30)

In [57]:
# Get hourly stats for a given range of day(s) and a single location
#-----------------------------------------------------------------------------------------------------
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

# work variables for aggregation
start_date = date(2021,1,1)
end_date = date(2022,1,1)
loc = 6
pdata = pd.DataFrame([])
flagFirstRec = True
iter = 0

for my_date in daterange(start_date, end_date):
    iter += 1
    query = "call Get_Hourly_Stats('{}',{});".format(my_date, loc)
    raw_data = pd.read_sql(query, con=db_connection)
   
    if flagFirstRec:
        pdata = pd.DataFrame(raw_data)
        flagFirstRec = False
    else:
        pdata = pdata.append(pd.DataFrame(raw_data))
    
print("{} aggregate(s) created for a period of {} day(s).".format(pdata.shape,iter))

(316, 21) aggregate(s) created for a period of 365 day(s).


In [None]:
#pdata.insert(0, 'idx', range(0,len(pdata)))
pdata.reset_index(drop=True)

In [87]:
my_date = '2021-08-23'
loc = 6
query = "call Get_Hourly_Stats('{}',{});".format(my_date, loc)

pdata = pd.read_sql(query, con=db_connection)
pdata.rename(columns = {'date_timestamp':'timestamp'}, inplace = True)
pdata.insert(0,'id',0)
pdata.head(5)



Unnamed: 0,id,timestamp,Hour,nbRec,minTemp,maxTemp,avgTemp,minFeelsLike,maxFeelsLike,avgFeelsLike,...,avgPressure,minHumidity,maxHumidity,avgHumidity,avgWindSpeed,gust,avgWindDir,avgCloudsCover,sumRain,sumSnow
0,0,2021-08-23 00:01:57,0,3,14.75,15.56,15.29,14.1,14.96,14.67,...,1021.0,75.0,76.0,75.33,1.54,1.54,0.0,52.67,0.0,0.0
1,0,2021-08-23 02:22:15,2,1,14.06,14.06,14.06,13.42,13.42,13.42,...,1021.0,79.0,79.0,79.0,1.03,1.03,0.0,0.0,0.0,0.0
2,0,2021-08-23 03:22:16,3,1,13.66,13.66,13.66,13.03,13.03,13.03,...,1021.0,81.0,81.0,81.0,1.03,1.03,0.0,0.0,0.0,0.0
3,0,2021-08-23 04:22:18,4,1,13.34,13.34,13.34,12.73,12.73,12.73,...,1021.0,83.0,83.0,83.0,2.57,2.57,90.0,0.0,0.0,0.0
4,0,2021-08-23 05:22:19,5,1,12.81,12.81,12.81,12.25,12.25,12.25,...,1021.0,87.0,87.0,87.0,2.57,2.57,90.0,0.0,0.0,0.0


In [91]:
pdata.to_sql('RecordsByHour', con=db_connection, if_exists='append',
           index=False)

In [None]:
from pytz import all_timezones
for timezone in all_timezones:
    print(timezone)