In [3]:
import pandas as pd

from sp_project.data_preparation.db_client import get_global_db_client

%autoawait asyncio

In [5]:
def _average_hourly_precipitation(entries) -> float:
    """Return one day's precipitation averaged over 24 hours.

    ``entries`` is the list collected by the ``$push`` accumulator: each item
    is expected to be a mapping such as ``{"1h": 0.4}`` whose key encodes a
    period in hours and whose value is the amount for that period.
    Empty or null items are skipped.
    """
    total = 0
    for entry in entries:
        if not entry:
            # Nothing recorded for this observation (None / empty mapping).
            continue
        for period, amount in entry.items():
            # "3h" -> 3; the amount is weighted by the length of its period.
            total += amount * int(period.strip('h'))
    return total / 24


async def extract_data_daily() -> pd.DataFrame:
    """Extract the daily averages of all the interesting datapoints including hours of daylight.

    Runs an aggregation on the ``openweather`` collection that groups the
    observations by calendar date (first 10 characters of ``dt``) and returns
    one row per day.

    Returns:
        A DataFrame indexed by a UTC ``date`` DatetimeIndex (sorted ascending)
        with the columns ``sunhours``, ``avg_temp``, ``min_temp``,
        ``max_temp``, ``uvi``, ``wind_speed``, ``clouds``, ``rain`` and
        ``snow``. The three temperature columns have 273 subtracted.
    """

    collection = get_global_db_client().openweather

    pipeline = [
        {
            '$addFields': {
                'date': {
                    '$substr': [
                        '$dt', 0, 10
                    ]
                },
                # NOTE(review): despite the name, this is the sunrise->sunset
                # difference in *minutes* -- confirm the intended unit.
                'sunhours': {
                    '$dateDiff': {
                        'startDate': '$sunrise',
                        'endDate': '$sunset',
                        'unit': 'minute'
                    }
                },
                # Readings below 200 are shifted up by 10000; presumably this
                # repairs records stored with a bad offset -- TODO confirm.
                'temp': {
                    '$cond': [
                        {
                            '$gte': [
                                '$temp', 200
                            ]
                        }, '$temp', {
                            '$add': [
                                '$temp', 10000
                            ]
                        }
                    ]
                }
            }
        }, {
            '$group': {
                '_id': '$date',
                'sunhours': {
                    '$avg': '$sunhours'
                },
                'avg_temp': {
                    '$avg': '$temp'
                },
                'min_temp': {
                    '$min': '$temp'
                },
                'max_temp': {
                    '$max': '$temp'
                },
                'uvi': {
                    '$avg': '$uvi'
                },
                'wind_speed': {
                    '$avg': '$wind_speed'
                },
                'clouds': {
                    '$avg': '$clouds'
                },
                'rain': {
                    '$push': '$rain'
                },
                'snow': {
                    '$push': '$snow'
                }
            }
        }
    ]

    results = []
    async for day in collection.aggregate(pipeline):
        # Collapse the pushed per-observation mappings into one daily number.
        # Rain and snow are computed independently from their own values.
        day['rain'] = _average_hourly_precipitation(day['rain'])
        day['snow'] = _average_hourly_precipitation(day['snow'])
        results.append(day)

    df = pd.DataFrame(results)
    df = df.set_index("_id")
    df = df.set_index(pd.to_datetime(df.index).tz_localize("UTC").rename("date"))
    df = df.sort_index()

    # NOTE(review): the exact Kelvin->Celsius offset is 273.15; 273 is kept
    # so that existing downstream numbers do not shift.
    temp_columns = ["avg_temp", "min_temp", "max_temp"]
    df[temp_columns] = df[temp_columns] - 273

    return df

In [6]:
async def extract_heatingdemand() -> pd.DataFrame:
    """Return the daily mean heating demand derived from the temperature readings.

    For every observation the demand is ``288 - temp`` when ``temp <= 288``
    and ``0`` otherwise; the per-day average of that value is returned.
    The author describes the 288 threshold as 14 °C
    (NOTE(review): 288 K is roughly 14.85 °C -- confirm the intended threshold).

    Returns:
        A DataFrame with a single ``heating_demand`` column, indexed by a
        UTC ``date`` DatetimeIndex sorted ascending.
    """
    # Stage 1: derive the calendar date and shift readings below 200 up by 10000.
    normalise_stage = {
        '$addFields': {
            'date': {'$substr': ['$dt', 0, 10]},
            'temp': {
                '$cond': [
                    {'$gte': ['$temp', 200]},
                    '$temp',
                    {'$add': ['$temp', 10000]},
                ]
            },
        }
    }
    # Stage 2: per-observation shortfall below the 288 threshold (never negative).
    demand_stage = {
        '$addFields': {
            'heatingdemand': {
                '$cond': {
                    'if': {'$lte': ['$temp', 288]},
                    'then': {'$subtract': [288, '$temp']},
                    'else': 0,
                }
            }
        }
    }
    # Stage 3: average the shortfall per day.
    daily_stage = {
        '$group': {
            '_id': '$date',
            'heating_demand': {'$avg': '$heatingdemand'},
        }
    }

    weather = get_global_db_client().openweather
    cursor = weather.aggregate([normalise_stage, demand_stage, daily_stage])
    records = [record async for record in cursor]

    frame = pd.DataFrame(records).set_index("_id")
    utc_dates = pd.to_datetime(frame.index).tz_localize("UTC").rename("date")
    return frame.set_index(utc_dates).sort_index()

In [7]:
async def extract_windpower() -> pd.DataFrame:
    """Return the daily mean of the squared wind speed, used here as the wind-power figure.

    NOTE(review): physical wind power is usually taken to scale with the cube
    of the wind speed -- confirm that the exponent 2 is intended.

    Returns:
        A DataFrame with a single ``windpower`` column, indexed by a UTC
        ``date`` DatetimeIndex sorted ascending.
    """
    # Stage 1: calendar date = first 10 characters of ``dt``.
    date_stage = {
        '$addFields': {
            'date': {'$substr': ['$dt', 0, 10]},
        }
    }
    # Stage 2: per-day average of wind_speed ** 2.
    daily_stage = {
        '$group': {
            '_id': '$date',
            'windpower': {'$avg': {'$pow': ['$wind_speed', 2]}},
        }
    }

    weather = get_global_db_client().openweather
    cursor = weather.aggregate([date_stage, daily_stage])
    records = [record async for record in cursor]

    frame = pd.DataFrame(records).set_index("_id")
    utc_dates = pd.to_datetime(frame.index).tz_localize("UTC").rename("date")
    return frame.set_index(utc_dates).sort_index()

In [9]:
df_daily = await(extract_data_daily())
df_heatingdemand = await(extract_heatingdemand())
df_windpower = await(extract_windpower())

In [10]:
# Display the daily wind-power frame (rich repr of the cell's last expression).
df_windpower

Unnamed: 0_level_0,windpower
date,Unnamed: 1_level_1
2022-10-30 00:00:00+00:00,2.359957
2022-10-31 00:00:00+00:00,2.428081
2022-11-01 00:00:00+00:00,3.173102
2022-11-02 00:00:00+00:00,1.949441
2022-11-03 00:00:00+00:00,4.839700
...,...
2023-04-30 00:00:00+00:00,7.287211
2023-05-01 00:00:00+00:00,3.113731
2023-05-02 00:00:00+00:00,6.582331
2023-05-03 00:00:00+00:00,9.776797
