In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import holidays

In [2]:
# Province codes that the holidays package lists for Germany; used below as
# the keys of the per-province holiday calendars.
# NOTE(review): recent releases of the holidays package reportedly expose this
# as `subdivisions` and deprecate `PROVINCES` — confirm against the installed version.
german_provinces = holidays.DE.PROVINCES

In [3]:
# One holiday calendar per German province code, covering 2020 and 2021.
holiday_dict = {
    prov_code: holidays.DE(years=[2020, 2021], prov=prov_code)
    for prov_code in german_provinces
}

In [4]:
wtr = pd.read_csv('weather_data.csv')
# wtr = wtr[wtr.district_name != 'LK Göttingen (alt)']
# 'LK Göttingen (alt)' was an artifact in the RKI data: a single row carried that value.
# It has been corrected in the file imported here; hopefully up-to-date RKI data is clean as well.

In [5]:
# Convert the 'date' column with pandas' datetime parser, replacing it in place.
wtr['date'] = pd.to_datetime(wtr['date'])

In [6]:
# Weekday number of each observation (pandas convention: Monday=0 ... Sunday=6).
wtr['day'] = wtr['date'].dt.weekday

# ISO-8601 week number of each observation.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0, so
# the ISO calendar accessor is used instead; it yields the same week numbers.
# The cast keeps the plain int64 dtype the old accessor produced.
# NOTE(review): the cast assumes 'date' has no missing values — confirm.
wtr['week_no'] = wtr['date'].dt.isocalendar().week.astype('int64')

In [7]:
# wtr.drop_duplicates(inplace=True)

In [8]:
# Thousands part of the district ID -> two-letter province code, i.e. the key
# format used by the per-province holiday calendars.
prov_ags_dict = {
    1: 'SH',
    2: 'HH',
    3: 'NI',
    4: 'HB',
    5: 'NW',
    6: 'HE',
    7: 'RP',
    8: 'BW',
    9: 'BY',
    10: 'SL',
    11: 'BE',
    12: 'BB',
    13: 'MV',
    14: 'SN',
    15: 'ST',
    16: 'TH',
}

# Dropping the last three digits of the district ID leaves the province number,
# which is then translated through the lookup table above.
province_number = np.floor(wtr['districtId'] / 1000)
wtr['province'] = province_number.map(prov_ags_dict)

In [9]:
# Flag each observation whose date is a public holiday in its own province.
# The column is built in a single pass rather than written cell-by-cell via
# `wtr.holiday.at[i] = ...`: that form is chained assignment, which pandas does
# not guarantee to write back into `wtr` (it is a no-op under copy-on-write),
# and it left an object column seeded with None instead of a boolean one.
wtr['holiday'] = [
    obs_date in holiday_dict[prov_code]
    for obs_date, prov_code in zip(wtr['date'], wtr['province'])
]

# 'day' holds the weekday number; 5 and 6 are treated as the weekend.
wtr['weekend'] = wtr['day'].isin([5, 6])

# A day off is a public holiday or a weekend day; everything else is a workday.
wtr['day_off'] = wtr['holiday'] | wtr['weekend']
wtr['workday'] = ~wtr['day_off']

In [10]:
# for wcol in ['temperature','humidity','precipitation','sunshine','velocity','direction']:
#     for work in ['workday','day_off']:
#         var_name = wcol + '_' + work
#         wtr[var_name] = wtr[wcol] * wtr[work]

In [11]:
# Split the observations by day type. The copies are independent frames, so
# later changes to them do not touch `wtr`.
wtr_work = wtr.loc[wtr['workday']].copy()
wtr_off = wtr.loc[wtr['day_off']].copy()

In [12]:
# Weather measurements to average.
wtr_cols = ['temperature', 'humidity', 'precipitation', 'sunshine', 'velocity', 'direction']
group_keys = ['districtId', 'week_no']

# Mean of every measurement per district and week, computed three times:
# workdays only, days off only, and all days (in that order).
out_dfs = [
    frame.groupby(group_keys)[wtr_cols].mean().reset_index()
    for frame in (wtr_work, wtr_off, wtr)
]

In [13]:
# Join the three weekly aggregates side by side on district and week.
# Both joins use the default inner merge, so a district-week that is missing
# from any one of the three tables is dropped from the result.
merge_keys = ['districtId', 'week_no']
workday_means, day_off_means, all_day_means = out_dfs
wtr_fts = (
    workday_means
    .merge(day_off_means, on=merge_keys, suffixes=('_workday', '_day_off'))
    .merge(all_day_means, on=merge_keys)
)

In [14]:
# Display the merged weekly feature table.
wtr_fts

Unnamed: 0,districtId,week_no,temperature_workday,humidity_workday,precipitation_workday,sunshine_workday,velocity_workday,direction_workday,temperature_day_off,humidity_day_off,precipitation_day_off,sunshine_day_off,velocity_day_off,direction_day_off,temperature,humidity,precipitation,sunshine,velocity,direction
0,1001,1,3.613336,93.792429,3.402688,0.000000,5.662172,241.271102,4.339365,84.897609,0.842699,69.030477,4.942204,260.704731,4.048953,88.455537,1.866694,41.418286,5.230191,252.931280
1,1001,2,6.619149,92.035929,5.361398,10.080127,4.945447,232.647987,5.319712,89.757973,6.084063,2.076286,6.493203,226.572160,6.247881,91.385084,5.567874,7.793315,5.387663,230.912037
2,1001,3,7.457608,86.379880,2.967453,79.762874,6.156738,206.561340,4.692887,86.434985,1.295414,213.754786,4.587267,272.196247,6.667687,86.395625,2.489728,118.046278,5.708318,225.314171
3,1001,4,5.501040,93.349986,0.574046,37.813071,4.389905,258.813990,4.117203,94.292722,0.582729,45.085650,4.394525,216.644263,5.105658,93.619339,0.576527,39.890951,4.391225,246.765497
4,1001,5,6.274828,90.868050,7.237159,11.812334,5.870143,230.361479,7.172883,86.127561,1.714058,15.074964,6.669318,256.089283,6.531415,89.513625,5.659130,12.744514,6.098478,237.712280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16035,16077,36,15.667168,75.944043,2.861186,258.732843,2.395088,220.844202,14.631362,77.898252,0.512500,274.866643,1.741328,243.272514,15.371224,76.502388,2.190133,263.342500,2.208300,227.252291
16036,16077,37,16.023616,67.749303,0.128934,610.185165,2.009237,213.158872,18.212622,68.869065,0.000000,663.913895,1.834656,204.044996,16.649046,68.069235,0.092096,625.536231,1.959357,210.554907
16037,16077,38,17.671804,65.536756,0.048212,622.040051,2.159913,161.530035,12.985157,61.716078,0.000000,679.658127,1.913355,99.370058,16.332762,64.445134,0.034437,638.502359,2.089468,143.770042
16038,16077,39,16.293061,68.524138,2.774485,399.705482,2.407687,191.380813,8.366334,87.432276,16.906402,58.861520,2.129271,216.799368,14.028282,73.926463,6.812176,302.321493,2.328139,198.643258


In [15]:
# Write the weekly feature table to disk; index=False leaves the row index out of the file.
wtr_fts.to_csv('weather_features.csv',index=False)

In [16]:
# Sorted array of the distinct district IDs present in the weather data.
np.unique(wtr.districtId)

array([ 1001,  1002,  1003,  1004,  1051,  1053,  1054,  1055,  1056,
        1057,  1058,  1059,  1060,  1061,  1062,  2000,  3101,  3102,
        3103,  3151,  3153,  3154,  3155,  3157,  3158,  3159,  3241,
        3251,  3252,  3254,  3255,  3256,  3257,  3351,  3352,  3353,
        3354,  3355,  3356,  3357,  3358,  3359,  3360,  3361,  3401,
        3402,  3403,  3404,  3405,  3451,  3452,  3453,  3454,  3455,
        3456,  3457,  3458,  3459,  3460,  3461,  3462,  4011,  4012,
        5111,  5112,  5113,  5114,  5116,  5117,  5119,  5120,  5122,
        5124,  5154,  5158,  5162,  5166,  5170,  5314,  5315,  5316,
        5334,  5358,  5362,  5366,  5370,  5374,  5378,  5382,  5512,
        5513,  5515,  5554,  5558,  5562,  5566,  5570,  5711,  5754,
        5758,  5762,  5766,  5770,  5774,  5911,  5913,  5914,  5915,
        5916,  5954,  5958,  5962,  5966,  5970,  5974,  5978,  6411,
        6412,  6413,  6414,  6431,  6432,  6433,  6434,  6435,  6436,
        6437,  6438,