In [57]:
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn

In [2]:
def transform_station(df):
    df['dtime'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
    return df

In [3]:
def transform_model(df):
    df['dtime'] = pd.to_datetime(df['valid_time'], format='%Y-%m-%d %H:%M:%S')
    return df

In [4]:
def load():
    # loading dataset
    #X_station_train_df = pd.read_csv("../../Hackathon/Train/X_station_train.csv", index_col=["Id"])
    X_station_train_df = pd.read_csv("../../Hackathon/Train/X_station_train.csv")

    # transform whole dataset
    # X_station_train_df = transform_station(X_station_train_df)
    # X_station_train_df.head()

    # loading dataset
    X_2D_arome_interp_2016_df = pd.read_csv("../../Hackathon/Train/X_2D_arome_interp_2016.csv")

    # transform whole dataset
    # X_2D_arome_interp_2016_df = transform_model(X_2D_arome_interp_2016_df)
    # X_2D_arome_interp_2016_df.head()
    return X_2D_arome_interp_2016_df, X_station_train_df

df_modele, df_station = load()

## Check model values

In [5]:
df_modele = transform_model(df_modele)
#df_model

### Remove NaNs
Can be changed, just removed those which are really necessary

In [6]:
def remove_nans(df, col):
    Nb_nans = len(df[df[col].isnull()].index)
    total = len(df.index)
    print(Nb_nans)
    print(Nb_nans/total, "%")
    df = df[df[col].notnull()]
    return df
    
df_modele = remove_nans(df_modele, 'tp')

117650
0.04284023668639053 %


In [7]:
def df_sta_date(df, station,date):
    df1 = df[df['number_sta']==station]
    df2 = df1[df1['time']==date]
    return df2

In [8]:
def offset_1hour(df):
    df['dtime'] = pd.DatetimeIndex(df['dtime']) - pd.Timedelta('1h')
    return df

df_modele_off = offset_1hour(df_modele)

def example_offset():
    print(df_sta_date(df_modele,86137003, '2016-12-04').head())
    print(df_sta_date(df_modele_off,86137003, '2016-12-04').head())

## Check measured values

In [9]:
df_station = transform_station(df_station)

In [10]:
def offset_1day(df):
    df['nextday'] = pd.DatetimeIndex(df['dtime']) + pd.DateOffset(1)
    return df

df_station = offset_1day(df_station)
df_station = remove_nans(df_station, 'precip')

310298
0.07037075170417152 %


## Merge

In [13]:
df_merge = pd.merge(df_modele, df_station, how='inner', left_on=['number_sta', 'dtime'], right_on=['number_sta', 'nextday'])

In [21]:
df_dataset = df_merge[['number_sta','time','dtime_y','nextday','tp','precip','Id']]
df_dataset = df_dataset.rename({'number_sta': 'station', 'precip': 'measure D-1', 'tp' : 'prediction_arome', 'time': 'D day', 'dtime_y': 'D-1 (measure)'}, axis='columns')

def df_station_date(station, date):
    df1 = df_dataset[df_dataset['station']==station]
    df2 = df1[df1['D day']==date]
    return df2

## Building dataset

In [32]:
df_dataset = df_dataset[['prediction_arome', 'measure D-1', 'Id', 'station', 'D day']]

In [33]:
df_melted = pd.melt(df_dataset, id_vars=['Id', 'station', 'D day'])

In [35]:
def hour(row):
    my_str = row['Id'].split('_')[-1]
    return my_str
    
def day(row):
    my_str = row['Id'].split('_')[-2]
    return my_str

df_melted['hour'] = df_melted.apply(lambda row: hour(row), axis = 1)
df_melted['day'] = df_melted.apply(lambda row: day(row), axis = 1)

df_melted['var'] = np.where(df_melted['variable']=='prediction_arome', "P_" + df_melted['hour'], "M_" + df_melted['hour'])

In [39]:
df_for_mapping = df_melted[['D day', 'day']]
df_melted_minimal = df_melted[['station','day','value','var']]
df_melted['day']=pd.to_numeric(df_melted['day'])

In [42]:
def example_pivot():
    df_station_ex = df_melted_minimal[df_melted_minimal['station']==86096001]
    df_station_ex = pd.pivot_table(df_station_ex, columns='var', values='value', index='day')
    df_station_ex = df_station_ex.reset_index()
    return df_station_ex

#### Dataset in the form of dataframes in a dictionnary

In [44]:
def build_pandas_dataset(df_m):
    Dataset = {}
    stations = list(set(df_m['station'].values))

    for station in stations:
        df_station_loop = df_m[df_m['station']==station]
        df_station_loop = pd.pivot_table(df_station_loop, columns='var', values='value', index='day')
        if len(df_station_loop.columns)==48:
            Dataset[str(station)] = df_station_loop
            
    return Dataset

d = build_pandas_dataset(df_melted_minimal)

In [46]:
Dataset_tensor_M = {}
Dataset_tensor_P = {}
Dataset_tensor = {}

Dataset = d

for k in Dataset.keys():
    df = Dataset[k]
    df = df.reset_index()
    Dataset_tensor_M[k] = torch.tensor(df.loc[:,['day', 'M_0', 'M_1', 'M_2', 'M_3', 'M_4', 'M_5', 'M_6', 'M_7', 'M_8', 'M_9', 
        'M_10', 'M_11', 'M_12', 'M_13', 'M_14', 'M_15', 'M_16',  'M_17', 'M_18', 'M_19', 'M_20', 'M_21', 'M_22', 'M_23']].values)

    Dataset_tensor_P[k] = torch.tensor(df.loc[:,['day', 'P_0', 'P_1', 'P_2', 'P_3', 'P_4', 'P_5', 'P_6', 'P_7', 'P_8', 'P_9',
        'P_10', 'P_11', 'P_12', 'P_13', 'P_14', 'P_15', 'P_16', 'P_17', 'P_18', 'P_19', 'P_20', 'P_21', 'P_22', 'P_23']].values)
    
    Dataset_tensor[k] = torch.tensor(df.loc[:,['day', 'M_0', 'M_1', 'M_2', 'M_3', 'M_4', 'M_5', 'M_6', 'M_7', 'M_8', 'M_9', 
        'M_10', 'M_11', 'M_12', 'M_13', 'M_14', 'M_15', 'M_16',  'M_17', 'M_18', 'M_19', 'M_20', 'M_21', 'M_22', 'M_23',
        'P_0', 'P_1', 'P_2', 'P_3', 'P_4', 'P_5', 'P_6', 'P_7', 'P_8', 'P_9', 'P_10', 'P_11', 'P_12', 'P_13', 'P_14', 
        'P_15', 'P_16', 'P_17', 'P_18', 'P_19', 'P_20', 'P_21', 'P_22', 'P_23']].values)

In [50]:
def example_tensor():
    L = len(Dataset_tensor.keys())
    print(L)
    station_index = np.random.randint(0,L)
    station = list(Dataset_tensor.keys())[station_index]
    print(station)

    print("\n")
    print(Dataset_tensor[station].shape)
    Nb_days = Dataset_tensor[station].shape[0]
    day = np.random.randint(0,Nb_days)
    print(day)
    print(Dataset_tensor[station][day])

## Y_train

In [52]:
df_y_train = pd.read_csv("../../Hackathon/Train/Y_train.csv")
df_y_train[df_y_train['number_sta']==14216001]

Unnamed: 0,date,number_sta,Ground_truth,Id
3,2016-01-02,14216001,4.0,14216001_0
252,2016-01-03,14216001,11.3,14216001_1
502,2016-01-04,14216001,0.0,14216001_2
752,2016-01-05,14216001,0.0,14216001_3
1000,2016-01-06,14216001,2.6,14216001_4
...,...,...,...,...
182501,2017-12-27,14216001,6.6,14216001_725
182751,2017-12-28,14216001,0.0,14216001_726
183001,2017-12-29,14216001,11.2,14216001_727
183251,2017-12-30,14216001,9.8,14216001_728


## Mapping

In [54]:
df_map = df_for_mapping.groupby(by='day').min()
df_map = df_map.reset_index()

dates_mapping = {}
df_map['day'] = pd.to_numeric(df_map['day'])
for dday in df_map['D day'].values:
    dates_mapping[dday] = df_map[df_map['D day']==dday]['day'].values[0]
    
dates_mapping = dict(sorted(dates_mapping.items(), key=lambda x:x[1]))

dates_inv_map = {}
for k in dates_mapping.keys():
    dates_inv_map[dates_mapping[k]] = k

In [55]:
def day_to_dday(day):
    return dates_inv_map[day]

def dday_to_day(dday):
    return dates_mapping[dday]

### Example

In [60]:
station = list(Dataset_tensor.keys())[232]
Dataset_tensor[station].shape



torch.Size([337, 49])

# Training a model

In [387]:
station = list(Dataset_tensor.keys())[232]
print(station)
day = 89

print(Dataset_tensor[station][day])

17218001
tensor([96.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.2000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.8308,  0.8314,  0.9003,  0.9142,  1.0873,
         1.2925,  2.1580,  3.4265,  3.9668,  4.0666,  4.2506,  4.2509,  4.7033,
         5.2938,  6.2825,  6.6081,  6.6081,  6.6081,  6.6081,  6.6081,  6.6081,
         6.6081], dtype=torch.float64)


In [374]:
X = Dataset_tensor[station][day]
X

tensor([96.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.2000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.8308,  0.8314,  0.9003,  0.9142,  1.0873,
         1.2925,  2.1580,  3.4265,  3.9668,  4.0666,  4.2506,  4.2509,  4.7033,
         5.2938,  6.2825,  6.6081,  6.6081,  6.6081,  6.6081,  6.6081,  6.6081,
         6.6081], dtype=torch.float64)

In [380]:
def prediction(X):
    return X.mean().item()

In [381]:
prediction(X)

3.8104473785892785

In [386]:
int(X[0].item())

96

In [400]:
def true_y(station):
    df = df_y_train[df_y_train['number_sta']==station]
    return df[['date','Ground_truth']]

df_true_all_dates = true_y(17218001)

def true_y_value(station,day):
    df_station = true_y(station)
    df = df_station[df_station['date']==day_to_dday(day)]
    return df['Ground_truth'].values[0]

true_y_value(17218001, int(X[0].item()))

2.4000000000000004

In [401]:
def error(y_pred, y_true):
    return (y_pred - y_true)**2

error(prediction(X), true_y_value(17218001, int(X[0].item())))

1.9893618077693664

# Demonstration of dataframes

In [25]:
df_modele.head()

Unnamed: 0.1,Unnamed: 0,number_sta,time,valid_time,ws,p3031,u10,v10,t2m,d2m,r,tp,msl,dtime
325,325,86118001,2016-06-19,2016-06-19 01:00:00,2.795233,308.449229,2.188373,-1.737481,286.753213,284.331579,85.295749,0.0,102590.139065,2016-06-19
326,326,86149001,2016-06-19,2016-06-19 01:00:00,2.116415,321.499597,1.307743,-1.657971,284.070408,281.995456,87.037103,0.0,102660.182026,2016-06-19
327,327,56081003,2016-06-19,2016-06-19 01:00:00,1.10539,251.550895,1.044015,0.359752,282.503075,281.550988,93.771181,0.0,102757.47857,2016-06-19
328,328,53215001,2016-06-19,2016-06-19 01:00:00,2.886293,311.049406,2.177551,-1.891574,283.321058,282.15349,92.463499,0.0,102680.993824,2016-06-19
329,329,22135001,2016-06-19,2016-06-19 01:00:00,1.6329,195.615184,0.437298,1.564739,282.26994,281.545011,95.217601,0.0,102709.885414,2016-06-19


In [26]:
df_station.head()

Unnamed: 0,number_sta,date,ff,t,td,hu,dd,precip,Id,dtime,nextday
0,14066001,2016-01-01 00:00:00,3.05,279.28,277.97,91.4,200.0,0.0,14066001_0_0,2016-01-01 00:00:00,2016-01-02 00:00:00
1,14066001,2016-01-01 01:00:00,2.57,278.76,277.45,91.4,190.0,0.0,14066001_0_1,2016-01-01 01:00:00,2016-01-02 01:00:00
2,14066001,2016-01-01 02:00:00,2.26,278.27,277.02,91.7,181.0,0.0,14066001_0_2,2016-01-01 02:00:00,2016-01-02 02:00:00
3,14066001,2016-01-01 03:00:00,2.62,277.98,276.95,93.0,159.0,0.0,14066001_0_3,2016-01-01 03:00:00,2016-01-02 03:00:00
4,14066001,2016-01-01 04:00:00,2.99,277.32,276.72,95.9,171.0,0.0,14066001_0_4,2016-01-01 04:00:00,2016-01-02 04:00:00


In [31]:
df_merge.head()

Unnamed: 0.1,Unnamed: 0,number_sta,time,valid_time,ws,p3031,u10,v10,t2m,d2m,...,date,ff,t,td,hu,dd,precip,Id,dtime_y,nextday
0,328,53215001,2016-06-19,2016-06-19 01:00:00,2.886293,311.049406,2.177551,-1.891574,283.321058,282.15349,...,2016-06-18 00:00:00,,285.4,283.55,88.5,,0.0,53215001_169_0,2016-06-18,2016-06-19
1,329,22135001,2016-06-19,2016-06-19 01:00:00,1.6329,195.615184,0.437298,1.564739,282.26994,281.545011,...,2016-06-18 00:00:00,1.66,285.64,284.1,90.3,296.0,0.0,22135001_169_0,2016-06-18,2016-06-19
2,330,56178003,2016-06-19,2016-06-19 01:00:00,1.148291,210.906516,0.478861,0.874092,283.408999,281.872091,...,2016-06-18 00:00:00,1.04,286.07,283.91,86.7,294.0,0.0,56178003_169_0,2016-06-18,2016-06-19
3,331,86092002,2016-06-19,2016-06-19 01:00:00,1.041323,264.260951,1.031377,0.105632,284.378823,282.738859,...,2016-06-18 00:00:00,,285.75,,,,0.0,86092002_169_0,2016-06-18,2016-06-19
4,332,86096001,2016-06-19,2016-06-19 01:00:00,1.827698,315.249067,1.290623,-1.29366,284.334968,282.398812,...,2016-06-18 00:00:00,,283.1,,,,0.0,86096001_169_0,2016-06-18,2016-06-19


In [29]:
df_dataset.head()

Unnamed: 0,prediction_arome,measure D-1,Id,station,D day
0,0.0,0.0,53215001_169_0,53215001,2016-06-19
1,0.0,0.0,22135001_169_0,22135001,2016-06-19
2,0.0,0.0,56178003_169_0,56178003,2016-06-19
3,0.0,0.0,86092002_169_0,86092002,2016-06-19
4,0.0,0.0,86096001_169_0,86096001,2016-06-19


In [34]:
df_melted.head()

Unnamed: 0,Id,station,D day,variable,value
0,53215001_169_0,53215001,2016-06-19,prediction_arome,0.0
1,22135001_169_0,22135001,2016-06-19,prediction_arome,0.0
2,56178003_169_0,56178003,2016-06-19,prediction_arome,0.0
3,86092002_169_0,86092002,2016-06-19,prediction_arome,0.0
4,86096001_169_0,86096001,2016-06-19,prediction_arome,0.0


In [37]:
df_melted_minimal.head()

Unnamed: 0,station,day,value,var
0,53215001,169,0.0,P_0
1,22135001,169,0.0,P_0
2,56178003,169,0.0,P_0
3,86092002,169,0.0,P_0
4,86096001,169,0.0,P_0


In [43]:
df = example_pivot()
df

var,day,M_0,M_1,M_10,M_11,M_12,M_13,M_14,M_15,M_16,...,P_21,P_22,P_23,P_3,P_4,P_5,P_6,P_7,P_8,P_9
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.550355,4.552382,4.659995,0.000000,0.000000,0.000000,0.000000,0.235398,1.639058,1.847065
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.554000,8.645421,8.725951,0.009309,0.009309,0.009309,0.009309,0.009177,0.165012,0.195485
2,2,0.0,0.0,1.0,0.5,1.0,0.5,0.5,1.0,0.5,...,4.799740,5.830107,6.771815,0.603107,0.604687,2.884130,2.940111,2.970224,4.332536,4.357071
3,3,0.0,0.0,0.0,0.0,0.5,0.5,1.0,0.5,0.0,...,6.715648,6.715648,6.762902,0.326361,0.691108,1.909568,2.053884,2.252195,2.252654,2.252654
4,4,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,3.043679,4.728147,7.285497,1.160601,1.160601,1.164734,1.164414,1.164414,1.164414,1.172097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,360,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000846,0.000846,0.000846,0.000000,0.000000,0.000011,0.000016,0.000016,0.000743,0.000784
331,361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
332,362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
333,363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.144624,0.164128,0.187633,0.020024,0.027829,0.035630,0.039565,0.044200,0.052932,0.061528


## Year, Month, Day, Hour

In [None]:
def add_YMDH(df):
    
    def year(row):
        _dtime = row['dtime']
        return _dtime.year

    def month(row):
        _dtime = row['dtime']
        return _dtime.month
    
    def day(row):
        _dtime = row['dtime']
        return _dtime.day

    def hour(row):
        _dtime = row['dtime']
        return _dtime.hour

    df['year'] = df.apply(lambda row: year(row), axis = 1)
    df['month'] = df.apply(lambda row: month(row), axis = 1)
    df['day'] = df.apply(lambda row: day(row), axis = 1)
    df['hour'] = df.apply(lambda row: hour(row), axis = 1)
    
    return df

In [None]:
def precip_station(month,day):
    station_ex = station_df[station_df['year']==2016][station_df['month']==month][station_df['day']==day]
    return station_ex['precip'].values

In [None]:
def precip_modele(month,day):
    station_ex = X_modele_station[X_modele_station['year']==2016][X_modele_station['month']==month][X_modele_station['day']==day]
    return station_ex['tp'].values

In [None]:
 def total_precip(df, month, day):
        station_ex = df[df['year']==2016][df['month']==month][df['day']==day]
        return np.sum(station_ex['precip'].values)