# load dataset

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import json
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
import os
from scripts.preprocess import scale, ts_preprocess, balance

In [6]:
# List the data files to load. os.listdir returns entries in arbitrary order,
# so the names are sorted to keep the concatenated row order reproducible.
# Sub-directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store) are
# skipped because pd.read_csv cannot parse them.
data_dirs = sorted(
    name
    for name in os.listdir("data")
    if not name.startswith(".") and os.path.isfile(os.path.join("data", name))
)
# One DataFrame per file, in the same order as data_dirs.
dfs = [pd.read_csv(os.path.join("data", name)) for name in data_dirs]

In [7]:
# Stack the per-file frames row-wise; ignore_index=True discards the per-file
# indices and renumbers the rows of the combined frame.
dataframe = pd.concat(objs=dfs, axis=0, ignore_index=True)
# Preview the first five rows.
dataframe.head(n=5)

Unnamed: 0,datetime,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,clouds_all,rain,thunder,thunder_count
0,2022-10-09 18:00:00,54,4.99719,-72.7074,24.95,21.46,25.62,24.24,25.24,1010,81,1.57,90,0.0,99,0,0,0
1,2022-10-09 19:00:00,54,4.99719,-72.7074,24.61,22.11,25.37,23.83,25.14,1011,86,1.28,75,0.0,98,0,1,1
2,2022-10-09 20:00:00,54,4.99719,-72.7074,24.74,22.43,25.54,24.01,25.28,1013,87,1.18,64,0.0,99,0,0,0
3,2022-10-09 21:00:00,54,4.99719,-72.7074,23.76,22.2,24.57,22.96,24.29,1014,91,1.21,64,0.0,100,0,1,1
4,2022-10-09 22:00:00,54,4.99719,-72.7074,23.18,21.63,23.93,22.39,23.56,1014,91,0.73,23,0.0,98,0,1,2


In [None]:
# Convert the datetime strings to datetime values so the sort below is chronological.
dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])

# Drop redundant features.
# "Unnamed: 0" appears to be the row index that was saved into the CSV files;
# it is not a measurement, and leaving it in would hand it to the models as a
# feature. It is dropped with errors='ignore' so the cell still runs on files
# that were saved without that index column.
dataframe_fs = (
    dataframe
    .drop(columns=['thunder_count', 'temp_min', 'temp_max', 'feels_like'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Feature scaling is currently disabled; kept for reference.
# #select features to scale
# scale_cols = list(dataframe_fs.columns)
# scale_cols = [x for x in scale_cols if x not in ['thunder', 'city_name', 'datetime']]

# #scale feature with standard scaler
# dataframe_fs[scale_cols] = scale(dataframe_fs[scale_cols], 'standard')

# Sort by city and then by time, and group by city so each group is one
# city's rows in chronological order.
df_sorted = dataframe_fs.sort_values(by=["city_name", "datetime"])
groups = df_sorted.groupby('city_name')

In [None]:
# Build two balanced datasets per city and stack them across cities:
#   X_sc / y_sc: samples without a time dimension (one row per sample)
#   X_ts / y_ts: windowed samples, intended shape (N, seq_length, N_columns)
X_ts = []
y_ts = []
X_sc = []
y_sc = []

# Window length (number of time steps) per city_name value.
# The dict literal previously listed key 54 twice; Python keeps only the last
# duplicate key, so a single entry is equivalent.
n_steps = {
    54: 30,
}
# Window length used for cities that have no explicit entry in n_steps.
default_n_steps = 30

for key, group in groups:

    # Feature matrix and target vector for this city.
    X = group.drop(['thunder', 'datetime', 'city_name'], axis=1)
    y = group['thunder']

    X = X.to_numpy()
    y = y.to_numpy()

    # without timesteps
    X_, y_ = balance(X, y)
    X_sc.append(X_)
    y_sc.append(y_)

    # with timesteps: pass this city's window length rather than the whole
    # n_steps dict (assumes ts_preprocess expects an integer number of steps
    # -- TODO confirm against scripts.preprocess).
    X, y = ts_preprocess(X, y, n_steps.get(key, default_n_steps))
    X, y = balance(X, y)
    X_ts.append(X)
    y_ts.append(y)

# Stack the per-city arrays along the sample axis.
X_ts = np.concatenate(X_ts, axis=0)
y_ts = np.concatenate(y_ts, axis=0)
X_sc = np.concatenate(X_sc, axis=0)
y_sc = np.concatenate(y_sc, axis=0)