In [460]:
import importlib
import os

import numpy as np
import pandas as pd

import utils_data_preprocessing
import lstm_classifier
import post_inference
# Reload the project modules so that edits made to their source files are
# picked up without restarting the kernel.
importlib.reload(utils_data_preprocessing)
importlib.reload(lstm_classifier)
importlib.reload(post_inference)

# Re-import the classes AFTER the reload so these names are bound to the
# freshly reloaded definitions.
from utils_data_preprocessing import Utils_data_preprocessing
from lstm_classifier import Lstm_classifier
from post_inference import Post_inference

In [461]:
# Reset TensorFlow/Keras: each time you run, that run's model is kept in the
# backend, which generates conflicts with the code on subsequent runs.
from tensorflow.keras.backend import clear_session
clear_session()  # Resets the backend state

### Data preprocessing

In [462]:
# Disaster types kept when cleaning EM-DAT (options noted by the author: "Storm", "Flood").
# The first entry is also used in the names of the saved prediction/model files.
SELECTED_DISASTER_TYPES = ["Flood"]

In [463]:
emdat = pd.read_csv("data/emdat_ready.csv")

# Filter the raw EM-DAT table down to the disasters relevant for this run.
clean_emdat = Utils_data_preprocessing().clean_emdat(
    emdat_df=emdat,
    # Disasters that started before this year are discarded (minimum: 2010).
    minimum_start_year=2010,
    # Disasters whose type is not in this list are removed.
    accepted_disasters_types=SELECTED_DISASTER_TYPES,
)

In [464]:
# Load the weather data and harmonise column names / text casing in one chain.
noaa = (
    pd.read_csv("data/meteostat_noaa.csv")
    .rename(columns={
        'State': 'state',
        'Season': 'season',
        'tavg': 'TAVG',
        'pres': 'PRES',
    })
    .assign(
        state=lambda frame: frame['state'].str.capitalize(),  # texas -> Texas
        season=lambda frame: frame['season'].str.lower(),     # Winter -> winter
    )
)

# Replace NaNs with the column median, for numeric columns only.
numeric_column_names = noaa.select_dtypes(include='number').columns
numeric_part = noaa[numeric_column_names]
noaa[numeric_column_names] = numeric_part.fillna(numeric_part.median())

# With 0, only the first day of each disaster is labelled and no more.
# RECOMMENDED TO NOT CHANGE THIS.
N_AFTER_DISASTER_DAYS_TO_LABEL = 0

# Author's example for a non-zero value: with n_after_disaster_days_to_label=2,
# a disaster lasting more than 2 days only has its first 2 days counted as disaster.
noaa_counted = Utils_data_preprocessing().count_diasters_by_day(
    clean_emdat_df=clean_emdat,
    noaa_df=noaa,
    n_after_disaster_days_to_label=N_AFTER_DISASTER_DAYS_TO_LABEL,
)

### Feature engineering
<br>Choose whether you want data for an entire state or for an individual station.

In [465]:
# Prediction horizon in days. Example: if =7 we predict whether a disaster will
# occur within the next 7 days; if a disaster occurred on the 8th of April, the
# 1st to the 7th of April are also labelled as disaster (1).
N_NEXT_DAYS_UNTIL_DISASTER = 20

# Window lengths (in days) of the moving averages, e.g. 10-day MA, 20-day MA.
# Each one is a new computed variable.
LENGTHS_DAYS_MA = [10, 20, 40]

# How many days we look back: if =7, 7 new variables holding the values of the
# last 7 days are created for EACH variable.
MAX_LAG_PERIOD = 5

# The remaining states are removed.
# States: 'Arkansas', 'Kansas', 'Texas', 'Oklahoma', 'Louisiana', 'Mississippi'
SELECTED_STATE = 'Arkansas'

In [466]:
# Build the model-ready table for the selected state from the labelled NOAA data,
# using the horizon, moving-average and lag settings chosen in the config cell.
model_ready_data = Utils_data_preprocessing().prepare_state_version_data_for_model_predict(
    data=noaa_counted,
    selected_state=SELECTED_STATE,
    n_next_days_until_disaster=N_NEXT_DAYS_UNTIL_DISASTER,
    lengths_days_ma=LENGTHS_DAYS_MA,
    max_lag_period=MAX_LAG_PERIOD,
)

Adjust the target like this: <br>
[0,   1,     1,    1,    1,    1,    1, 1, 1, 1, 1] adjust to <br>
[0, 0.14, 0.29, 0.43, 0.57, 0.71, 0.86, 1, 1, 1, 1]

In [467]:
# OPTIONAL step: set to False if you do not want to adjust the target.
WANT_TO_ADJUST_TARGET = False

if WANT_TO_ADJUST_TARGET:
    adjusted_target = Utils_data_preprocessing().adjust_days_previous_disaster(
        column_to_adjust=model_ready_data['target'],
        # IT HAS TO BE THE SAME value that was passed to the .prepare_ function.
        n_next_days_until_disaster=N_NEXT_DAYS_UNTIL_DISASTER,
    )
    model_ready_data['target'] = adjusted_target

In [468]:
# Columns whose name contains any of these markers are excluded from scaling.
cols_not_scale = [
    column_name
    for column_name in model_ready_data.columns
    if any(marker in column_name for marker in ('WT', 'season', 'target'))
]

scaled_data = Lstm_classifier().scale_data(
    data=model_ready_data,
    choosen_scaler='standard',  # options: 'standard', 'minmax', 'quantile'
    cols_not_scale=cols_not_scale,
)

In [469]:
# Passed as `sequence_length` to the LSTM train/test split; the last
# SEQUENCE_LENGTH+1 days are the ones predicted and saved further down.
SEQUENCE_LENGTH = 20

Generate predictions

In [470]:
# Compute the split ratio for which the test set is exactly the last
# SEQUENCE_LENGTH+1 days, so that the most recent days are the ones predicted.
# NOTE(review): the ratio is a float; confirm that the split helper rounds it
# back to exactly SEQUENCE_LENGTH+1 rows.
n_total_rows = model_ready_data.shape[0]
n_rows_to_predict = SEQUENCE_LENGTH + 1
split_ratio_to_predict_real = (n_total_rows - n_rows_to_predict) / n_total_rows

# later do the same for inference_noaa

In [471]:
# Split the scaled data into training sequences and the held-out final window.
X_train, X_val, y_train, y_val = Lstm_classifier().lstm_time_series_train_test_split(
    scaled_data=scaled_data,
    target_column=model_ready_data['target'],
    sequence_length=SEQUENCE_LENGTH,
    train_test_split_ratio=split_ratio_to_predict_real,
)

# Clean previous info (this is for plots).
from tensorflow.keras.backend import clear_session
clear_session()

# Hyperparameters of the LSTM, kept together so they are easy to tune.
lstm_hyperparameters = {
    # More units -> more powerful, but more time-consuming and more risk of overfitting.
    'units': 100,
    # More dropout -> more neurons switched off while training, less likely to overfit.
    'dropout': 0.2,
    # Penalty on large kernel weights in the loss: a larger penalty favours small
    # kernel weights and less overfitting. Set to 0.0 to disable the L2 regularizer.
    'l2_regularizer_weight': 0.001,
    # Lower rate -> slower learning, less likely to overfit.
    'learning_rate': 0.00001,
    # Weight of the 1's class (disaster class): 1.0 = same importance as 0's;
    # 5.0 = a wrong prediction on a 1 is penalised 5 times more than on a 0.
    # As a reference you can use: sum(y_train == 0)/sum(y_train == 1)
    'class_1_weight': 1.0,
    # Number of iterations in which the LSTM goes through the entire dataset.
    'epochs': 5,
    # Number of samples processed together in a single forward and backward pass.
    # Smaller datasets usually do better with small batches (e.g. 8, 16 or 32) to
    # avoid overfitting; larger batches (e.g. 64, 128, 256) can speed up training
    # on large datasets.
    'batch_size': 32,
    'verbose': 0,
    # If True, some monitoring plots are shown at the end of the training.
    'show_plots': False,
}

# Build and train the LSTM model.
model = Lstm_classifier().train_lstm(
    X_train=X_train,
    y_train=y_train,
    validation_data=(X_val, y_val),
    **lstm_hyperparameters,
)

# Predict on the held-out window and flatten the output to a 1-D array.
predictions_probs = model.predict(X_val).flatten()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step


Get the predicted dates

In [472]:
# Keep only the rows of the selected state.
is_selected_state = noaa_counted["state"] == SELECTED_STATE
noaa_counted_selected_state = noaa_counted[is_selected_state]

# Find the station whose latest record has the most recent date.
last_date_per_station = noaa_counted_selected_state.groupby("STATION")["DATE"].max()
station_with_last_date = last_date_per_station.idxmax()

# Take every date recorded for that station ...
is_reference_station = noaa_counted_selected_state["STATION"] == station_with_last_date
all_dates = noaa_counted_selected_state.loc[is_reference_station, "DATE"]

# ... and keep only the trailing ones, i.e. the dates that were predicted.
# NOTE(review): this relies on the rows already being in chronological order - confirm.
predicted_dates = all_dates.iloc[-(SEQUENCE_LENGTH + 1):]

Create csv and save predictions and model

In [473]:
# Single source of truth for the name shared by the prediction column, the CSV
# file and the saved model (previously the same f-string was repeated 3 times).
output_name = f"{SELECTED_STATE}_{SELECTED_DISASTER_TYPES[0]}_pred_prob_next_{SEQUENCE_LENGTH+1}_days"

# Fail early with an explicit message if dates and predictions do not line up,
# instead of relying on the less informative error raised by the DataFrame constructor.
if len(predicted_dates) != len(predictions_probs):
    raise ValueError(
        f"Got {len(predicted_dates)} predicted dates but {len(predictions_probs)} "
        "predictions; they must have the same length."
    )

# Make sure the output folders exist so that a fresh checkout can run end to end
# (writing the CSV into a missing directory raises an OSError).
os.makedirs("predictions", exist_ok=True)
os.makedirs("trained_models", exist_ok=True)

# One row per predicted date with its predicted probability.
pred_data = pd.DataFrame({"DATE": predicted_dates, output_name: predictions_probs})
pred_data.to_csv(f"predictions/{output_name}.csv")

# Persist the trained model next to the predictions, under the same name.
model.save(f"trained_models/{output_name}.keras")