In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import math
import rpy2.robjects as ro
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

In [3]:
# Training table read from the processed-data directory (path is relative to the notebook).
train_csv_path = '../data/processed/train_desc_aod_and_meteo_vars_11.03.csv'
df = pd.read_csv(train_csv_path)

In [4]:
# Show column dtypes and non-null counts to see which features need imputing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34312 entries, 0 to 34311
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   datetime               34312 non-null  object 
 1   grid_id                34312 non-null  object 
 2   value                  34312 non-null  float64
 3   date                   34312 non-null  object 
 4   Column_WV              22203 non-null  float64
 5   Optical_Depth_055_avg  11552 non-null  float64
 6   Optical_Depth_055_min  11552 non-null  float64
 7   Optical_Depth_055_max  11552 non-null  float64
 8   Optical_Depth_055_var  11272 non-null  float64
 9   Optical_Depth_055_std  11272 non-null  float64
 10  Optical_Depth_055_p95  11552 non-null  float64
 11  latitude               34312 non-null  float64
 12  longitude              34312 non-null  float64
 13  surface                34312 non-null  float64
 14  gust                   34312 non-null  float64
 15  sp

In [5]:
# Columns that prep() removes from the frame before modelling.
cols_to_drop = ['surface', 'level', 'heightAboveGround']

In [6]:
def prep(df, sort=False):
    """Return a feature-engineered copy of the training frame.

    Adds an integer encoding of ``grid_id`` and ``year`` / ``month`` / ``day``
    columns derived from ``date``, then removes the columns named in the
    module-level ``cols_to_drop`` list.

    The input frame is not modified. Working on a copy (and ignoring columns
    that are already absent) makes ``df = prep(df)`` safe to re-run in a
    notebook; the previous in-place version raised ``KeyError`` on the second
    run because the columns had already been dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``grid_id`` and ``date`` columns.
    sort : bool, default False
        When true, the returned frame is ordered by ``date``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added features and without ``cols_to_drop``.
    """
    out = df.copy()

    out['grid_id_encoded'] = LabelEncoder().fit_transform(out['grid_id'])

    # Parse the date column once and reuse it for every calendar component.
    dates = pd.to_datetime(out['date'])
    out['year'] = dates.dt.year
    out['month'] = dates.dt.month
    out['day'] = dates.dt.day

    # errors='ignore' keeps the call idempotent when fed its own output.
    out = out.drop(columns=cols_to_drop, errors='ignore')

    if sort:
        out = out.sort_values(by='date')

    return out

In [7]:
def fillna(df, method='locf_nocb'):
    """Impute missing values in ``df`` along the row axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Rows are expected to be in the order along which
        values should be carried or interpolated (the caller sorts by date).
    method : str, default 'locf_nocb'
        * ``'locf_nocb'`` - forward fill, then backward fill what is left.
        * ``'interp_linear'`` - linear interpolation.
        * ``'interp_spline_2'`` / ``'interp_spline_3'`` - spline interpolation
          of order 2 / 3.
        Any other value returns ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The imputed frame; the input is never modified.
    """
    if method == 'locf_nocb':
        # DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the
        # direct replacements and behave the same.
        return df.ffill().bfill()

    interpolation_kwargs = {
        'interp_linear': {'method': 'linear'},
        'interp_spline_2': {'method': 'spline', 'order': 2},
        'interp_spline_3': {'method': 'spline', 'order': 3},
    }
    if method not in interpolation_kwargs:
        return df

    filled = df.copy()
    # Interpolation is only defined for numeric data. Restricting it to the
    # numeric columns leaves string columns (ids, date strings) untouched and
    # avoids the object-dtype interpolation that newer pandas rejects.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) == 0:
        return filled

    filled[numeric_cols] = df[numeric_cols].interpolate(
        axis=0, **interpolation_kwargs[method]
    )
    return filled

In [8]:
def fillna_by_grid(df, method):
    """Impute each grid cell's rows separately, then recombine them.

    For every distinct ``grid_id`` the matching rows are ordered by ``date``
    and, when ``method`` is one of the supported names, passed to ``fillna``.
    With any other ``method`` the rows are only re-ordered.

    Returns a new frame holding all rows, sorted by ``date`` ascending.
    """
    supported = ('locf_nocb', 'interp_linear', 'interp_spline_2', 'interp_spline_3')
    should_impute = method in supported

    pieces = []
    for grid in df.grid_id.unique():
        # Boolean indexing already yields a new frame, and sort_values returns
        # another, so the caller's frame is never touched.
        piece = df[df['grid_id'] == grid].sort_values(by='date', ascending=True)
        if should_impute:
            piece = fillna(piece, method=method)
        pieces.append(piece)

    combined = pd.concat(pieces)
    return combined.sort_values(by='date', ascending=True)

In [9]:
# Add the encoded grid id and year/month/day features, and drop cols_to_drop.
# The result is bound back to the same name, so `df` is the prepared frame from here on.
df = prep(df)

In [14]:
# Impute per grid cell, then compare the full shape with the shape after
# discarding rows that still contain gaps.
imputing_method = 'interp_linear'
df_filled = fillna_by_grid(df, method=imputing_method)
(df_filled.shape, df_filled.dropna().shape)

((34312, 30), (34183, 30))

In [15]:
# Discard rows that still have missing values after imputation.
# Rebinding instead of inplace=True avoids mutating an object across cells.
df_filled = df_filled.dropna()

In [12]:
def get_train_test_split(df, split_ratio = 0.8):
    """Split ``df`` positionally into train and test features/targets.

    The first ``split_ratio`` share of rows becomes the training set and the
    remainder the test set. Rows are not reordered here, so ``df`` must
    already be in chronological order for the test set to hold only later
    observations.

    Returns ``(train_x, test_x, train_y, test_y)`` where the target is the
    ``value`` column and the features are every column except ``value``,
    ``date``, ``grid_id``, ``datetime`` and ``location``.
    """
    target = df['value']
    non_feature_cols = ['value', 'date', 'grid_id', 'datetime', 'location']
    features = df.drop(columns=non_feature_cols)

    # Positional cut: everything before `cutoff` trains, the rest tests.
    cutoff = int(df.shape[0]*split_ratio)
    head, tail = slice(None, cutoff), slice(cutoff, None)

    return features.iloc[head], features.iloc[tail], target.iloc[head], target.iloc[tail]

In [16]:
# Full feature matrix and target used by the cross-validation cells below.
non_feature_cols = ['value', 'date', 'grid_id', 'datetime', 'location']
x = df_filled.drop(columns=non_feature_cols)
y = df_filled['value']

In [17]:
# Hold-out split: the earliest rows train, the latest rows test.
train_x, test_x, train_y, test_y = get_train_test_split(df_filled)
tuple(part.shape for part in (train_x, train_y, test_x, test_y))

((27346, 25), (27346,), (6837, 25), (6837,))

In [18]:
from sklearn.model_selection import TimeSeriesSplit

In [19]:
# Four time-ordered folds, each tested on a block the size of 20% of the rows.
fold_test_size = int(x.shape[0]*0.2)
tscv = TimeSeriesSplit(n_splits=4, test_size=fold_test_size)

In [20]:
# Baseline: fit a very small random forest on every time-series fold and
# record its RMSE and R^2 on that fold's test block.
timecv = {'r2':[], 'rmse': []}
x_ind, y_ind = [], []  # per-fold train positions / test positions
for train_index, test_index in tscv.split(x):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    x_ind.append(train_index)
    y_ind.append(test_index)

    train_x, train_y = x.iloc[train_index], y.iloc[train_index]
    test_x, test_y = x.iloc[test_index], y.iloc[test_index]

    rf = RandomForestRegressor(random_state=17, n_estimators=3)
    rf.fit(train_x, train_y)
    preds = rf.predict(test_x)

    r2 = r2_score(test_y, preds)
    rmse = math.sqrt(mean_squared_error(test_y, preds))
    timecv['r2'].append(r2)
    timecv['rmse'].append(rmse)

TRAIN: (6839,) TEST: (6836,)
TRAIN: (13675,) TEST: (6836,)
TRAIN: (20511,) TEST: (6836,)
TRAIN: (27347,) TEST: (6836,)


In [21]:
def objective(trial):
    """Optuna objective: mean cross-validated MSE of a random forest.

    Samples ``max_depth``, ``max_features``, ``n_estimators`` and
    ``min_samples_leaf`` from ``trial``, fits a ``RandomForestRegressor`` on
    each of four time-series folds of the module-level ``x`` / ``y`` and
    returns the mean test MSE across folds (lower is better).

    Per-fold R^2 and MSE values are printed for inspection.
    """
    max_depth = trial.suggest_int('max_depth', 4, 50)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 60)

    # 'auto' is no longer accepted by current scikit-learn. For regressors it
    # meant "use all features", which is 1.0. The search-space label is kept
    # so recorded trials stay comparable.
    if max_features == 'auto':
        max_features = 1.0

    tscv = TimeSeriesSplit(n_splits=4, test_size=int(x.shape[0]*0.2))
    timecv = {'r2': [], 'mse': []}
    for train_index, test_index in tscv.split(x):
        train_x, test_x = x.iloc[train_index], x.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        model = RandomForestRegressor(
            random_state=17,
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            min_samples_leaf=min_samples_leaf,
        )
        model.fit(train_x, train_y)

        preds = model.predict(test_x)
        timecv['mse'].append(mean_squared_error(test_y, preds))
        timecv['r2'].append(r2_score(test_y, preds))

    print(timecv['r2'])
    print(timecv['mse'])

    return np.mean(timecv['mse'])

In [22]:
# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the study result, is repeatable across kernel restarts.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=17),
)
study.optimize(objective, n_trials=30)

[32m[I 2022-04-05 22:37:00,232][0m A new study created in memory with name: no-name-c43d8a7e-23f2-46fd-8685-c9a2077d86a7[0m


KeyboardInterrupt: 

In [None]:
# Display the study's trials as a DataFrame.
study.trials_dataframe()

In [71]:
# Persist the trial table so the search can be inspected without re-running it.
studies_dir = 'optim_studies'
os.makedirs(studies_dir, exist_ok=True)
trials_table = study.trials_dataframe()
trials_table.to_csv('optim_studies/RF_optuna.csv', index=False)