In [1]:
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from helpers import *
import math
import csv
import os
pd.set_option('display.max_columns', 100)

**Settings**

*Parameters*
- ``output``: names of the output column(s) to predict simultaneously.
- ``mode``: whether to use the data season-wise (``'season'``) or year-wise (``'full'``).
- ``INPUT_FOLDER``: path from which the input data is read. Normally it does not need to be changed.
- ``OUTPUT_FOLDER``: path where the results are written. Normally it does not need to be changed.
- ``train_dim``, ``test_dim``, ``validate_dim``: fractions of the data used for the training, test and validation sets. Defaults are respectively $0.6$, $0.2$ and $0.2$ (note that the configuration cell below currently sets $0.01$, $0.1$ and $0.89$).

*Memory problem:* If a ``MemoryError`` arises (very unlikely with the current parameters and 32 GB of RAM), several changes can make the script less RAM-heavy. With ``mode = 'season'`` the random forest is fitted season by season, so the dataset on which each regression is performed is about $1/4$ of the full size. The matrices can be shrunk further by lowering the size of the training dataset (``train_dim``).

## Some configurations for running the code
Output variables, season-wise/year-wise mode, input/output directories.

In [2]:
# Target column(s) that the forest predicts jointly.
output = ['u_x', 'u_y']

# 'season' -> one pass per season, 'full' -> a single pass over the whole year;
# any other value is used verbatim as the only duration label.
mode = 'season'
duration = {
    'season': ["Spring", "Summer", "Autumn", "Winter"],
    'full': ["full_year"],
}.get(mode, [mode])

# Where the input data is read from and where results are written to.
INPUT_FOLDER = '../data/'
OUTPUT_FOLDER = '../results/random_forest/'

# Fractions handed to the train / evaluation / test split.
train_dim, test_dim, validate_dim = 0.01, 0.1, 0.89

## Some output file configurations. 
Open and close each file to make sure it is empty at the start of every run.

In [3]:
# Create the results directory if it is missing; otherwise every open() below
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Truncate the per-run result files so each run starts from empty files.
# magnitude_average_pred.txt is appended to by the main loop; the other three
# are presumably appended to by write_rf_prediction -- confirm in helpers.
for _result_name in ("bins_proposal.txt",
                     "mses_u_seasons.txt",
                     "rsquared_u_seasons.txt",
                     "magnitude_average_pred.txt"):
    with open(OUTPUT_FOLDER + "/" + _result_name, "w"):
        pass

# These two handles stay open on purpose: the main loop writes to them and the
# final cell of the notebook closes them.
filetxt = open(OUTPUT_FOLDER + ("Order_feature_importance_for_%s.txt" % mode), "w")
# newline='' is required by the csv module so the writer controls line endings
# itself (avoids blank lines between rows on Windows).
filecsv = open(OUTPUT_FOLDER + ("feature_for_ridge_%s.txt" % mode), "w", newline='')
csvwriter = csv.writer(filecsv, delimiter=',')

## Main part of the random forest regression + feature importance
One large `for` loop that iterates over the four seasons.

In [4]:
# One row of feature importances per processed season. The rows are collected
# in a plain list because DataFrame.append was removed in pandas 2.0.
fi_rows = []
for i, season in enumerate(duration):

    # Load the data in pandas. The frame is re-read on every pass because it is
    # deleted below as soon as the arrays have been extracted from it.
    tot_df = pd.read_csv(INPUT_FOLDER + 'regression_mat_year.csv', index_col=0)

    if mode == 'season':
        # Split by season and keep only the part belonging to this pass
        tot_df = season_splitter(tot_df)
        tot_df = tot_df[i]

    ##----------- Preprocessing --------------##
    # Vectorize speed
    tot_df = vectorize_wind_speed(tot_df)
    # Prepare the input and output: every column except the wind components
    # is a feature; the columns listed in `output` are the targets.
    features_df = tot_df.drop(columns=['u_x', 'u_y', 'u_z'])
    x = np.array(features_df)
    y_continue = np.array(tot_df[output])
    feat_labels = features_df.columns
    del tot_df, features_df
    # Discretize the output so that a classifier can be trained on it
    bins = bins_proposal(y_continue, 0.1)
    discretizer = KBinsDiscretizer(bins, encode='ordinal', strategy='uniform')
    discretizer.fit(y_continue)
    y_disc = discretizer.transform(y_continue)
    # Split train / evaluation / test (the evaluation part is not used below)
    x_tr, y_tr, x_ev, y_ev, x_te, y_te = split_train_evaluation_test(
        x, y_disc, train_dim, validate_dim, test_dim)
    del y_disc, x

    ##-------- Random forest training ------------##
    rf = RandomForestClassifier(n_estimators=1, max_depth=1000, criterion='gini', random_state=0)
    rf.fit(x_tr, y_tr)
    print("Random forest for one season finished")
    del x_tr, y_tr

    ##-------- Random forest prediction ------------##
    y_pred = rf.predict(x_te)
    # predict() returns a 1-D array when there is a single target, whereas
    # inverse_transform needs (n_samples, n_outputs). For two targets this
    # reshape is a no-op.
    y_pred = np.asarray(y_pred).reshape(-1, len(output))
    # Transfer back to the original (continuous) scale
    y_pred = discretizer.inverse_transform(y_pred)
    y_te = discretizer.inverse_transform(y_te)
    _, y_te_hs = split_hs_test(x_te, y_te)
    _, y_pred_hs = split_hs_test(x_te, y_pred)
    del x_te, y_te, y_pred
    mse, rsq = compute_mse_rsq(y_te_hs, y_pred_hs)

    ##----------- Plot and save the prediction result ------------##
    if len(output) == 2:
        plot_ys(y_pred_hs, y_te_hs, OUTPUT_FOLDER, save=True, name=('%s' % season))
    elif len(output) == 1:
        plot_ys_single(y_pred_hs, y_te_hs, OUTPUT_FOLDER, save=True, name=('%s' % season))
    write_rf_prediction(OUTPUT_FOLDER, bins, mse, rsq, season)
    # One averaged magnitude per entry of y_pred_hs, stored as a single row
    profile = np.zeros([1, len(y_pred_hs)])
    for j in range(len(y_pred_hs)):
        profile[0, j] = magnitude_avg(y_pred_hs[j])
    df = pd.DataFrame(profile)
    df.to_csv(OUTPUT_FOLDER + "/magnitude_average_pred.txt", mode='a', header=None)

    ##----------- Plot and save the feature importance result ------------##
    importances = rf.feature_importances_
    fi_rows.append(np.asarray(importances).ravel())
    # Rebuilt on every pass so that `fi` holds all seasons processed so far,
    # even if the loop is interrupted before the last one.
    fi = pd.DataFrame(fi_rows, columns=feat_labels)
    important_features = extract_important_features(feat_labels, importances, 0.8)
    ## For the ordering feature importance
    filetxt.write("\n For the %s: \n" % season)
    write_feature_importance(filetxt, importances, feat_labels, important_features)
    ## The feature importance for the ridge regression
    csvwriter.writerow([season] + important_features)
    # Push this season's results to disk so an interrupted run keeps them
    filetxt.flush()
    filecsv.flush()
    del rf

KeyboardInterrupt: 

## Save and close some files

In [None]:
# Close the report files first so they are flushed and released even if saving
# or plotting the feature importances below raises.
filetxt.close()
filecsv.close()

# Persist the feature-importance table (one row per processed season) and its plot.
fi.to_csv(OUTPUT_FOLDER + "/feature_importance.txt", header=True)
plot_feature_importance(OUTPUT_FOLDER, fi, name='feature_importance')