In [None]:
#=================== Random Forest selected features ========================================================================================================#
#=================== Export industries with Random Forest selected features===================================================================================#


!pip install dask
!pip install pandas
!pip install numpy
!pip install sklearn

import dask.dataframe as dd
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler



##### Locate the per-industry input files (one CSV per industry)

path = r'C:/Users/endou012/emile/filemonth'
# glob.glob() returns matches in arbitrary, OS-dependent order. The cells below address
# industries by their position in this list, so sort it to make that position reproducible.
all_files = sorted(glob.glob(path + "/*.csv"))

In [2]:
###### Use GridSearchCV to fine-tune the hyperparameters of the Random Forest


# Create the parameter grid based on the results of random search
# (4 * 3 * 3 * 3 * 3 = 324 candidate combinations)
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3,4],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}

# Create the base model (fixed random_state so repeated fits give the same forest)
rf = RandomForestRegressor(random_state = 42)

# Instantiate the grid search model: 3-fold CV over the grid, using all CPU cores
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)


# For each industry, fine-tune the hyperparameters and keep the best estimator.
# Best_grid[k] corresponds to all_files[k].

Best_grid=[]


# Iterate over the files actually found instead of a hard-coded range(63), so the cell
# neither raises IndexError with fewer files nor silently skips additional ones.
for csv_path in all_files:
    file=pd.read_csv(csv_path,index_col='date')
    # First column is the target; every remaining column is a candidate feature.
    file_x1=file[file.columns[1:]]
    file_x=StandardScaler().fit_transform(file_x1)
    file_y=file[file.columns[0]]
    grid_search.fit(file_x, file_y)
    # best_estimator_ is the estimator GridSearchCV refitted with the best parameters.
    best_grid = grid_search.best_estimator_
    Best_grid.append(best_grid) 

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Fitting 3 folds for each of 324 candidates, totalling 9

In [4]:
# For each industry, fit the tuned Random Forest model on the entire data sample and keep
# the N_SELECTED most important features. The first lag of the target variable is added
# as an additional feature.

N_SELECTED = 30  # number of top-ranked features retained per industry

Df_2=[]

# Iterate over the files actually found instead of a hard-coded range(63).
# Best_grid[i] is expected to be the tuned estimator for all_files[i].
for i, csv_path in enumerate(all_files):
    file=pd.read_csv(csv_path,index_col='date')
    # First column is the target; every remaining column is a candidate feature.
    file_x1=file[file.columns[1:]]
    file_x=StandardScaler().fit_transform(file_x1)
    file_y=file[file.columns[0]]
    features=np.array(file_x1.columns)
    rf = Best_grid[i]
    rf.fit(file_x, file_y)
    # (feature name, importance) pairs, least important first.
    f_a = list(zip(features,rf.feature_importances_))
    f_a.sort(key = lambda x : x[1])
    # Everything except the last N_SELECTED entries is dropped. With 86 features this is
    # exactly the original `f_a[0:56]`; with N_SELECTED features or fewer nothing is dropped.
    # The original `importance >= 0` filter is omitted: Random Forest importances are
    # non-negative, so it never excluded anything.
    A=[name for name, _ in f_a[:-N_SELECTED]]

    file_xx=file_x1.drop(A,axis=1)
    df=pd.concat([file_y,file_xx],axis=1)
    # Lagged target as an extra predictor; the first row has no lag and becomes NaN.
    df['first_lag_excess_return']=file_y.shift(periods=1)
    # Replaces every NaN in the frame (including the first lag value) with 0.
    df=df.fillna(0)
    Df_2.append(df)


# Graphical representation of selected variables

In [None]:
N_SELECTED = 30      # number of top-ranked features kept per industry
PLOT_INDUSTRY = 62   # position (in all_files) of the industry whose importances are plotted
N_PLOTTED = 15       # number of bars in the figure

# Df[k] holds the (feature name, importance) pairs of the N_SELECTED most important
# features of industry k, sorted from least to most important.
Df=[]

# Iterate over the files actually found instead of a hard-coded range(63).
# Best_grid[i] is expected to be the tuned estimator for all_files[i].
for i, csv_path in enumerate(all_files):
    file=pd.read_csv(csv_path,index_col='date')
    # First column is the target; every remaining column is a candidate feature.
    file_x1=file[file.columns[1:]]
    file_x=StandardScaler().fit_transform(file_x1)
    file_y=file[file.columns[0]]
    features=np.array(file_x1.columns)
    rf = Best_grid[i]
    rf.fit(file_x, file_y)
    f_a = list(zip(features,rf.feature_importances_))
    f_a.sort(key = lambda x : x[1])

    # With 86 features this equals the original `f_a[56:]`.
    Df.append(f_a[-N_SELECTED:])

# The list is sorted ascending, so this slice holds the N_PLOTTED *least* important of the
# retained features and needs no re-sorting.
# NOTE(review): the `_1` suffix of the file name suggests a two-part figure - confirm that
# the lower-ranked half is the one intended here.
f_b=Df[PLOT_INDUSTRY][0:N_PLOTTED]
plt.barh([x[0] for x in f_b],[x[1] for x in f_b])
plt.xlabel('Feature importance')
plt.tight_layout()  # keep long feature names inside the saved page

# Save before show(): show() may leave an empty current figure behind.
plt.savefig(f'imp_var{PLOT_INDUSTRY}_1.pdf')
plt.show()

In [8]:
###### Export industries with Random Forest selected features.

# One CSV per industry, named file_<position in Df_2>.csv. A loop replaces the 63
# hand-written calls, two of which wrote to misspelled names ('ile_16.csv', 'ile_17.csv').
# The output directory must already exist.
for i, selected in enumerate(Df_2):
    selected.to_csv(f'C:/Users/endou012/emile/month/file_{i}.csv')
