In [42]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier


In [43]:
def readfile(bestand):
    """Read a measurement workbook and split it into features and one-hot labels.

    Parameters
    ----------
    bestand : str, path-like or file-like
        Location of the Excel workbook; passed unchanged to ``pd.read_excel``.
        The first column of the sheet becomes the index.

    Returns
    -------
    X : pandas.DataFrame
        Every numeric column of the sheet, plus a ``time`` column holding a
        copy of the index (kept only when the index itself is numeric).
    y : pandas.DataFrame
        One-hot encoding of every non-numeric column, with the
        ``Health_state_H`` column removed when it exists.
    """
    table = pd.read_excel(bestand, index_col=0)
    # Expose the index as a regular column so it can be selected as a feature.
    # NOTE(review): presumably the index is the sample time -- confirm.
    table['time'] = table.index

    X = table.select_dtypes(include='number')

    # 'Health_state_H' is dropped so it acts as the implicit reference class
    # (all remaining indicator columns zero).  errors='ignore' keeps workbooks
    # that contain no 'H' rows loadable instead of raising KeyError.
    y = pd.get_dummies(table.select_dtypes(exclude='number')).drop(
        columns='Health_state_H', errors='ignore')

    return X, y

In [44]:
def make_arr(XX):
    arr = pd.DataFrame(index=XX.index)
    arr[' ARR_1  '] = abs( (XX['V0_measured']-XX['V1_measured']) - (XX['V1_measured']-XX['V2_measured']) )
    
    dumV = XX['S'].diff().fillna(1).abs()*(XX['V0_measured']-XX['V2_measured'])
    dumV.replace(to_replace=0.0, value=np.nan, inplace=True)
    dumV.ffill(inplace=True)
    
    dumT = XX['S'].diff().fillna(1).abs()*(XX.index)             #identify all lines immediately after a switch transition
    dumT.replace(to_replace=0.0, inplace=True, method='ffill')   #forward the last switch time 

    arr[' ARR_2  '] = abs( (XX['V0_measured']-XX['V2_measured']) - dumV*np.exp(-(dumT.index - dumT)/4)  )
    
    return arr

In [45]:
# fit a RF model and determine gini importance

def selectRFmodel(XX, y, n_estimators=100, random_state=None):
    """Fit a random forest classifier and return it with its feature importances.

    Parameters
    ----------
    XX : array-like
        Feature matrix passed to ``RandomForestClassifier.fit``.
    y : array-like
        Target(s) passed to ``RandomForestClassifier.fit``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to the classifier.  The default ``None`` keeps the
        previous behaviour (a different forest on every call); pass an integer
        to make the fit and the reported importances reproducible.

    Returns
    -------
    rfmodel : RandomForestClassifier
        The fitted model.
    feature_importances : array-like
        ``rfmodel.feature_importances_`` of the fitted model (impurity-based
        importances, computed with the Gini criterion).
    """
    rfmodel = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion="gini",
        random_state=random_state,
    )
    # Fit first, then read the importances: the attribute only exists on a
    # fitted estimator, so the order is made explicit here.
    rfmodel.fit(XX, y)

    return rfmodel, rfmodel.feature_importances_

In [46]:
# Load the balanced and complete training data into pandas DataFrames:
# df_X receives the numeric feature columns, df_y the one-hot health-state labels.

# Local copy of the same workbook (absolute path, only valid on the author's machine):
#df_X, df_y = readfile("C:/Users/Admin/Pythonprojects/RAMS/data/Arduino_R10kR10kC200mu_10sec.xlsx")
df_X, df_y = readfile("https://raw.githubusercontent.com/chrisrijsdijk/RAMS/master/data/Arduino_R10kR10kC200mu_10sec.xlsx")

In [49]:
# Random forest trained directly on the measured features (df_X -> df_y).
rf_X_RC_model, rf_X_RC_featimp = selectRFmodel(df_X, df_y)

# Column labels of df_X, kept to label the feature importances afterwards.
X_names = df_X.columns

In [50]:
# Random forest trained on the ARR residual features derived from df_X.
df_as = make_arr(df_X)
rf_A_RC_model, rf_A_RC_featimp = selectRFmodel(df_as, df_y)

# Column labels of df_as, kept to label the feature importances afterwards.
as_names = df_as.columns

In [51]:
# Report the Gini importances of both forests, most important feature first.
banner = '###########################################################################'
label_X = 'Gini importance of the model X'
label_A = 'Gini importance of the model A'

importance_X = pd.DataFrame(rf_X_RC_featimp, index=X_names, columns=[label_X])
importance_A = pd.DataFrame(rf_A_RC_featimp, index=as_names, columns=[label_A])

print(banner)
print(importance_X.sort_values(by=label_X, ascending=False))
print(' ')
print(importance_A.sort_values(by=label_A, ascending=False))
print(banner)

###########################################################################
             Gini importance of the model X
V2_measured                        0.397304
V1_measured                        0.369779
time                               0.202113
V0_measured                        0.022788
S                                  0.008016
 
          Gini importance of the model A
 ARR_2                          0.521028
 ARR_1                          0.478972
###########################################################################


In [52]:

# Load the validation workbook with the same reader used for the training data.
# NOTE(review): this is an absolute path on the author's machine, so the cell
# fails on any other computer -- point it at a shared or relative location.
val_X, val_y = readfile("C:/Users/Admin/Pythonprojects/RAMS/notebook/Arduino_RRC/kanweg.xlsx")


In [53]:
# Predictions of the raw-feature forest, aligned on the validation index and
# shown next to the validation features and the true labels.
pred_X = pd.DataFrame(rf_X_RC_model.predict(val_X), index=val_y.index)
pd.concat([val_X, val_y, pred_X], axis='columns')

Unnamed: 0,S,V0_measured,V1_measured,V2_measured,time,Health_state_C,Health_state_R,0,1
0.0,1,4.995117,4.667969,4.326172,0.0,0,0,1,0
1.025012,1,4.995117,4.746094,4.487305,1.025012,0,0,1,0
2.052483,1,4.995117,4.799805,4.604492,2.052483,0,0,1,0
3.078557,1,4.995117,4.848633,4.692383,3.078557,0,0,1,0
4.104397,1,4.995117,4.87793,4.760742,4.104397,0,0,1,0
5.12714,1,4.995117,4.907227,4.814453,5.12714,0,0,1,0
6.149997,1,4.995117,4.926758,4.853516,6.149997,0,0,1,0
7.169579,1,4.995117,4.936523,4.882812,7.169579,0,0,1,0
8.201067,1,4.995117,4.956055,4.912109,8.201067,0,0,1,0
9.226753,1,4.995117,4.960938,4.931641,9.226753,0,0,1,0


In [40]:
# Derive the ARR residual features for the validation set, the same way as for training.
val_as = make_arr(val_X)

In [41]:
# Predictions of the ARR-feature forest, aligned on the validation index and
# shown next to the ARR features and the true labels.
pred_A = pd.DataFrame(rf_A_RC_model.predict(val_as), index=val_y.index)
pd.concat([val_as, val_y, pred_A], axis='columns')

Unnamed: 0,ARR_1,ARR_2,Health_state_C,Health_state_R,0,1
0.0,0.014648,0.0,0,0,1,0
1.025012,0.009766,0.009915,0,0,0,0
2.052483,0.0,0.009822,0,0,0,0
3.078557,0.009766,0.007108,0,0,0,0
4.104397,0.0,0.005377,0,0,0,0
5.12714,0.004883,0.004996,0,0,0,0
6.149997,0.004883,0.002167,0,0,1,0
7.169579,0.004883,0.000885,0,0,1,0
8.201067,0.004883,0.003086,0,0,1,0
9.226753,0.004883,0.003144,0,0,1,0
