In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.metrics import auc, plot_roc_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

# Load the semicolon-separated data set; the "Target" column holds the labels.
df = pd.read_csv("Data.csv", sep=";")
y = df["Target"]
# Features are all columns except the first and the last one.
# NOTE(review): presumably the first column is an identifier and the last one
# is "Target" -- confirm against Data.csv, otherwise the label leaks into X.
X = df.iloc[:, 1:-1]

# No feature scaling is applied: the random forest is used with
# unstandardized data.

# Outer split: 30 % of the samples are held out as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, shuffle=True
)

# Inner split: 20 % of the training part is set aside for validation.
# The "*_train_val" names are the inner training part and the "*_test_val"
# names are the validation part.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1, shuffle=True
)

# Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.metrics import auc, plot_roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
# Build a forest with library defaults (fixed seed only) and list its
# hyperparameters as a reference point for the tuned values used later.
rfc = RandomForestClassifier(random_state=42)
default_params = rfc.get_params()
print('Parameters currently in use:\n')
pprint(default_params)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


## Training

In [None]:
# Validation run: fit the tuned forest on the inner training split and
# evaluate it on the validation split.
# Best: 0.910809 using {'bootstrap': False, 'max_depth': 10, 'max_features': 'auto',
#                       'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}
rfc = RandomForestClassifier(n_estimators=200, min_samples_split=8, min_samples_leaf=1,
                             max_depth=10, bootstrap=False, criterion="gini",
                             max_features="auto",
                             random_state=42)  # fixed seed so a re-run gives the same numbers
rfc.fit(X_train_val, y_train_val)
pred = rfc.predict(X_test_val)
print(classification_report(y_test_val, pred, digits=3))
print(confusion_matrix(y_test_val, pred))

# Average precision summarises the precision-recall curve, so it needs
# continuous scores rather than thresholded class labels.
# Assumes a binary target whose positive class is rfc.classes_[1] -- TODO confirm.
proba_pos = rfc.predict_proba(X_test_val)[:, 1]
print(f"Average precision score {average_precision_score(y_test_val, proba_pos):.3f} ")

## Test-Data

In [None]:
# Final evaluation: refit the tuned forest on the full training split and
# report its performance on the held-out test split.
# Best: 0.910809 using {'bootstrap': False, 'max_depth': 10, 'max_features': 'auto',
#                       'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}
rfc = RandomForestClassifier(n_estimators=200, min_samples_split=8, min_samples_leaf=1,
                             max_depth=10, bootstrap=False, criterion="gini",
                             max_features="auto",
                             random_state=42)  # fixed seed so a re-run gives the same numbers
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred, digits=3))
print(confusion_matrix(y_test, pred))

## Cross validation

In [None]:
# 5-fold stratified cross-validation on the training split. One ROC curve is
# drawn per fold, followed by the mean curve and a +/- 1 standard deviation band.
cv = StratifiedKFold(n_splits=5)
rfc = RandomForestClassifier(n_estimators=200, min_samples_split=8, max_depth=10,
                             min_samples_leaf=1,
                             bootstrap=False, max_features="auto",
                             random_state=42)  # fixed seed so the curves are reproducible

tprs = []   # per-fold TPR values interpolated onto mean_fpr
aucs = []   # per-fold AUC values
mean_fpr = np.linspace(0, 1, 100)   # common FPR grid shared by all folds

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    # cv.split yields positional indices, hence .iloc
    rfc.fit(X_train.iloc[train], y_train.iloc[train])
    # NOTE(review): plot_roc_curve is deprecated in newer scikit-learn releases
    # in favour of RocCurveDisplay.from_estimator; kept for the version in use.
    viz = plot_roc_curve(rfc, X_train.iloc[test], y_train.iloc[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    # Resample the fold's curve onto the common grid so folds can be averaged.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0   # force the curve through the origin
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0   # force the mean curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="a) Receiver operating characteristics: RFC")
ax.legend(loc="lower right")

# Write the figure to disk in two formats before showing it.
plt.savefig("rfc.pdf", dpi=600, transparent=True)
plt.savefig("rfc.jpg", dpi=600, transparent=True)
plt.show()

## Y-Randomization

In [None]:
from sklearn.metrics import average_precision_score

# Y-randomization check: refit the model on permuted labels many times. A
# model that has learned real structure should score clearly better on the
# true labels than on these permuted ones.
N_PERMUTATIONS = 100

Accur = []   # average precision of each permuted run (name kept for compatibility)
for i in range(N_PERMUTATIONS):
    # Seed each permutation with the loop index: reproducible, yet different per run.
    y_rand = shuffle(y, random_state=i)
    # Use dedicated names so the real X_train/X_test/y_train/y_test split is
    # not overwritten with permuted labels for the cells that follow.
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(
        X, y_rand, test_size=0.30, random_state=1)
    rfc.fit(X_train_rand, y_train_rand)
    # Average precision needs continuous scores, not thresholded class labels.
    # Assumes a binary target whose positive class is rfc.classes_[1] -- TODO confirm.
    proba_rand = rfc.predict_proba(X_test_rand)[:, 1]
    Accur.append(average_precision_score(y_test_rand, proba_rand))
print(f"Mean average precision score = {np.mean(Accur):.3f}, std {np.std(Accur):.3f}")

## Feature importance

In [None]:
# Impurity-based (MDI) feature importances of the tuned forest.
# Best: 0.910809 using {'bootstrap': False, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}

# NOTE(review): this cell fits on whatever X_train / y_train currently hold.
# They must still be the original split from the data-loading cell; verify
# that no cell executed before this one has rebound them.

feature_names = [f'feature {i}' for i in range(X.shape[1])]   # not used below; kept in case another cell reads it
rfc = RandomForestClassifier(n_estimators=200, min_samples_split=8, min_samples_leaf=1,
                             max_depth=10, bootstrap=False, criterion="gini",
                             max_features="auto",
                             random_state=42)  # fixed seed so the importances are reproducible
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)

# Forest-level importances, plus their spread across the individual trees.
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
names = X.columns
d_imp = pd.DataFrame({"Means": importances, "STD": std}, index=names)
# Keeps the first nine features in column order.
# NOTE(review): this is not the nine most important features -- confirm that
# column order, not a ranking by importance, is what is wanted here.
d_imp = d_imp.iloc[0:9, :]

fig, ax = plt.subplots()
d_imp.plot.bar(y="Means", yerr=d_imp["STD"], ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
# plt.show() renders under the inline backend; fig.show() only emits a
# "non-GUI backend" warning there.
plt.show()