In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier

### Helper functions

In [None]:
def merge_scores(list_with_scores):
    """
    Merge scores from cross-validation into one score dictionary.

    For every key of the first score dictionary, the values of all
    dictionaries are concatenated along the first axis, in list order.
    The input dictionaries are not modified; a new dictionary is returned.

    Parameters
    ----------
    list_with_scores : list
        contains the crossvalidation score dictionaries to merge; every
        dictionary must provide all keys of the first one

    Returns
    -------
    dict
        new dictionary with the merged scores (one numpy array per key);
        empty if `list_with_scores` is empty

    Raises
    ------
    KeyError
        if one of the later dictionaries lacks a key of the first one
    """

    # Nothing to merge: return an empty result instead of failing on index 0.
    if not list_with_scores:
        return {}

    # One concatenation per key (instead of re-concatenating the growing
    # result for every trial) and a fresh dict, so the caller's first
    # score dictionary is not overwritten as a side effect.
    return {
        key: np.concatenate([scores[key] for scores in list_with_scores], axis=0)
        for key in list_with_scores[0]
    }

### Loading the file with features

In [None]:
# Read the pre-computed feature table.
dataset = pd.read_csv("../data/features.csv")

# Positional split: the columns from index 4 up to (but excluding) the last one
# are the features, the last column is the target.
X, y = dataset.iloc[:, 4:-1], dataset.iloc[:, -1]

## Preprocessing

In [None]:
# NOTE: this cell overwrites X and y (DataFrame/Series -> numpy arrays), so it has
# to be run exactly once after the loading cell; re-run the loading cell first
# if it needs to be executed again.

# removing the columns with missing values
X = X.dropna(axis=1)

# adding the performed activities and imaging session number as dummy variable
# (column 3 is presumably the activity and column 2 the session number - TODO confirm).
# dtype=float: pandas >= 2.0 creates boolean dummies by default, and a frame that
# mixes float and bool columns is turned into an object-dtype array by np.array().
# Float dummies keep X a plain numeric array on every pandas version.
# NOTE: with the default prefix_sep="_" the session columns are named "rep__<value>".
activity_dummies = pd.get_dummies(dataset.iloc[:,3], drop_first=True, dtype=float)
session_dummies = pd.get_dummies(dataset.iloc[:,2], prefix="rep_", drop_first=True, dtype=float)
X = pd.concat([X, activity_dummies, session_dummies], axis=1)

# Saving the column names
col_names = X.columns

# convert to numpy arrays
X, y = np.array(X), np.array(y)

# Standardization with standard scaler
# NOTE(review): the scaler is fitted on all samples before the cross-validation
# below, so the held-out folds contribute to the scaling statistics.
standard_scaler = StandardScaler()
norm_X = standard_scaler.fit_transform(X)

## Training the machine learning model

In [None]:
# Constants used for training
# Number of times the whole nested cross-validation is repeated
# (each repetition uses differently seeded splits)
NUM_TRIALS = 10
# Number of folds, used for both the inner (grid search) and the outer CV splitter
K_FOLD = 10
# Passed as n_jobs to GridSearchCV and cross_validate; -1 means "use all processors"
N_JOBS = -1

In [None]:
# Random Forest
# Fixed seed for the forest itself (bootstrap samples and feature sub-sampling).
# The CV splitters below are seeded per trial, but an unseeded model would still
# make every run of this cell produce different scores and importances.
RF_RANDOM_STATE = 0
rf = RandomForestClassifier(random_state=RF_RANDOM_STATE)

# parameter grid for GridSearchCV
# "log_loss" is intentionally not listed next to "entropy": scikit-learn implements
# both as the same Shannon-information-gain criterion, so with a seeded forest the
# two candidates are identical and the extra one only adds fitting time.
p_grid = {
    'n_estimators': [1000],
    'criterion': ["gini", "entropy"],
    'max_depth': [50, 100, None],
    'max_features': ["sqrt", "log2"]
}

nested_scores = []

for i in range(NUM_TRIALS):
    # Differently seeded shuffles per trial for the inner (model selection)
    # and the outer (performance estimation) splitter.
    inner_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=K_FOLD, shuffle=True, random_state=i*10)

    # Inner loop: hyper-parameter search; outer loop: scoring of the tuned model.
    # return_estimator=True keeps the fitted GridSearchCV of every outer fold,
    # which is needed afterwards to read the feature importances.
    clf = GridSearchCV(estimator=rf, param_grid=p_grid, scoring="roc_auc", cv=inner_cv, n_jobs=N_JOBS)
    nested_cv = cross_validate(estimator=clf, scoring="roc_auc", X=norm_X, y=y, n_jobs=N_JOBS, cv=outer_cv, return_estimator=True)

    print(f"RF Trial {i + 1}/{NUM_TRIALS}")
    nested_scores.append(nested_cv)

# Collapse the per-trial result dictionaries into a single dictionary
nested_scores = merge_scores(nested_scores)


Extracting the feature importances of the best estimator (the refitted grid-search winner) from each outer cross-validation fold of every trial

In [None]:
# One row per feature; one importance column per fitted outer-fold estimator
df_dict = {'features': col_names}
for fold_no, search in enumerate(nested_scores["estimator"], start=1):
    # importances of the best model found by the grid search of this fold
    df_dict[f'cv_{fold_no}_importances'] = search.best_estimator_.feature_importances_

df = pd.DataFrame(df_dict)
df

Calculating row mean and standard deviation

In [None]:
# Only the per-fold importance columns enter the statistics. Selecting them by
# name (instead of "every column after the first") keeps the freshly added
# 'rowsMean' column out of the standard deviation and makes the cell safe to re-run.
importance_cols = df.filter(regex=r'^cv_\d+_importances$')
df['rowsMean'] = importance_cols.mean(axis=1)
df['rowsStd'] = importance_cols.std(axis=1)

Sorting by mean feature importance and selecting the best 20 features

In [None]:
# Rank the features from highest to lowest mean importance
sorted_df = df.sort_values(by='rowsMean', ascending=False)
# Show the 20 top-ranked features
sorted_df.head(20)

Plotting the figure of the best 20 features and their Gini importance

In [None]:
NUM_FEATURES = 20

# Take the top rows once (explicitly by position) so that labels, bar lengths and
# error bars always have the same length, also when fewer than NUM_FEATURES
# features are available.
top_rows = sorted_df.iloc[:NUM_FEATURES]
top_features = top_rows["features"]


fig, ax = plt.subplots(figsize=(5,5))
# one bar position per plotted feature
y_pos = np.arange(len(top_rows))

# horizontal bars: mean importance, with the standard deviation as error bar
ax.barh(y_pos, top_rows['rowsMean'], xerr=top_rows['rowsStd'], align='center')
ax.set_yticks(y_pos, top_features)
# most important feature at the top
ax.invert_yaxis()
ax.set_xlabel('Gini importance')