In [None]:
%matplotlib inline

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

from sklearn.ensemble import RandomForestClassifier

## Read in training data and label dictionary

### Define the data and label paths

In [None]:
# Input 1: whitespace-delimited training table written by the previous step
data_path = "results/training_data_singlepoints.txt"

# Input 2: JSON file holding the class-label dictionary from the previous step
labels_path = "results/class_labels.json"

### Load the data and identify the feature columns for the model

In [None]:
# Load the numeric training table. np.loadtxt ignores lines that start with
# "#", so the header line is not part of the array. ndmin=2 keeps a file with
# a single data row as a (1, n_columns) table instead of a flat vector.
model_input = np.loadtxt(data_path, ndmin=2)

# Read the header line separately to recover the column names
with open(data_path, "r") as file:
    header = file.readline()

# Strip the leading comment symbol, then split into names. Stripping the "#"
# characters (instead of discarding the first whitespace-separated token)
# also handles a header written without a space after "#", e.g. "#label f1",
# where dropping the first token would silently lose the label column.
column_names = header.lstrip("#").split()

# Fail early, with a clear message, if the header does not describe the data
if len(column_names) != model_input.shape[1]:
    raise ValueError(
        f"Header of {data_path} names {len(column_names)} columns, "
        f"but the data has {model_input.shape[1]} columns"
    )

# First column is the class label; every remaining column is a feature
label_col = column_names[0]
feature_cols = column_names[1:]

print(f"Label column:\n{label_col}\n")
print(f"Feature columns:\n{feature_cols}\n")

# Position of each feature column within the training data table
model_col_indices = [column_names.index(var_name) for var_name in feature_cols]

### Load the class labels dictionary

In [None]:
# Parse the class-label JSON file into a Python object
with open(labels_path, "r") as labels_file:
    labels_dict = json.load(labels_file)

# Show the parsed labels for a quick visual check
print(labels_dict)

## Convert model input into sklearn format

In [None]:
# Wrap the raw array in a DataFrame so columns can be addressed by name
model_input_df = pd.DataFrame(model_input, columns=column_names)

# Features: every column except the label
X = model_input_df.drop(columns=label_col)

# Labels: kept as a single-column DataFrame (flattened later at fit time)
y = model_input_df.loc[:, [label_col]]

### Check data counts for each class

In [None]:
# Number of training samples per class label (displayed as the cell output)
class_counts = model_input_df.loc[:, label_col].value_counts()
class_counts

## Fit a Random Forest model for feature importance
This step is purely to investigate the feature importance in a Random Forest model, without any model optimisation. This step can be used to understand which features are predictive of the class label. From this, the feature list can be refined or expanded.

In [None]:
# Flatten the single-column label frame into the 1-D array passed to fit()
labels = y.to_numpy().ravel()

# Random Forest with default hyperparameters and a fixed random_state
model = RandomForestClassifier(random_state=42)

# Train on all features; the trailing ";" hides the estimator repr
model.fit(X, labels);

### Display the features in ascending order of importance

In [None]:
# Sort features from least to most important
importances = model.feature_importances_
order = np.argsort(importances)
sorted_names = np.array(feature_cols)[order]
n_features = len(sorted_names)

# Horizontal bar chart: one bar per feature, smallest importance at the bottom
fig, ax = plt.subplots(figsize=(6, 12))
ax.barh(range(n_features), importances[order])
ax.set_xlabel("Importance", fontsize=14)
ax.set_ylabel("Feature", fontsize=14)
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(sorted_names, rotation=0, fontsize=8)

# One slot of padding below and above the bars; small margin past the longest bar
ax.set_ylim([-1, n_features])
ax.set_xlim(0, importances.max() + 0.001)

fig.savefig("results/feature_importance.png", dpi=300, bbox_inches="tight", facecolor="white")

## Investigate correlation between features

In [None]:
# Pairwise correlation between feature columns; the sign is discarded so that
# strong negative and strong positive correlations are treated alike
feature_corr = X.corr()
correlation_matrix = feature_corr.abs()

In [None]:
# Boolean mask that is True strictly above the main diagonal (k=1), so each
# feature pair is kept once and self-correlations are excluded
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

# Entries outside the mask become NaN
upper_tri = correlation_matrix.where(above_diagonal)

In [None]:
# Absolute correlation above which a feature is treated as redundant
CORRELATION_THRESHOLD = 0.9

# upper_tri only holds values above the diagonal, so a column is flagged when
# it exceeds the threshold against a column that appears before it.
# NaN entries (the masked-out cells) compare as False.
exceeds_threshold = (upper_tri > CORRELATION_THRESHOLD).any(axis=0)

# Names of the flagged columns, in their original column order
to_drop = upper_tri.columns[exceeds_threshold.to_numpy()].tolist()
print(); print(to_drop)

In [None]:
# Feature table with the highly correlated columns removed (X itself is untouched)
df1 = X.drop(columns=to_drop)

# Blank line, then the names of the features that remain
print()
print(df1.columns)

In [None]:
# Flatten the single-column label frame into the 1-D array passed to fit()
labels = y.to_numpy().ravel()

# Fresh Random Forest with the same settings as before (fixed random_state)
model = RandomForestClassifier(random_state=42)

# Train on the reduced feature table; the trailing ";" hides the estimator repr
model.fit(df1, labels);

In [None]:
# Sort the remaining features from least to most important
order = np.argsort(model.feature_importances_)
ranked_features = np.array(df1.columns)[order]
ranked_importances = model.feature_importances_[order]

# Vertical bar chart: one bar per feature, importance increasing left to right
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(x=ranked_features, height=ranked_importances)
ax.set_xlabel("Feature", labelpad=10)
ax.set_ylabel("Importance", labelpad=10)

# Rotate the feature names so they remain readable
ax.set_xticks(np.arange(len(df1.columns)))
ax.set_xticklabels(ranked_features, rotation=90)
fig.tight_layout()

fig.savefig("results/feature_importance_afterremovecorrelation.png", dpi=300, bbox_inches="tight", facecolor="white")