In [7]:
import os
import pickle

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE

In [2]:
def find(s, ch):
    """Return every index of ``s`` whose element equals ``ch``.

    ``s`` is any iterable (a string in this notebook); the indices are
    returned as a list in ascending order, empty when nothing matches.
    """
    positions = []
    for index, item in enumerate(s):
        if item == ch:
            positions.append(index)
    return positions

In [3]:
# Build `data_df`: one row of summary-statistic features per CSV file found
# under <compression_level>/<train|validate>/.
compression_levels = ["low", "medium", "high"]
train_validate_list = ["train", "validate"]

# Rows are gathered in a list and converted to a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies the whole frame on every iteration.
records = []

for level_i, compression_level in enumerate(compression_levels):
    for train_validate in train_validate_list:
        path = os.path.join(compression_level, train_validate)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded models fitted on it) reproducible.
        for csv_name in sorted(os.listdir(path)):
            file_path = os.path.join(path, csv_name)

            # The first row and the first column of each file are discarded
            # (presumably a header and an index column -- TODO confirm).
            df = pd.read_csv(file_path, header=None, skiprows=1)
            df = df.drop(df.columns[0], axis=1)
            df = df.to_numpy()

            # Debug aid: uncomment to display the image.
            # plt.imshow(df.T[::-1])
            # plt.show()

            # Smooth, then run Canny edge detection on the 8-bit version.
            df_blur = cv2.GaussianBlur(df, (3, 3), 0)
            edges = cv2.Canny(image=np.uint8(df_blur), threshold1=100, threshold2=200)

            # Debug aid: uncomment to display the edge map.
            # plt.imshow(edges, cmap=plt.get_cmap('gray'))
            # plt.show()

            # Count edge pixels. The previous `edges[edges > 255] = 1` could
            # never match on an 8-bit edge map, so the old sum was 255 times
            # the number of edge pixels rather than the number itself.
            edges_count = int(np.sum(edges > 0))

            df_flattened = df.flatten()

            # Debug aid: uncomment to display the histogram of the values.
            # plt.hist(df)
            # plt.show()

            # File names are split on "_": the last field is ignored, the one
            # before it is the period, the one before that the main type, and
            # everything up to the second-to-last "_" is the sensor type.
            indexes = find(csv_name, "_")
            name_parts = csv_name.split("_")
            sensor_type = csv_name[: indexes[-2]]
            main_type = name_parts[-3]
            period = name_parts[-2]

            # Whole-array statistics.
            mean = df_flattened.mean()
            sd = df_flattened.std()
            max_val = df_flattened.max()
            min_val = df_flattened.min()
            median = np.median(df_flattened)
            trimmed_mean = stats.trim_mean(df_flattened, 0.2)
            skew = stats.skew(df_flattened)

            # Heuristic "poisson" flag: the mean lies within the top quarter
            # of the value range.
            spread = max_val - min_val
            is_poisson = bool(max_val - mean < 0.25 * spread)

            # Statistics of the values below / at-or-above the mid-range point.
            # NOTE(review): `left` is empty when every value is identical, in
            # which case its statistics are NaN.
            middle_val = spread / 2 + min_val
            left = df_flattened[df_flattened < middle_val]
            right = df_flattened[df_flattened >= middle_val]

            records.append({
                "Compression": level_i,
                "Train_validate": train_validate,
                "Type": sensor_type,
                "Main_Type": main_type,
                "Period": period,
                # Features
                "Edges": edges_count,
                "Mean": mean,
                "Trimmed_mean": trimmed_mean,
                "Median": median,
                "Skew": skew,
                "Is_poisson": is_poisson,
                "SD": sd,
                "Max": max_val,
                "Min": min_val,
                "Left_mean": left.mean(),
                "Left_SD": left.std(),
                "Left_median": np.median(left),
                "Left_skew": stats.skew(left),
                "Right_mean": right.mean(),
                "Right_SD": right.std(),
                "Right_median": np.median(right),
                "Right_skew": stats.skew(right),
            })

# Column order follows the key order of the record dictionaries.
data_df = pd.DataFrame(records)

In [4]:
def clean_data(data_df, train_validate):
    """Split one partition of the feature table into features and labels.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table with one row per sample. It must contain the columns
        "Compression", "Train_validate", "Type", "Main_Type" and "Period".
    train_validate : str
        Value of the "Train_validate" column selecting the rows to keep
        ("train" or "validate" in this notebook).

    Returns
    -------
    X : pandas.DataFrame
        The selected rows with the label and metadata columns removed.
    y : pandas.Series
        The "Compression" column of the same rows (same index as ``X``).
    """
    # Filter once and derive both outputs from the same subset.
    subset = data_df[data_df["Train_validate"] == train_validate]

    # Label plus bookkeeping columns that must not be used as features.
    non_feature_columns = ["Compression", "Train_validate", "Type", "Main_Type", "Period"]
    X = subset.drop(columns=non_feature_columns)
    y = subset["Compression"]

    return X, y

In [5]:
# Split the feature table into its "train" and "validate" partitions.
X_train, y_train = clean_data(data_df, "train")
X_test, y_test = clean_data(data_df, "validate")

# Baseline: a random forest fitted on every feature column.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

train_accuracy = round(clf.score(X_train, y_train), 3)
test_accuracy = round(clf.score(X_test, y_test), 3)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

# List the features from most to least important.
print("\nFeature importances:")
zipped_lists = zip(clf.feature_importances_, X_train.columns)
sorted_pairs = sorted(zipped_lists, reverse=True)
for importance, feature_name in sorted_pairs:
    print(f"{round(importance, 2)} {feature_name}")

Train Accuracy: 0.994
Test Accuracy: 0.781

Feature importances:
0.21 Min
0.1 Left_median
0.09 Left_mean
0.08 Median
0.07 Right_skew
0.06 Skew
0.06 Trimmed_mean
0.05 Mean
0.05 Max
0.04 Right_median
0.04 SD
0.04 Left_skew
0.03 Right_mean
0.03 Left_SD
0.02 Right_SD
0.02 Edges
0.01 Is_poisson


In [6]:
def feature_selection(X_train, X_test, y_train, y_test):
    """Search feature subsets and return the best-scoring random forest.

    Two selectors are tried in turn, SequentialFeatureSelector and RFE, each
    built around a RandomForestClassifier(random_state=0). For every subset
    size from 1 up to one less than the number of columns in X_train, the
    selector is fitted on the training data, the forest is fitted on the
    selected training columns, and it is scored on the selected columns of
    X_test. The first candidate that reaches the highest score is kept; its
    accuracy and feature names are printed.

    Returns the fitted classifier of the winning candidate, or None when no
    candidate scored above 0.
    """
    selector_classes = (SequentialFeatureSelector, RFE)
    n_columns = X_train.shape[1]

    best_accuracy = 0
    best_model = None
    best_features = None

    for make_selector in selector_classes:
        for subset_size in range(1, n_columns):
            forest = RandomForestClassifier(random_state=0)
            selector = make_selector(forest, n_features_to_select=subset_size)
            selector.fit(X_train, y_train)

            # Reduce both partitions to the columns the selector kept.
            train_subset = selector.transform(X_train)
            test_subset = selector.transform(X_test)

            forest.fit(train_subset, y_train)
            accuracy = forest.score(test_subset, y_test)

            # Only a strictly better score replaces the current best.
            if not accuracy > best_accuracy:
                continue
            best_accuracy = accuracy
            best_model = forest
            best_features = selector.get_feature_names_out()

    print(f"Best accuracy: {best_accuracy}")
    print("Best features:", best_features)

    # Uncomment to save model
    # with open('best_model.pkl', 'wb') as handle:
    #     pickle.dump(best_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return best_model

Best accuracy: 0.8333333333333334
Best features: ['Median' 'Min' 'Left_mean' 'Left_median' 'Right_skew']


In [12]:
def get_best_model(X_train, X_test, y_train, y_test, retrain=False, model_path='best_model.pkl'):
    """Return the best model, either freshly searched or loaded from disk.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Data forwarded to ``feature_selection``; only used when ``retrain``
        is true.
    retrain : bool, default False
        When true, run ``feature_selection`` and return its result. When
        false, load a previously pickled model instead.
    model_path : str, default 'best_model.pkl'
        Location of the pickled model read when ``retrain`` is false.

    Returns
    -------
    The object returned by ``feature_selection`` or stored in ``model_path``.

    Raises
    ------
    FileNotFoundError
        If ``retrain`` is false and ``model_path`` does not exist.
    """
    if retrain:
        return feature_selection(X_train, X_test, y_train, y_test)

    try:
        handle = open(model_path, 'rb')
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"No saved model at {model_path!r}; pass retrain=True to fit a new one."
        ) from exc

    # SECURITY: unpickling executes code embedded in the file, so only load
    # model files that were produced by this notebook or another trusted source.
    with handle:
        return pickle.load(handle)

In [18]:
# Obtain the best model without retraining (retrain keeps its default of False).
model = get_best_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [17]:
# Names of the features `model` was fitted on. They are copied by hand and
# must be updated whenever the model is retrained with a different subset.
features = ['Median', 'Min', 'Left_mean', 'Left_median', 'Right_skew']

# zip() stops at the shorter input, so a stale name list would silently drop
# importances; fail loudly instead.
importances = model.feature_importances_
if len(importances) != len(features):
    raise ValueError(
        f"model reports {len(importances)} feature importances but "
        f"{len(features)} feature names are listed"
    )

# Print the features from most to least important.
zipped_lists = zip(importances, features)
sorted_pairs = sorted(zipped_lists, reverse=True)
for pair in sorted_pairs:
    print(f"{round(pair[0], 2)} {pair[1]}")

0.36 Min
0.18 Left_median
0.17 Median
0.16 Left_mean
0.14 Right_skew
