In [None]:
### This project was tested using both hierarchical clustering and Random Forest Regression for
### selecting the sensors used in prediction (dimensionality reduction); the tests conducted with
### Random Forest Regression achieved significantly better results.

import pandas as pd
import pickle
from sklearn import preprocessing
import numpy as np

In [None]:
# Name of the input CSV file.
file = 'interpolated_no_na_no_noise.csv' # Constructed in sensor_cleaning.py

In [None]:
def normalize(df):
    """Rescale every column of ``df`` with scikit-learn's ``MinMaxScaler``.

    A fresh scaler with default settings is fitted on the frame's values and
    immediately used to transform them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be scaled.

    Returns
    -------
    pandas.DataFrame
        New frame holding the scaled values, carrying the same index and
        column labels as ``df``.
    """
    column_labels = df.columns.values
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(df[column_labels].values)
    return pd.DataFrame(data=scaled_values, index=df.index, columns=column_labels)

In [None]:
def calculate_euclidean_distance_between_columns(col1_values, col2_values):
    """Return the Euclidean (L2) distance between two equally shaped columns.

    The previous implementation called ``np.linalg.norm`` on every scalar
    difference, which is just ``abs()``, and summed the results -- i.e. it
    returned the Manhattan (L1) distance. The norm is now taken once over
    the whole difference vector, which is the Euclidean distance the
    function name promises, and avoids a Python-level loop per element.

    Parameters
    ----------
    col1_values, col2_values : array-like of numbers
        The two columns to compare. They must have the same shape.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((col1_values - col2_values) ** 2))``; ``0.0`` for empty
        input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Cast to float up front so integer / unsigned inputs cannot wrap around
    # when subtracted.
    first = np.asarray(col1_values, dtype=float)
    second = np.asarray(col2_values, dtype=float)

    # Fail loudly instead of silently truncating or broadcasting.
    if first.shape != second.shape:
        raise ValueError(
            'Columns must have the same shape, got %s and %s'
            % (first.shape, second.shape)
        )

    return np.linalg.norm(first - second)

In [None]:
def calculate_euclidean(df_normalized, start_index=0, similarity_matrix=False,
                        checkpoint_path='temp_similarity_matrix.pkl'):
    """Build the pairwise column-distance matrix of ``df_normalized``.

    For every column from ``start_index`` onwards, the distance to *every*
    column of the frame is computed with
    ``calculate_euclidean_distance_between_columns`` and appended as one row.
    Note that, despite the "similarity" naming, the stored values are
    distances.

    After each row the matrix built so far is pickled to ``checkpoint_path``
    so that an interrupted run can be resumed by loading that file and
    passing it back in as ``similarity_matrix``.

    Parameters
    ----------
    df_normalized : pandas.DataFrame
        Frame whose columns are compared with each other.
    start_index : int, optional
        Position of the first column to process. Ignored when a non-empty
        ``similarity_matrix`` is supplied.
    similarity_matrix : list of list, optional
        Rows computed by a previous run. When non-empty, processing resumes
        at column ``len(similarity_matrix)`` and new rows are appended to
        this list in place. The default (``False``) starts a new matrix.
    checkpoint_path : str or None, optional
        File the partial matrix is pickled to after each row. ``None``
        disables checkpointing.

    Returns
    -------
    list of list
        One row per processed column, each holding the distances to all
        columns of ``df_normalized``.
    """
    cols = df_normalized.columns.values
    if similarity_matrix:  # Check whether a (partial) similarity matrix was supplied
        # Always resume right after the rows that already exist. Previously a
        # complete matrix left start_index at its default, so every row was
        # recomputed and appended a second time.
        start_index = len(similarity_matrix)
    else:
        similarity_matrix = []

    for i, main_col in enumerate(cols[start_index:]):

        print(str(i+1) + '. ' + 'Finding similarities for column ', main_col)

        # Loop-invariant: fetch the reference column once per row.
        main_col_values = df_normalized[main_col].values

        similarity_measures_for_col = [
            calculate_euclidean_distance_between_columns(
                main_col_values, df_normalized[col_to_compare].values
            )
            for col_to_compare in cols
        ]

        similarity_matrix.append(similarity_measures_for_col)
        print(similarity_measures_for_col)

        # Checkpoint the partial result so a long run can be resumed.
        if checkpoint_path is not None:
            with open(checkpoint_path, 'wb') as f:
                pickle.dump(similarity_matrix, f, pickle.HIGHEST_PROTOCOL)

    return similarity_matrix

In [None]:
# Load the input CSV named by `file` (semicolon-separated, first column used
# as the index) instead of repeating the filename literal here.
df = pd.read_csv(file, index_col=0, sep=";")
df_normalized = normalize(df)

# Compute the full pairwise column-distance matrix.
euclidean_similarity_matrix = calculate_euclidean(df_normalized)

# Persist the finished matrix.
with open('euclidean_similarity_matrix.pkl', 'wb') as f:
    pickle.dump(euclidean_similarity_matrix, f, pickle.HIGHEST_PROTOCOL)