##### This notebook applies feature elimination to the preprocessed dataframes (hourly and daily).

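More specifically, features are eliminated using **correlation analysis**: highly correlated feature pairs are identified, and the variance inflation factor (VIF) of each feature in a pair is computed to inform which one to drop.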

In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

Load the hourly preprocessed dataframe

In [2]:
# Load the hourly preprocessed training frame from the preprocessing_temps folder.
# Note: pickle files can execute arbitrary code when loaded; only read trusted files.
hourly_input_path = '../data/preprocessing_temps/preprocessed_training_df_hourly.pkl'
training_df = pd.read_pickle(hourly_input_path)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,distance,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,WORK/SCHOOL,badges,exercises,exercise_duration,is_weekend,is_holiday,day_sin,hour_sin,day_cos,hour_cos
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,0.810469,0.622928,0.0,0.029382,0.008276,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.500000,0.574623,1.000000
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,0.810469,0.622928,0.0,0.002914,0.000000,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.629410,0.574623,0.982963
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,0.810469,0.622928,0.0,0.000729,0.021869,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.750000,0.574623,0.933013
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,0.810469,0.622928,0.0,0.012860,0.001254,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.853553,0.574623,0.853553
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,0.810469,0.622928,0.0,0.003315,0.000000,0.254701,0.083045,0.495139,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.933013,0.574623,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.982963,0.020417,0.370590
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.933013,0.020417,0.250000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.853553,0.020417,0.146447
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,0.680095,0.720932,0.0,0.000383,0.027588,0.000000,0.000000,0.490278,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.750000,0.020417,0.066987


##### Correlation Analysis

Compute the correlation matrix of all the training set features and find feature pairs with correlation higher than 0.9

In [3]:
# compute the correlation matrix of every feature column (identifier and timestamp excluded)
correlation_analysis_df = training_df.drop(columns=['id', 'date'])
correlation_matrix = correlation_analysis_df.corr()

# Find column pairs whose correlation is stronger than the threshold in either
# direction: a correlation of -0.95 signals the same redundancy as +0.95, so the
# absolute value is compared. `col1 < col2` keeps each unordered pair exactly once
# and skips the diagonal. Iterating the matrix's own columns guarantees that every
# `.loc` lookup refers to a column that `corr()` actually produced.
correlation_threshold = 0.9
high_correlation_pairs = []
for col1 in correlation_matrix.columns:
    for col2 in correlation_matrix.columns:
        if col1 < col2 and abs(correlation_matrix.loc[col1, col2]) > correlation_threshold:
            high_correlation_pairs.append((col1, col2))

# print the signed correlation so the direction of the relationship stays visible
for col1, col2 in high_correlation_pairs:
    print(col1, "-", col2, " --> correlation", correlation_matrix.loc[col1, col2])

distance - steps  --> correlation 0.9447563780490101


Calculate the VIF of both features in each highly correlated pair and report which of the two has the higher VIF

In [4]:
# The design matrix and its column index are the same for every pair, so build them once.
# NOTE(review): no intercept column is added to the matrix passed to
# variance_inflation_factor — confirm that this is intended.
feature_matrix = correlation_analysis_df.values
feature_columns = correlation_analysis_df.columns

for col1, col2 in high_correlation_pairs:
    vif1 = variance_inflation_factor(feature_matrix, feature_columns.get_loc(col1))
    vif2 = variance_inflation_factor(feature_matrix, feature_columns.get_loc(col2))
    # Order the two features so the one with the strictly higher VIF is reported first;
    # on a tie, col2 is reported as the higher one.
    if vif1 > vif2:
        higher, lower = (col1, vif1), (col2, vif2)
    else:
        higher, lower = (col2, vif2), (col1, vif1)
    print(higher[0], " --> the higher VIF:", higher[1], "while", lower[0], "--> VIF:", lower[1])

steps  --> the higher VIF: 18.249657784990216 while distance --> VIF: 16.0668799519794


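Drop one feature from each highly correlated pair. Note: the cell below drops `distance`, although the output above reports `steps` as the feature with the higher VIF.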

In [5]:
# Remove one member of the highly correlated distance/steps pair.
# NOTE(review): the VIF output above reports `steps` as the higher-VIF feature,
# yet `distance` is the column removed here — confirm which one is intended.
training_df = training_df.drop(columns=['distance'])
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,calories,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,steps,...,WORK/SCHOOL,badges,exercises,exercise_duration,is_weekend,is_holiday,day_sin,hour_sin,day_cos,hour_cos
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,0.810469,0.622928,0.0,0.029382,0.254701,0.083045,0.495139,0.017563,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.500000,0.574623,1.000000
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,0.810469,0.622928,0.0,0.002914,0.254701,0.083045,0.495139,0.000000,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.629410,0.574623,0.982963
2,621e2e8e67b776a24055b564,2021-05-24 02:00:00,0.810469,0.622928,0.0,0.000729,0.254701,0.083045,0.495139,0.046184,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.750000,0.574623,0.933013
3,621e2e8e67b776a24055b564,2021-05-24 03:00:00,0.810469,0.622928,0.0,0.012860,0.254701,0.083045,0.495139,0.002661,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.853553,0.574623,0.853553
4,621e2e8e67b776a24055b564,2021-05-24 04:00:00,0.810469,0.622928,0.0,0.003315,0.254701,0.083045,0.495139,0.000000,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.933013,0.574623,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,2021-08-17 07:00:00,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.982963,0.020417,0.370590
159922,621e375b67b776a240290cdc,2021-08-17 08:00:00,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.933013,0.020417,0.250000
159923,621e375b67b776a240290cdc,2021-08-17 09:00:00,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.853553,0.020417,0.146447
159924,621e375b67b776a240290cdc,2021-08-17 10:00:00,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.750000,0.020417,0.066987


In [6]:
# Persist the reduced hourly frame to the clustering_input folder.
hourly_output_path = '../data/clustering_input/clustering_df_hourly_full.pkl'
training_df.to_pickle(hourly_output_path)

Load the daily preprocessed dataframe

In [6]:
# Load the daily preprocessed training frame; this rebinds `training_df`,
# replacing the hourly frame used in the cells above.
# Note: pickle files can execute arbitrary code when loaded; only read trusted files.
daily_input_path = '../data/preprocessing_temps/preprocessed_training_df_daily.pkl'
training_df = pd.read_pickle(daily_input_path)
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,badges,calories,distance,lightly_active_minutes,moderately_active_minutes,...,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,is_weekend,is_holiday,day_sin,hour_sin,day_cos,hour_cos
0,621e2e8e67b776a24055b564,2021-05-24,0.809989,0.623032,0.065476,0.000000,0.280325,0.218337,0.254701,0.083045,...,0.0,0.0,0.0,0.0,0.0,0.0,0.005131,0.0,5.746227e-01,0.0
1,621e2e8e67b776a24055b564,2021-05-25,0.809989,0.681027,0.074405,0.000000,0.277999,0.240483,0.225641,0.086505,...,0.0,0.0,0.0,1.0,0.0,0.0,0.030521,0.0,6.728133e-01,0.0
2,621e2e8e67b776a24055b564,2021-05-25,0.809989,0.681027,0.074405,0.000000,0.277999,0.240483,0.225641,0.086505,...,0.0,1.0,0.0,0.0,0.0,0.0,0.030521,0.0,6.728133e-01,0.0
3,621e2e8e67b776a24055b564,2021-05-26,0.961998,0.681027,0.089286,0.000000,0.269678,0.204045,0.191453,0.093426,...,0.0,1.0,0.0,0.0,0.0,0.0,0.075133,0.0,7.638763e-01,0.0
4,621e2e8e67b776a24055b564,2021-05-27,0.923996,0.681027,0.071429,0.076923,0.277166,0.222879,0.227350,0.072664,...,0.0,0.0,0.0,1.0,0.0,0.0,0.137138,0.0,8.440835e-01,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8392,621e375b67b776a240290cdc,2021-08-13,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.742963,0.0,6.041634e-02,0.0
8393,621e375b67b776a240290cdc,2021-08-14,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.649874,0.0,2.041741e-02,0.0
8394,621e375b67b776a240290cdc,2021-08-15,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,1.0,0.550649,0.0,5.551115e-17,0.0
8395,621e375b67b776a240290cdc,2021-08-16,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.449351,0.0,0.000000e+00,0.0


Check whether any columns contain only one unique value, and drop them since they carry no information

In [7]:
# Count the distinct (non-null) values of every column in one pass, then keep the
# names of the columns that hold exactly one value; print them and drop them.
unique_counts = training_df.nunique()
single_value_columns = unique_counts[unique_counts == 1].index.tolist()
print("These columns contain one single value: ", single_value_columns)
training_df = training_df.drop(columns=single_value_columns)
training_df

These columns contain one single value:  ['hour_sin', 'hour_cos']


Unnamed: 0,id,date,sleep_points,exertion_points,altitude,badges,calories,distance,lightly_active_minutes,moderately_active_minutes,...,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,is_weekend,is_holiday,day_sin,day_cos
0,621e2e8e67b776a24055b564,2021-05-24,0.809989,0.623032,0.065476,0.000000,0.280325,0.218337,0.254701,0.083045,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005131,5.746227e-01
1,621e2e8e67b776a24055b564,2021-05-25,0.809989,0.681027,0.074405,0.000000,0.277999,0.240483,0.225641,0.086505,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.030521,6.728133e-01
2,621e2e8e67b776a24055b564,2021-05-25,0.809989,0.681027,0.074405,0.000000,0.277999,0.240483,0.225641,0.086505,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.030521,6.728133e-01
3,621e2e8e67b776a24055b564,2021-05-26,0.961998,0.681027,0.089286,0.000000,0.269678,0.204045,0.191453,0.093426,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.075133,7.638763e-01
4,621e2e8e67b776a24055b564,2021-05-27,0.923996,0.681027,0.071429,0.076923,0.277166,0.222879,0.227350,0.072664,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.137138,8.440835e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8392,621e375b67b776a240290cdc,2021-08-13,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.742963,6.041634e-02
8393,621e375b67b776a240290cdc,2021-08-14,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.649874,2.041741e-02
8394,621e375b67b776a240290cdc,2021-08-15,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.550649,5.551115e-17
8395,621e375b67b776a240290cdc,2021-08-16,0.681236,0.724307,0.000000,0.000000,0.154455,0.251051,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449351,0.000000e+00


Compute the correlation matrix of all the training set features and find feature pairs with correlation higher than 0.9

In [8]:
# compute the correlation matrix of every feature column (identifier and timestamp excluded)
correlation_analysis_df = training_df.drop(columns=['id', 'date'])
correlation_matrix = correlation_analysis_df.corr()

# Find column pairs whose correlation is stronger than the threshold in either
# direction: a correlation of -0.95 signals the same redundancy as +0.95, so the
# absolute value is compared. `col1 < col2` keeps each unordered pair exactly once
# and skips the diagonal. Iterating the matrix's own columns guarantees that every
# `.loc` lookup refers to a column that `corr()` actually produced.
correlation_threshold = 0.9
high_correlation_pairs = []
for col1 in correlation_matrix.columns:
    for col2 in correlation_matrix.columns:
        if col1 < col2 and abs(correlation_matrix.loc[col1, col2]) > correlation_threshold:
            high_correlation_pairs.append((col1, col2))

# print the signed correlation so the direction of the relationship stays visible
for col1, col2 in high_correlation_pairs:
    print(col1, "-", col2, " --> correlation", correlation_matrix.loc[col1, col2])

distance - steps  --> correlation 0.9843966323888864


Calculate the VIF of both features in each highly correlated pair and report which of the two has the higher VIF

In [9]:
# The design matrix and its column index are the same for every pair, so build them once.
# NOTE(review): no intercept column is added to the matrix passed to
# variance_inflation_factor — confirm that this is intended.
feature_matrix = correlation_analysis_df.values
feature_columns = correlation_analysis_df.columns

for col1, col2 in high_correlation_pairs:
    vif1 = variance_inflation_factor(feature_matrix, feature_columns.get_loc(col1))
    vif2 = variance_inflation_factor(feature_matrix, feature_columns.get_loc(col2))
    # Order the two features so the one with the strictly higher VIF is reported first;
    # on a tie, col2 is reported as the higher one.
    if vif1 > vif2:
        higher, lower = (col1, vif1), (col2, vif2)
    else:
        higher, lower = (col2, vif2), (col1, vif1)
    print(higher[0], " --> the higher VIF:", higher[1], "while", lower[0], "--> VIF:", lower[1])

steps  --> the higher VIF: 179.32008117020973 while distance --> VIF: 166.24683902487723


In [10]:
# Remove one member of the highly correlated distance/steps pair.
# NOTE(review): the VIF output above reports `steps` as the higher-VIF feature,
# yet `distance` is the column removed here — confirm which one is intended.
training_df = training_df.drop(columns=['distance'])
training_df

Unnamed: 0,id,date,sleep_points,exertion_points,altitude,badges,calories,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,...,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL,is_weekend,is_holiday,day_sin,day_cos
0,621e2e8e67b776a24055b564,2021-05-24,0.809989,0.623032,0.065476,0.000000,0.280325,0.254701,0.083045,0.495139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005131,5.746227e-01
1,621e2e8e67b776a24055b564,2021-05-25,0.809989,0.681027,0.074405,0.000000,0.277999,0.225641,0.086505,0.488889,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.030521,6.728133e-01
2,621e2e8e67b776a24055b564,2021-05-25,0.809989,0.681027,0.074405,0.000000,0.277999,0.225641,0.086505,0.488889,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.030521,6.728133e-01
3,621e2e8e67b776a24055b564,2021-05-26,0.961998,0.681027,0.089286,0.000000,0.269678,0.191453,0.093426,0.493056,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.075133,7.638763e-01
4,621e2e8e67b776a24055b564,2021-05-27,0.923996,0.681027,0.071429,0.076923,0.277166,0.227350,0.072664,0.431944,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.137138,8.440835e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8392,621e375b67b776a240290cdc,2021-08-13,0.681236,0.724307,0.000000,0.000000,0.154455,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.742963,6.041634e-02
8393,621e375b67b776a240290cdc,2021-08-14,0.681236,0.724307,0.000000,0.000000,0.154455,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.649874,2.041741e-02
8394,621e375b67b776a240290cdc,2021-08-15,0.681236,0.724307,0.000000,0.000000,0.154455,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.550649,5.551115e-17
8395,621e375b67b776a240290cdc,2021-08-16,0.681236,0.724307,0.000000,0.000000,0.154455,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.449351,0.000000e+00


In [None]:
# Persist the reduced daily frame to the clustering_input folder.
daily_output_path = '../data/clustering_input/clustering_df_daily_full.pkl'
training_df.to_pickle(daily_output_path)