# Feature Inter-correlation of Data from RobustScaler
## Load Packages

In [1]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

## Load Data

In [2]:
# Load the pickled (X_train, X_test, y_train, y_test) tuple.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager makes sure the file handle is closed once loading is done.
with open("../pickles/lowthreshold_RobustScaler_20210711_205741.pickle", "rb") as pickle_file:
    X_train, X_test, y_train, y_test = pickle.load(pickle_file)

## Approach 1
Compute the pairwise correlation between all features. Whenever the absolute correlation between two features c1 and c2 exceeds a given threshold, the feature that comes later in the column order is removed.

In [3]:
def correlation1(dataset, threshold):
    """Collect the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataset : object exposing ``.corr()`` (presumably a pandas DataFrame)
        Feature table whose pairwise correlations are evaluated.
    threshold : float
        A column is flagged when the absolute correlation with any column
        to its left is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for row_pos, name in enumerate(corr_matrix.columns):
        # Only the strictly lower triangle is inspected, so each pair is
        # visited once and the later column of the pair is the one flagged.
        if any(abs(corr_matrix.iloc[row_pos, col_pos]) > threshold
               for col_pos in range(row_pos)):
            flagged.add(name)
    return flagged

In [4]:
# Flag every feature whose absolute correlation with an earlier feature exceeds 90%
corr_features1 = correlation1(dataset=X_train, threshold=0.90)

In [5]:
# Remove the flagged columns from both splits; the selection is derived from X_train only.
X_train_app1 = X_train.drop(columns=corr_features1)
X_test_app1 = X_test.drop(columns=corr_features1)

In [6]:
# Report how many features were flagged for removal and how many remain.
n_removed_app1 = len(corr_features1)
n_kept_app1 = X_train_app1.shape[1]
print('Features with high correlation: ', n_removed_app1)
print('Features with low correlation: ', n_kept_app1)

Features with high correlation:  901
Features with low correlation:  471


#### Simple heatmap for the correlation matrix

In [7]:
#plt.figure(figsize=(12,12))
#cor = X_train_app1.corr()
#sns.heatmap(cor)
#plt.show()

## Approach 2
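As in Approach 1, but a feature is only removed if its absolute correlation with an earlier feature that has itself *not* been removed exceeds the given threshold. Example: if c1 & c2 and c2 & c3 are each correlated above the threshold but c1 & c3 are not, c2 is removed while c3 is kept, because its only highly correlated partner (c2) is already gone.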

In [8]:
def correlation2(dataset, threshold):
    """Collect columns highly correlated with an earlier column that is still kept.

    Works like a pairwise scan of the lower triangle of the correlation
    matrix, except that a pair is ignored when the earlier column of the pair
    has already been flagged.

    Parameters
    ----------
    dataset : object exposing ``.corr()`` (presumably a pandas DataFrame)
        Feature table whose pairwise correlations are evaluated.
    threshold : float
        Absolute correlation strictly above this value counts as "high".

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    labels = corr_matrix.columns
    to_drop = set()
    for row_pos in range(len(labels)):
        for col_pos in range(row_pos):
            # A partner that is already flagged does not count against this column.
            if labels[col_pos] in to_drop:
                continue
            if abs(corr_matrix.iloc[row_pos, col_pos]) > threshold:
                to_drop.add(labels[row_pos])
    return to_drop

In [9]:
# Flag features highly correlated (above 90%) with an earlier feature that is still kept
corr_features2 = correlation2(dataset=X_train, threshold=0.9)

In [10]:
# Remove the flagged columns from both splits; the selection is derived from X_train only.
X_train_app2 = X_train.drop(columns=corr_features2)
X_test_app2 = X_test.drop(columns=corr_features2)

In [11]:
# Report how many features were flagged for removal and how many remain.
n_removed_app2 = len(corr_features2)
n_kept_app2 = X_train_app2.shape[1]
print('Features with high correlation: ', n_removed_app2)
print('Features with low correlation: ', n_kept_app2)

Features with high correlation:  836
Features with low correlation:  536


#### Simple heatmap for the correlation matrix

In [12]:
#plt.figure(figsize=(12,12))
#cor = X_train_app2.corr()
#sns.heatmap(cor)
#plt.show()

## Export / Save for Next Working Step
Save the feature sets generated by the two approaches to two separate pickle files.

In [13]:
# Bundle the approach-1 feature sets with the unchanged targets.
exobj_mima = (X_train_app1, X_test_app1, y_train, y_test)
# Timestamp in the file name keeps earlier exports from being overwritten.
timestamp = dt.now().strftime("%Y%m%d_%H%M%S")
filename = '../pickles/intercorr_app1_RobustScaler_%s.pickle' %timestamp
# The context manager flushes and closes the file as soon as the dump finishes.
with open(filename, 'wb') as filehandler:
    pickle.dump(exobj_mima, filehandler)

In [14]:
# Bundle the approach-2 feature sets with the unchanged targets.
exobj_mima = (X_train_app2, X_test_app2, y_train, y_test)
# Timestamp in the file name keeps earlier exports from being overwritten.
timestamp = dt.now().strftime("%Y%m%d_%H%M%S")
filename = '../pickles/intercorr_app2_RobustScaler_%s.pickle' %timestamp
# The context manager flushes and closes the file as soon as the dump finishes.
with open(filename, 'wb') as filehandler:
    pickle.dump(exobj_mima, filehandler)