In [1]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

# Switch seaborn to its "whitegrid" style for any figures drawn after this point.
sns.set_style("whitegrid")

In [2]:
# Node name -> list of feature columns; the per-node PCA loop below indexes
# both feature tables with these lists.
with open('../data/crop_mapping/node_selection.json', 'r') as file:
    nodes = json.load(file)

# Train and test feature tables, read in that order.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/crop_mapping/transformed_selection_train.csv',
        '../data/crop_mapping/transformed_selection_test.csv',
    )
)

In [3]:
def num_components_above_variance_threshold(evr, threshold=.95):
    """Return how many leading components are needed to reach ``threshold``.

    Parameters
    ----------
    evr : array-like of float
        Per-component explained-variance ratios, in the order the components
        should be kept (e.g. ``PCA.explained_variance_ratio_``).
    threshold : float, default 0.95
        Cumulative explained-variance ratio that must be reached.

    Returns
    -------
    int
        The smallest ``k`` such that the first ``k`` ratios sum to at least
        ``threshold``. If the cumulative sum never reaches ``threshold``
        (threshold above the achievable total, or a total that lands just
        under it because of floating-point rounding), every component is
        needed, so ``len(evr)`` is returned. An empty ``evr`` gives ``0``.
    """
    cumulative = np.cumsum(evr)
    reached = cumulative >= threshold

    # np.argmax on an all-False mask returns 0, which would wrongly report
    # that a single component is enough. Keep all components instead.
    if not reached.any():
        return int(cumulative.size)

    # argmax gives the index of the first True; +1 turns it into a count.
    return int(1 + np.argmax(reached))

In [4]:
# Cumulative explained-variance ratio each node's PCA must reach; passed to
# num_components_above_variance_threshold in the per-node PCA loop.
variance_threshold = 0.95

In [6]:
# Node names in the order they appear in the `nodes` mapping; this order
# drives the per-node PCA loop and hence the output column order.
node_list = list(nodes)

In [7]:
# Run a separate PCA on the features of each node to reduce correlations among
# features at the same node. Each PCA is fitted on the training rows only and
# then applied to both train and test; only the leading components needed to
# reach `variance_threshold` are kept.

reduced_nodes = {}            # node name -> labels of the columns kept for it
X_train_with_node_PCA = []    # one DataFrame per node, concatenated in the next cell
X_test_with_node_PCA = []

for node in node_list:
    node_features = nodes[node]
    train_block = X_train[node_features]
    test_block = X_test[node_features]

    if len(node_features) != 1:
        node_pca = PCA().fit(train_block)
        n_keep = num_components_above_variance_threshold(
            node_pca.explained_variance_ratio_, variance_threshold
        )
        # Project onto all components, then keep only the first n_keep.
        train_scores = node_pca.transform(train_block)[:, :n_keep]
        test_scores = node_pca.transform(test_block)[:, :n_keep]
    else:
        # A single-feature node is passed through unchanged (no PCA is fitted).
        n_keep = 1
        train_scores = train_block.values
        test_scores = test_block.values

    # Output columns are named "<node>_<component index>".
    component_labels = [f"{node}_{i}" for i in range(n_keep)]
    reduced_nodes[node] = component_labels

    for collected, scores in (
        (X_train_with_node_PCA, train_scores),
        (X_test_with_node_PCA, test_scores),
    ):
        collected.append(
            pd.DataFrame(scores, columns=component_labels).reset_index(drop=True)
        )

In [8]:
# Join the per-node frames side by side into one table per split.
# NOTE: this rebinds each name from a list of DataFrames to a single DataFrame,
# so re-run the per-node PCA loop cell before running this cell again.
X_train_with_node_PCA = pd.concat(X_train_with_node_PCA, axis="columns")
X_test_with_node_PCA = pd.concat(X_test_with_node_PCA, axis="columns")

In [9]:
# Display the component labels kept for each node.
reduced_nodes

{'sig': ['sig_0', 'sig_1'],
 'R': ['R_0', 'R_1'],
 'L': ['L_0', 'L_1', 'L_2'],
 'HA': ['HA_0', 'HA_1', 'HA_2'],
 'PH': ['PH_0'],
 'rvi': ['rvi_0'],
 'paul': ['paul_0', 'paul_1', 'paul_2'],
 'krog': ['krog_0', 'krog_1'],
 'free': ['free_0'],
 'yam': ['yam_0'],
 'RGB': ['RGB_0', 'RGB_1']}

In [10]:
# Preview the first rows of the concatenated, node-reduced training table.
X_train_with_node_PCA.head()

Unnamed: 0,sig_0,sig_1,R_0,R_1,L_0,L_1,L_2,HA_0,HA_1,HA_2,...,rvi_0,paul_0,paul_1,paul_2,krog_0,krog_1,free_0,yam_0,RGB_0,RGB_1
0,-0.696158,0.004994,0.164055,1.067401,-0.573103,-0.211538,-0.890728,1.38757,0.495203,-0.251052,...,0.230854,-0.601696,0.534874,0.452114,-0.799636,-0.556955,-1.127236,-0.978018,2.066715,-0.054433
1,0.123222,0.56757,-1.473724,-1.008096,-0.045235,-0.953021,0.303714,0.430033,-1.471583,-0.20784,...,-1.48477,0.134432,0.31195,-0.691853,0.309349,-0.03255,0.577051,0.381695,-2.087932,-0.868236
2,-0.626803,0.43226,-0.571688,1.490873,-0.530539,-0.260143,-0.30766,0.732904,0.003993,-0.132352,...,-0.122128,-0.478201,0.281149,0.132394,-0.484098,-0.368088,-0.427839,-0.58251,2.656319,-0.439936
3,0.663752,0.336514,-0.407404,0.409608,0.334506,-0.176541,0.586059,-0.575513,-0.762185,0.473586,...,-0.428757,0.665013,-0.144635,-0.203245,0.583456,0.189226,0.886353,0.612481,-1.353442,-0.302973
4,-3.342546,0.671346,-1.888454,0.979551,-3.011673,0.039472,-0.115253,0.902156,-0.452186,0.003633,...,-0.552566,-3.120205,0.557752,-0.08327,-2.136196,-0.471693,-1.649366,-1.949291,2.793634,0.006933


In [11]:
# Save the node -> component-label mapping as JSON.
with open('../data/crop_mapping/reduced_nodes.json', 'w') as file:
    json.dump(reduced_nodes, file)

# Save the reduced train/test tables as CSV without the row index.
for reduced_frame, csv_path in (
    (X_train_with_node_PCA, '../data/crop_mapping/node_reduced_train.csv'),
    (X_test_with_node_PCA, '../data/crop_mapping/node_reduced_test.csv'),
):
    reduced_frame.to_csv(csv_path, mode='w', index=False)