In [1]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

# Select seaborn's "whitegrid" aesthetic style for plots drawn in this session.
sns.set_style("whitegrid")

In [2]:
# Node structure: maps each node name to the list of feature columns that belong to it.
with open('../data/crop_mapping/node_structure.json', 'r') as file:
    nodes = json.load(file)

# Feature tables for the train and test splits.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/crop_mapping/selection_features_train.csv',
        '../data/crop_mapping/selection_features_test.csv',
    )
)

In [3]:
def num_components_above_variance_threshold(evr, threshold=.95):
    """Return how many leading components are needed to reach ``threshold``.

    Parameters
    ----------
    evr : array-like of float
        Per-component explained-variance ratios, in the order in which the
        components would be kept (e.g. ``PCA.explained_variance_ratio_``).
    threshold : float, default 0.95
        Cumulative explained-variance ratio that must be reached.

    Returns
    -------
    int
        The smallest ``k`` such that the first ``k`` ratios sum to at least
        ``threshold``. If the cumulative sum never reaches ``threshold``
        (a partial ratio vector, or ``threshold=1.0`` with a sum that rounds
        to just below 1), every component is needed and ``len(evr)`` is
        returned.

    Raises
    ------
    ValueError
        If ``evr`` is empty.
    """
    cumulative = np.cumsum(evr)
    if cumulative.size == 0:
        raise ValueError("evr must contain at least one explained-variance ratio")

    reached = cumulative >= threshold
    if not reached.any():
        # np.argmax on an all-False mask returns 0, which would wrongly claim
        # that a single component is enough; keep all components instead.
        return int(cumulative.size)

    # argmax returns the first True position (0-based); +1 converts it to a count.
    return int(1 + np.argmax(reached))

In [4]:
# Cumulative explained-variance ratio that the PCA components kept for each node must reach.
variance_threshold = 0.95

In [5]:
# Node names, in the order they appear in the node-structure mapping.
node_list = list(nodes)

In [6]:
# Run a separate PCA on the features of each node, so that correlated features
# belonging to the same node are replaced by a smaller set of components.

reduced_nodes = {}
X_train_with_node_PCA = []
X_test_with_node_PCA = []

for node in node_list:
    features_in_node = nodes[node]
    X_node_train, X_node_test = X_train[features_in_node], X_test[features_in_node]

    if len(features_in_node) != 1:
        # Fit on the training split only; the test split is projected with
        # the same fitted components.
        PCA_node = PCA()
        PCA_node.fit(X_node_train)

        # Number of leading components to keep for this node.
        k = num_components_above_variance_threshold(
            PCA_node.explained_variance_ratio_, variance_threshold
        )

        reduced_node_train, reduced_node_test = (
            PCA_node.transform(split)[:, :k] for split in (X_node_train, X_node_test)
        )
        reduced_node_labels = [f'{node}_{i}' for i in range(k)]
    else:
        # A single-feature node has nothing to decorrelate: pass it through as-is.
        reduced_node_train = X_node_train.values
        reduced_node_test = X_node_test.values
        reduced_node_labels = [f'{node}_0']

    # Remember which output columns came from which node.
    reduced_nodes[node] = reduced_node_labels

    train_frame = pd.DataFrame(reduced_node_train, columns=reduced_node_labels)
    test_frame = pd.DataFrame(reduced_node_test, columns=reduced_node_labels)
    X_train_with_node_PCA.append(train_frame.reset_index(drop=True))
    X_test_with_node_PCA.append(test_frame.reset_index(drop=True))

In [7]:
# Join the per-node frames side by side into one wide frame per split.
# NOTE: both names are rebound here from a list of frames to a single DataFrame,
# so re-running this cell on its own would hand pd.concat a DataFrame, not a list.
X_train_with_node_PCA = pd.concat(X_train_with_node_PCA, axis='columns')
X_test_with_node_PCA = pd.concat(X_test_with_node_PCA, axis='columns')

In [8]:
reduced_nodes

{'sig': ['sig_0', 'sig_1'],
 'R': ['R_0', 'R_1'],
 'Ro': ['Ro_0', 'Ro_1', 'Ro_2'],
 'L': ['L_0', 'L_1', 'L_2'],
 'HA': ['HA_0', 'HA_1', 'HA_2'],
 'PH': ['PH_0'],
 'rvi': ['rvi_0'],
 'paul': ['paul_0', 'paul_1', 'paul_2'],
 'krog': ['krog_0', 'krog_1'],
 'free': ['free_0'],
 'yam': ['yam_0'],
 'em': ['em_0', 'em_1']}

In [9]:
X_train_with_node_PCA.head()

Unnamed: 0,sig_0,sig_1,R_0,R_1,Ro_0,Ro_1,Ro_2,L_0,L_1,L_2,...,rvi_0,paul_0,paul_1,paul_2,krog_0,krog_1,free_0,yam_0,em_0,em_1
0,4.76225,1.629869,-3.576126,4.708064,3.269934,-0.321053,-0.131631,6.775365,-2.462814,-4.539149,...,-0.961749,5.196567,1.062489,-0.156334,6.010227,1.242568,-0.469461,2.235002,0.118294,-2.317249
1,4.253752,1.227837,-2.500557,3.382256,2.383808,-0.598302,-0.156237,6.195942,-0.871581,-2.457755,...,-0.161115,4.505555,0.652685,-0.201317,5.043893,0.664518,5.31471,4.12027,1.19131,-0.318678
2,3.877229,1.240065,-2.696121,3.861151,2.495919,-0.518506,-0.19413,5.262315,-0.799443,-2.360562,...,-0.204504,4.182742,0.665014,-0.102231,4.318128,0.629425,4.522716,3.44273,3.40527,1.304889
3,4.832597,1.285339,-2.522085,4.404359,2.732545,-0.466695,-0.118784,7.473929,-1.097001,-3.238627,...,-0.208784,5.153667,0.649562,-0.067276,5.262868,0.695637,-0.048994,4.433876,-0.062266,-1.089974
4,3.326296,1.214561,-2.808012,3.793767,2.367929,-0.527819,-0.212071,4.191452,-0.63984,-1.920763,...,-0.189894,3.642212,0.538297,-0.115452,3.643771,0.600347,3.684085,2.753547,0.949429,-0.406914


In [10]:
# Save the node -> reduced-column-name mapping next to the reduced feature tables.
with open('../data/crop_mapping/reduced_nodes.json', 'w') as file:
    json.dump(reduced_nodes, file)

for reduced_frame, csv_path in (
    (X_train_with_node_PCA, '../data/crop_mapping/node_reduced_train.csv'),
    (X_test_with_node_PCA, '../data/crop_mapping/node_reduced_test.csv'),
):
    # index=False: the row index is not written out as a column.
    reduced_frame.to_csv(csv_path, mode='w', index=False)