In [1]:
import json

import pandas as pd
import numpy as np

from pca_utils import *

from sklearn.decomposition import PCA

In [2]:
# Node structure: a JSON object keyed by node name; each value is later used
# to select that node's feature columns from the train/test tables.
with open('../data/crop_mapping/node_structure.json', 'r') as node_file:
    nodes = json.load(node_file)

# Feature tables for the train and test splits.
X_train = pd.read_csv('../data/crop_mapping/selection_features_train.csv')
X_test = pd.read_csv('../data/crop_mapping/selection_features_test.csv')

In [3]:
# Run PCA independently on the feature group of every node, then stitch the
# per-node components back together column-wise.
node_list = list(nodes.keys())

reduced_nodes = {}   # node name -> names of the component columns kept for it
train_parts = []     # per-node component frames (train split)
test_parts = []      # per-node component frames (test split)

for node in node_list:
    node_columns = nodes[node]

    # top_principal_components is provided by pca_utils.
    # NOTE(review): use_threshold=True presumably picks the number of
    # components from a variance threshold -- confirm in pca_utils.
    train_components, test_components = top_principal_components(
        X_train[node_columns],
        X_test[node_columns],
        name=node,
        use_threshold=True,
    )

    train_parts.append(train_components)
    test_parts.append(test_components)
    reduced_nodes[node] = list(train_components.columns)

# Column-wise concatenation: one block of component columns per node.
X_nodewise_PCA_train = pd.concat(train_parts, axis=1)
X_nodewise_PCA_test = pd.concat(test_parts, axis=1)

In [4]:
# Show which component columns were kept for each node.
reduced_nodes

{'sig': ['sig_0', 'sig_1'],
 'R': ['R_0', 'R_1'],
 'Ro': ['Ro_0', 'Ro_1', 'Ro_2'],
 'L': ['L_0', 'L_1', 'L_2'],
 'HA': ['HA_0', 'HA_1', 'HA_2'],
 'PH': ['PH_0'],
 'rvi': ['rvi_0'],
 'paul': ['paul_0', 'paul_1', 'paul_2'],
 'krog': ['krog_0', 'krog_1'],
 'free': ['free_0'],
 'yam': ['yam_0']}

In [5]:
# Preview the first rows of the concatenated per-node components (train split).
X_nodewise_PCA_train.head()

Unnamed: 0,sig_0,sig_1,R_0,R_1,Ro_0,Ro_1,Ro_2,L_0,L_1,L_2,...,HA_2,PH_0,rvi_0,paul_0,paul_1,paul_2,krog_0,krog_1,free_0,yam_0
0,3.33814,1.264752,-2.756998,1.648688,2.280445,-0.194301,-0.038048,4.258447,-1.852446,-0.824095,...,0.894321,-1.001813,-0.88785,3.508173,0.559003,-0.581387,3.980456,0.354577,3.946998,3.320197
1,-0.573794,-1.456842,4.027465,-0.471639,-0.773188,-1.444647,-0.413415,-0.545994,0.973368,-0.001078,...,0.758091,1.843733,1.857908,-0.4266,0.181912,0.820247,0.39747,0.462225,-0.217378,-0.300503
2,0.65983,0.630135,-1.805483,1.449285,1.005649,0.297449,0.146115,0.335708,-0.392851,0.031351,...,-0.478713,-0.500121,-0.463519,0.719431,-0.231342,-0.200081,0.633747,-0.087007,0.492777,0.368868
3,-1.606204,-1.128917,2.707511,0.697302,-1.759648,-1.17278,0.056862,-1.412963,0.927668,-0.091128,...,-0.066118,1.897357,1.969222,-1.593464,-0.519082,0.844251,-0.923249,0.116404,-0.856274,-1.002239
4,2.56688,1.133864,-2.783028,3.291489,1.974122,-0.377885,-0.23732,3.010317,-0.650924,-0.822453,...,1.234373,-0.464315,-0.235398,2.851342,0.507857,-0.133315,2.6397,0.634457,2.126375,1.440915


In [6]:
# Persist the node -> component-column mapping.
with open('../data/crop_mapping/reduced_nodes.json', 'w') as mapping_file:
    json.dump(reduced_nodes, mapping_file)

# Persist the reduced feature tables; the row index is not written.
for components, destination in (
    (X_nodewise_PCA_train, '../data/crop_mapping/nodewise_PCA_train.csv'),
    (X_nodewise_PCA_test, '../data/crop_mapping/nodewise_PCA_test.csv'),
):
    components.to_csv(destination, mode='w', index=False)