In [1]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

# Apply seaborn's "whitegrid" style globally to any figures drawn in this notebook.
sns.set_style("whitegrid")

In [2]:
# Read the train and test feature tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/crop_mapping/Winnipeg_selection_transformed_train.csv',
        '../data/crop_mapping/Winnipeg_selection_transformed_test.csv',
    )
)

# Node definitions: a JSON object mapping each node name to the list of
# feature columns that belong to it (a plain dict, despite the df_ prefix).
with open('../data/crop_mapping/Winnipeg_selection_nodes.json', 'r') as nodes_file:
    df_nodes = json.loads(nodes_file.read())

In [3]:
def num_components_above_variance_threshold(evr, threshold=.95):
    """Return how many leading components are needed to reach ``threshold``.

    Parameters
    ----------
    evr : array-like of float
        Per-component explained-variance ratios, in the order the components
        should be kept (e.g. ``PCA.explained_variance_ratio_``).
    threshold : float, default 0.95
        Cumulative explained-variance ratio that must be reached.

    Returns
    -------
    int
        The smallest ``k`` such that the first ``k`` ratios sum to at least
        ``threshold``. If the cumulative sum never reaches ``threshold``
        (for instance ``threshold=1.0`` with a floating-point sum of
        0.9999...), all components are returned.

    Raises
    ------
    ValueError
        If ``evr`` is empty.
    """
    cumulative = np.cumsum(evr)
    if cumulative.size == 0:
        raise ValueError("evr must contain at least one explained-variance ratio")

    reached = cumulative >= threshold
    # np.argmax on an all-False mask returns 0, which would wrongly report a
    # single component; in that case the only sensible answer is "keep all".
    if not reached.any():
        return int(cumulative.size)

    # argmax yields the index of the first True; +1 converts index -> count.
    return int(1 + np.argmax(reached))

In [4]:
# Cumulative explained-variance ratio each node's retained principal
# components must reach; passed to num_components_above_variance_threshold.
variance_threshold = 0.95

In [5]:
# Preview the first rows of the training table.
df_train.head()

Unnamed: 0,class,sigHH,sigHV,sigVV,Rhhvv,Rhvhh,Rhvvv,l1_lg,l2_lg,l3_lg,...,paulbeta,paulgamma,krogks,krogkd,freeodd_lg,yamodd_lg,G_lg,R_lg,Redge_lg,NIR
0,6,-0.196969,0.557888,-1.374813,1.441568,1.354012,1.95714,-0.702337,-0.88583,0.91366,...,-0.915217,0.557888,-0.704815,-0.691764,-0.90151,-0.990055,-2.216052,-1.068417,-1.758435,1.137511
1,4,-0.294606,-0.175322,1.311454,-1.979434,0.126163,-1.447925,0.69362,0.642851,-1.210202,...,1.491552,-0.175322,-0.261225,1.311688,-0.565124,-0.063559,0.722445,0.325345,0.338568,-1.040333
2,6,0.806394,-0.098789,0.433983,0.475828,-1.432808,-0.526993,0.565805,0.751874,0.008108,...,0.423655,-0.098789,0.7101,0.228566,0.882185,0.908236,-1.384908,-1.152677,-1.558487,0.404779
3,6,-0.492751,0.674578,-1.936184,1.760724,2.030909,2.625855,-1.181714,-0.962631,0.596391,...,-0.878792,0.674578,-1.283807,-1.047907,-1.617801,-1.423677,-0.194133,-0.463957,-0.174898,0.211419
4,5,1.419783,2.145722,1.065013,0.469107,1.838436,1.509426,1.385313,0.990965,2.393308,...,0.819357,2.145722,1.862653,1.432967,1.350435,1.173527,-0.934516,-1.152677,0.005849,1.43264


In [6]:
# Show the node -> feature-column mapping loaded from the JSON file.
df_nodes

{'sig': ['sigHH', 'sigHV', 'sigVV'],
 'R': ['Rhhvv', 'Rhvhh', 'Rhvvv'],
 'L': ['l1_lg', 'l2_lg', 'l3_lg'],
 'HA': ['H_lg', 'A_lg', 'a'],
 'PH': ['PH_lg'],
 'rvi': ['rvi_lg'],
 'paul': ['paulalpha', 'paulbeta', 'paulgamma'],
 'krog': ['krogks', 'krogkd'],
 'free': ['freeodd_lg'],
 'yam': ['yamodd_lg'],
 'RGB': ['G_lg', 'R_lg', 'Redge_lg', 'NIR']}

In [7]:
# Node names, in the order they appear in the mapping.
node_list = list(df_nodes)

In [8]:
# node name -> labels of the reduced columns produced for that node
df_reduced_nodes = {}
# One DataFrame per node; these lists are concatenated column-wise afterwards.
df_train_with_node_PCA = []
df_test_with_node_PCA = []

for node in node_list:
    node_columns = df_nodes[node]
    train_block = df_train[node_columns]
    test_block = df_test[node_columns]

    if len(node_columns) != 1:
        # Fit on the training rows only, then project both splits with the
        # same fitted transform and keep just the leading components needed
        # to reach the variance threshold.
        node_pca = PCA().fit(train_block)
        n_keep = num_components_above_variance_threshold(
            node_pca.explained_variance_ratio_, variance_threshold
        )
        train_scores = node_pca.transform(train_block)[:, :n_keep]
        test_scores = node_pca.transform(test_block)[:, :n_keep]
    else:
        # Single-feature nodes are passed through unchanged (no PCA applied).
        n_keep = 1
        train_scores = train_block.values
        test_scores = test_block.values

    # Reduced columns are named "<node>_<component index>".
    node_labels = [f"{node}_{i}" for i in range(n_keep)]
    df_reduced_nodes[node] = node_labels

    train_frame = pd.DataFrame(train_scores, columns=node_labels)
    test_frame = pd.DataFrame(test_scores, columns=node_labels)
    df_train_with_node_PCA.append(train_frame.reset_index(drop=True))
    df_test_with_node_PCA.append(test_frame.reset_index(drop=True))

In [9]:
# Join the per-node frames column-wise into one table per split.
# These names are rebound from a list of frames to a single DataFrame, so the
# isinstance guard keeps the cell idempotent: running it a second time would
# otherwise hand pd.concat a DataFrame and raise a TypeError.
if isinstance(df_train_with_node_PCA, list):
    df_train_with_node_PCA = pd.concat(df_train_with_node_PCA, axis=1)
if isinstance(df_test_with_node_PCA, list):
    df_test_with_node_PCA = pd.concat(df_test_with_node_PCA, axis=1)

In [10]:
# Show which reduced columns were kept for each node.
df_reduced_nodes

{'sig': ['sig_0', 'sig_1'],
 'R': ['R_0', 'R_1'],
 'L': ['L_0', 'L_1', 'L_2'],
 'HA': ['HA_0', 'HA_1', 'HA_2'],
 'PH': ['PH_0'],
 'rvi': ['rvi_0'],
 'paul': ['paul_0', 'paul_1', 'paul_2'],
 'krog': ['krog_0', 'krog_1'],
 'free': ['free_0'],
 'yam': ['yam_0'],
 'RGB': ['RGB_0', 'RGB_1']}

In [11]:
# Preview the first rows of the node-reduced training table.
df_train_with_node_PCA.head()

Unnamed: 0,sig_0,sig_1,R_0,R_1,L_0,L_1,L_2,HA_0,HA_1,HA_2,...,rvi_0,paul_0,paul_1,paul_2,krog_0,krog_1,free_0,yam_0,RGB_0,RGB_1
0,-0.527848,-1.377504,2.78015,0.131221,-0.411864,1.384501,-0.172222,-0.532323,2.313138,0.238706,...,1.96171,-0.495946,-0.571911,0.95333,-0.988817,0.008159,-0.90151,-0.990055,-3.106349,-0.426958
1,0.415326,1.242972,-2.048382,1.352128,0.096514,-1.487225,0.367302,1.900626,-2.073635,0.387985,...,-2.442317,0.5999,1.352273,-0.306511,0.740496,1.112728,-0.565124,-0.063559,1.149357,-0.511838
2,0.664127,0.220785,-0.754377,-1.410546,0.77164,-0.535624,-0.021493,-0.111094,-0.525608,-0.494542,...,-0.474017,0.61673,-0.037252,-0.629253,0.66277,-0.340057,0.882185,0.908236,-2.312205,-0.696479
3,-0.939297,-1.832728,3.731098,0.452049,-0.917723,1.284354,-0.437622,0.361956,2.52474,0.383149,...,1.934154,-0.803207,-0.208336,1.456428,-1.650201,0.165134,-1.617801,-1.423677,-0.533453,0.093304
4,2.690041,-0.545432,2.149293,1.122591,2.73849,1.0603,0.028199,-0.915906,1.591056,0.478693,...,1.585437,2.622656,-0.611836,0.551673,2.329354,-0.301873,1.350435,1.173527,-1.721609,1.100353


In [12]:
# Persist the node -> reduced-column-labels mapping as JSON.
with open('../data/crop_mapping/Winnipeg_reduced_nodes.json', 'w') as fp:
    fp.write(json.dumps(df_reduced_nodes))

# Write the reduced train and test tables as CSV, without the index column.
for reduced_frame, csv_path in (
    (df_train_with_node_PCA, '../data/crop_mapping/Winnipeg_node_reduced_train.csv'),
    (df_test_with_node_PCA, '../data/crop_mapping/Winnipeg_node_reduced_test.csv'),
):
    reduced_frame.to_csv(csv_path, mode='w', index=False)