## Step 01: Preprocess data

In this notebook, we preprocess the crop mapping data in `WinnipegDataset.csv`. We parse the list of features in `features.txt` and set these as the column names. We then split the dataset's features according to whether they were collected on 05 July or 14 July. The 05 July data will be used to train Quiver PCA, and the 14 July data for the same data points will be used as a test set to evaluate its effectiveness. To reduce computational cost, we sample 20% of the data, stratified by class so that the relative frequency of each class is maintained.

In [1]:
import pandas as pd

In [2]:
# Load the raw crop-mapping table; its columns are renamed in a later cell.
crops = pd.read_csv(filepath_or_buffer='../data/crop_mapping/WinnipegDataset.csv')

In [3]:
# Build the mapping passed to `crops.rename(columns=...)` from features.txt.
# Each non-blank line is expected to have the form "<key>:<value>".
with open('../data/crop_mapping/features.txt', 'r') as features:
    features_lines = features.readlines()

feature_dict = {}
for line in features_lines:
    line = line.replace('\n', '')
    if not line.strip():
        # Skip blank lines (e.g. a trailing empty line at the end of the file),
        # which would otherwise fail to unpack into key/val.
        continue
    # Split on the first ':' only, so a value that itself contains ':' stays intact.
    # A line with no ':' at all still raises ValueError rather than being ignored.
    key, val = line.split(':', 1)
    feature_dict[key] = val

In [4]:
# Apply the names read from features.txt to the columns, modifying `crops` in place.
crops.rename(feature_dict, axis='columns', inplace=True)

In [5]:
# Class label followed by every column whose name matches '05July'.
crops_05July = pd.concat(
    [crops['crop type class'], crops.filter(regex='05July')],
    axis=1,
)

In [6]:
# Class label followed by every column whose name matches '14July'.
crops_14July = pd.concat(
    [crops['crop type class'], crops.filter(regex='14July')],
    axis=1,
)

In [14]:
# Remove labels from column names (Rad/Opt and 05July/14July)
def parse_feature_type(features):
    """Split each column name into its feature name and its trailing label.

    Parameters
    ----------
    features : iterable of str
        Column names such as ``'sigHH_Rad05July'``. A name without an
        underscore is kept unchanged.

    Returns
    -------
    parsed_features : list of str
        The input names with the final ``'_<label>'`` part removed, in
        input order.
    type_dict : dict
        Maps each parsed name to the label that was removed from it
        (``''`` when the name had no underscore). If two inputs produce the
        same parsed name, the later one's label overwrites the earlier one.
    """
    parsed_features = []
    type_dict = {}
    for s in features:
        # Split on the LAST underscore only: the label is the suffix, so a
        # feature name that itself contains '_' no longer raises ValueError.
        key, sep, val = s.rpartition('_')
        if not sep:
            # No underscore: rpartition returns ('', '', s), so restore the name.
            key, val = s, ''
        type_dict[key] = val
        parsed_features.append(key)
    return parsed_features, type_dict

In [8]:
# Derive the cleaned feature names (and their removed labels) from the 05 July columns.
parsed_features, type_dict = parse_feature_type(crops_05July.columns.tolist())

In [9]:
# Show the cleaned feature names returned by parse_feature_type.
print(parsed_features)

['crop type class', 'sigHH', 'sigHV', 'sigVV', 'sigRR', 'sigRL', 'sigLL', 'Rhhvv', 'Rhvhh', 'Rhvvv', 'Rrrll', 'Rrlrr', 'Rrlll', 'Rhh', 'Rhv', 'Rvv', 'Rrr', 'Rrl', 'Rll', 'Ro12', 'Ro13', 'Ro23', 'Ro12cir', 'Ro13cir', 'Ro23cir', 'l1', 'l2', 'l3', 'H', 'A', 'a', 'HA', 'H1mA', '1mHA', '1mH1mA', 'PH', 'rvi', 'paulalpha', 'paulbeta', 'paulgamma', 'krogks', 'krogkd', 'krogkh', 'freeodd', 'freedbl', 'freevol', 'yamodd', 'yamdbl', 'yamhlx', 'yamvol', 'B', 'G', 'R', 'Redge', 'NIR', 'NDVI', 'SR', 'RGRI', 'EVI', 'ARVI', 'SAVI', 'NDGI', 'gNDVI', 'MTVI2', 'NDVIre', 'SRre', 'NDGIre', 'RTVIcore', 'RNDVI', 'TCARI', 'TVI', 'PRI2', 'MeanPC1', 'VarPC1', 'HomPC1', 'ConPC1', 'DisPC1', 'EntPC1', 'SecMomPC1', 'CorPC1', 'MeanPC2', 'VarPC2', 'HomPC2', 'ConPC2', 'DisPC2', 'EntPC2', 'SecMomPC2', 'CorPC2']


In [10]:
# Show the mapping from each cleaned feature name to the label removed from it.
print(type_dict)

{'crop type class': '', 'sigHH': 'Rad05July', 'sigHV': 'Rad05July', 'sigVV': 'Rad05July', 'sigRR': 'Rad05July', 'sigRL': 'Rad05July', 'sigLL': 'Rad05July', 'Rhhvv': 'Rad05July', 'Rhvhh': 'Rad05July', 'Rhvvv': 'Rad05July', 'Rrrll': 'Rad05July', 'Rrlrr': 'Rad05July', 'Rrlll': 'Rad05July', 'Rhh': 'Rad05July', 'Rhv': 'Rad05July', 'Rvv': 'Rad05July', 'Rrr': 'Rad05July', 'Rrl': 'Rad05July', 'Rll': 'Rad05July', 'Ro12': 'Rad05July', 'Ro13': 'Rad05July', 'Ro23': 'Rad05July', 'Ro12cir': 'Rad05July', 'Ro13cir': 'Rad05July', 'Ro23cir': 'Rad05July', 'l1': 'Rad05July', 'l2': 'Rad05July', 'l3': 'Rad05July', 'H': 'Rad05July', 'A': 'Rad05July', 'a': 'Rad05July', 'HA': 'Rad05July', 'H1mA': 'Rad05July', '1mHA': 'Rad05July', '1mH1mA': 'Rad05July', 'PH': 'Rad05July', 'rvi': 'Rad05July', 'paulalpha': 'Rad05July', 'paulbeta': 'Rad05July', 'paulgamma': 'Rad05July', 'krogks': 'Rad05July', 'krogkd': 'Rad05July', 'krogkh': 'Rad05July', 'freeodd': 'Rad05July', 'freedbl': 'Rad05July', 'freevol': 'Rad05July', 'yamo

In [11]:
# Give both date-specific frames the same cleaned column names, then shorten
# the label column to 'class'.
# NOTE(review): columns are matched to `parsed_features` by position, and
# `parsed_features` was parsed from the 05 July columns; this assumes the
# 14 July frame lists its features in the same order — confirm.
for frame in (crops_05July, crops_14July):
    positional_names = dict(zip(frame.columns, parsed_features))
    frame.rename(columns=positional_names, inplace=True)
    frame.rename({'crop type class': 'class'}, axis=1, inplace=True)

In [None]:
# Sample 20% of the data, drawn per class so class proportions are kept.
# A fixed random_state makes the sample reproducible across kernel restarts.
crops_05July_sample = crops_05July.groupby('class').sample(frac=0.2, random_state=42)
crops_sample_index = crops_05July_sample.index

# Take the SAME rows from the 14 July frame. (Selecting from crops_05July here
# would make the test set identical to the training set.) `.loc` is used
# because `crops_sample_index` holds index labels, not integer positions.
crops_14July_sample = crops_14July.loc[crops_sample_index]

# 05 July features are the training inputs, 14 July features the test inputs;
# the labels come from the 'class' column of the corresponding sample.
X_train = crops_05July_sample.drop(['class'], axis=1)
X_test = crops_14July_sample.drop(['class'], axis=1)
y_train = crops_05July_sample['class']
y_test = crops_14July_sample['class']

In [13]:
# Write the features and labels of each split to CSV without the index,
# overwriting any existing file.
for table, destination in (
    (X_train, '../data/crop_mapping/preprocessed_train.csv'),
    (X_test, '../data/crop_mapping/preprocessed_test.csv'),
    (y_train, '../data/crop_mapping/labels_train.csv'),
    (y_test, '../data/crop_mapping/labels_test.csv'),
):
    table.to_csv(destination, mode="w", index=False)