In [1]:
import os
from glob import glob

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
class SpectralCurveFiltering:
    """
    Reduce a 3D cube to its spectral curve.

    Instances are callable. A call hands the sample to ``merge_function``
    together with ``axis=(1, 2)``, so axes 1 and 2 are aggregated away and
    only axis 0 (one entry per band) remains. For a 3D cube the returned
    array therefore has the shape [CHANNELS_COUNT].

    Args:
        merge_function: Reduction invoked as ``merge_function(sample, axis=(1, 2))``.
            Defaults to ``np.mean``.
    """

    def __init__(self, merge_function = np.mean):
        self.merge_function = merge_function

    def __call__(self, sample: np.ndarray):
        # Everything except the leading axis is merged into a single value.
        reduce_over = (1, 2)
        aggregate = self.merge_function
        return aggregate(sample, axis=reduce_over)

In [3]:
def load_data(directory: str):
    """Turn every cube stored in a directory into a spectral curve.

    Each ``*.npz`` file found directly inside ``directory`` is opened, wrapped
    in a masked array and reduced with a default ``SpectralCurveFiltering``.
    Files are visited in ascending order of the integer encoded in their file
    name (the base name with ".npz" removed), so a file whose name is not an
    integer makes the sort raise ``ValueError``.

    Args:
        directory (str): Directory to either train or test set
    Returns:
        numpy.ndarray: The spectral curves of all samples stacked with
            ``np.array``, in file-index order.
    """

    def _sample_index(path):
        # "123.npz" -> 123, so that 10.npz sorts after 9.npz.
        return int(os.path.basename(path).replace(".npz", ""))

    pattern = os.path.join(directory, "*.npz")
    ordered_paths = np.array(sorted(glob(pattern), key=_sample_index))

    to_curve = SpectralCurveFiltering()
    curves = []
    for path in ordered_paths:
        with np.load(path) as archive:
            # The archive entries become the keyword arguments of MaskedArray
            # (presumably `data` and `mask` -- confirm against the dataset).
            cube = np.ma.MaskedArray(**archive)
        curves.append(to_curve(cube))
    return np.array(curves)

In [4]:
def load_gt(file_path: str):
    """Read the training targets from the ground truth file.

    Args:
        file_path (str): Path to the ground truth .csv file.
    Returns:
        numpy.ndarray: 2D array with one row per CSV record and the columns
            "P", "K", "Mg", "pH", in that order.
    """

    target_columns = ["P", "K", "Mg", "pH"]
    ground_truth = pd.read_csv(file_path)
    # Any other column of the file is dropped here.
    return ground_truth[target_columns].values


In [5]:
# Load the spectral curves of both splits and the training targets.
X_train = load_data("../data/raw/train_data/train_data")
y_train = load_gt("../data/raw/train_data/train_gt.csv")
X_test = load_data("../data/raw/test_data")

print(f"Train data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Target data shape: {y_train.shape}")

# Curves are ordered by the integer in the file name, targets by CSV row;
# nothing else ties the two together (presumably the CSV rows follow the same
# sample order -- TODO confirm). At least require one target row per cube so
# a missing file or a truncated CSV is caught here instead of during training.
assert len(X_train) == len(y_train), (
    f"Sample/label count mismatch: {len(X_train)} cubes vs {len(y_train)} label rows"
)

Train data shape: (1732, 150)
Test data shape: (1154, 150)
Target data shape: (1732, 4)


In [6]:
# Pearson correlation matrix of the targets. rowvar=False makes every column
# of y_train a variable, i.e. rows/columns of the result follow the column
# order selected in load_gt: P, K, Mg, pH.
np.corrcoef(y_train, rowvar=False)

array([[ 1.        ,  0.41349407, -0.10303426,  0.12006662],
       [ 0.41349407,  1.        ,  0.23416241,  0.16879292],
       [-0.10303426,  0.23416241,  1.        ,  0.01097964],
       [ 0.12006662,  0.16879292,  0.01097964,  1.        ]])

In [9]:
# Target names, in the column order of y_train; used for titles and axis labels
variables = ["P", "K", "Mg", "pH"]

# Every unordered pair of columns exactly once: (0, 1), (0, 2), ..., (2, 3)
n_vars = len(variables)
index_pairs = [(i, j) for i in range(n_vars) for j in range(i + 1, n_vars)]

# One figure per pair, first column of the pair on the x axis
for i, j in index_pairs:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_train[:, i], y_train[:, j], c='blue', alpha=0.5)
    ax.set_title(f'{variables[i]} vs {variables[j]}')
    ax.set_xlabel(variables[i])
    ax.set_ylabel(variables[j])
    plt.show()

array([[ 45.1, 188. , 179. ,   7.2],
       [ 44.8, 205. , 188. ,   7. ],
       [ 44.4, 207. , 145. ,   6.8],
       ...,
       [ 39.4, 180. , 122. ,   6.5],
       [ 37.3, 162. , 127. ,   6.5],
       [ 29.5, 146. , 133. ,   6.3]])

In [None]:
# np.savetxt opens the target file for writing but does not create missing
# parent directories, so make sure the interim folder exists first.
os.makedirs("../data/interim", exist_ok=True)

# Space-separated text; fmt='%f' writes fixed-point values with six decimals.
np.savetxt("../data/interim/X_train.txt", X_train, delimiter=" ", fmt='%f')
np.savetxt("../data/interim/X_test.txt", X_test, delimiter=" ", fmt='%f')
np.savetxt("../data/interim/y_train.txt", y_train, delimiter=" ", fmt='%f')