In [1]:
import itertools
import json
import os

import numpy as np
import pandas as pd

from soil_roughness_change_detection.modules.experiment import run_experiment
from soil_roughness_change_detection.modules.outlier_detectors import (
    dbscan_outlier_detector,
    mahalanobis_distance_outlier_detector,
    isolation_forest_outlier_detector
)
from soil_roughness_change_detection.modules.preprocessor import (
    preprocess_backscatter,
    preprocess_harrysfarm_activity_log
)

%matplotlib inline

## Config

In [2]:
# --- Input files -----------------------------------------------------------
# Pickled Sentinel-1 table (loaded with pd.read_pickle) and the farm activity
# annotation CSV (loaded with pd.read_csv).
s1_file_path = './data/harrysfarm/harrysfarm_fields_s1.pkl'
farm_activity_annotation_path = './data/harrysfarm_tillage_activity.csv'

# --- Train / test periods (ISO date strings) -------------------------------
# `training_period_to` is also passed as the start of the testing period.
training_period_from = '2018-12-31'
training_period_to = '2020-12-31'
testing_period_to = '2022-12-31'

# --- Reproducibility -------------------------------------------------------
# Seed forwarded to the isolation forest parameter grid.
random_state = 123

## Preprocess Sentinel-1 Backscatter Data

In [3]:
# Load the Sentinel-1 table from the configured pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle(s1_file_path)

# Split into training and testing frames; the testing period starts at the
# date on which the training period ends.
training_df, testing_df = preprocess_backscatter(
    df,
    training_period_from=training_period_from,
    training_period_to=training_period_to,
    testing_period_from=training_period_to,
    testing_period_to=testing_period_to,
)


def _unique_intervals(backscatter_df):
    """Return one row per (from_date, date) group present in ``backscatter_df``.

    The index is flattened first so that both keys are reachable as columns;
    the result holds only those two columns, ordered by the group keys.
    """
    interval_keys = ['from_date', 'date']
    flattened = backscatter_df.reset_index()
    return flattened[interval_keys].groupby(interval_keys).first().reset_index()


training_interval_df = _unique_intervals(training_df)

testing_interval_df = _unique_intervals(testing_df)

In [4]:
def _count_unique(frame, column):
    """Number of distinct values of ``column`` after flattening the index of ``frame``."""
    return frame.reset_index()[column].unique().shape[0]


# Quick sanity check on the size of each split.
print("Number of training dates:", _count_unique(training_df, 'date'))
print("Number of training fields:", _count_unique(training_df, 'field_id'))
print("Number of testing dates:", _count_unique(testing_df, 'date'))
print("Number of testing fields:", _count_unique(testing_df, 'field_id'))

Number of training dates: 359
Number of training fields: 196
Number of testing dates: 269
Number of testing fields: 196


## Preprocess Harrys Farm Activity Data

In [5]:
# Read the farm activity annotations from the configured CSV file.
tillage_df = pd.read_csv(farm_activity_annotation_path)

# Split the activity log into training and testing events; the testing period
# starts at the date on which the training period ends.
train_tillage_df, test_tillage_df = preprocess_harrysfarm_activity_log(
    tillage_df,
    training_period_from=training_period_from,
    training_period_to=training_period_to,
    testing_period_from=training_period_to,
    testing_period_to=testing_period_to,
)

# Report how many events landed in each split.
print('Number of training events:', len(train_tillage_df))
print('Number of testing events:', len(test_tillage_df))

Number of training events: 31
Number of testing events: 41


## Models

### DBSCAN

In [6]:
# Candidate values for each DBSCAN hyper-parameter.
eps_list = [0.1, 0.3, 0.5, 0.7, 1.0]
min_samples_list = [3, 5, 10, 15, 20]
metric_list = ['euclidean']
leaf_size_list = [10, 20, 30, 40, 50]

# Full Cartesian grid (5 * 5 * 1 * 5 = 125 settings), one keyword dict per
# combination; `n_jobs` is fixed to -1 for every entry.
# NOTE(review): scikit-learn documents DBSCAN's `leaf_size` as affecting tree
# build/query speed and memory only. If the detector forwards these keys to
# sklearn's DBSCAN unchanged (TODO confirm), sweeping `leaf_size` multiplies
# the run count by 5 without changing the detected outliers.
db_scan_parameter_combinations = [
    dict(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        leaf_size=leaf_size,
        n_jobs=-1,
    )
    for eps, min_samples, metric, leaf_size in itertools.product(
        eps_list, min_samples_list, metric_list, leaf_size_list
    )
]
# Experiment descriptor for DBSCAN: `name` labels the log lines and result
# files, `detector` is the callable under test, and `parameter_spaces` is the
# list of keyword dicts to evaluate.
dbscan_experiment_config = {
    'name': 'dbscan',
    'detector': dbscan_outlier_detector,
    'parameter_spaces': db_scan_parameter_combinations,
}

### Mahalanobis Distance with $\chi^2$

In [7]:
# One single-key parameter dict per candidate `alpha`.
# NOTE(review): presumably `alpha` is the significance level of the chi-square
# cut-off applied to the Mahalanobis distance -- confirm against the detector.
mahalanobis_parameter_combinations = [
    {'alpha': alpha} for alpha in (0.01, 0.05, 0.1)
]
# Experiment descriptor for the Mahalanobis detector: `name` labels the log
# lines and result files, `detector` is the callable under test, and
# `parameter_spaces` is the list of keyword dicts to evaluate.
mahalanobis_experiment_config = {
    'name': 'mahalanobis',
    'detector': mahalanobis_distance_outlier_detector,
    'parameter_spaces': mahalanobis_parameter_combinations,
}

### Isolation Forest

In [8]:
# Candidate values for each isolation forest hyper-parameter.
n_estimators_list = [30, 50, 100, 150]
contamination_list = [0.1, 0.2, 0.3, 0.4, 0.5, 'auto']

# Full Cartesian grid (4 * 6 = 24 settings). Every entry shares the same
# `n_jobs` and the notebook-wide `random_state` seed.
isolation_forest_parameter_combinations = [
    dict(
        n_estimators=n_estimators,
        contamination=contamination,
        n_jobs=-1,
        random_state=random_state,
    )
    for n_estimators, contamination in itertools.product(
        n_estimators_list,
        contamination_list,
    )
]
# Experiment descriptor for the isolation forest: `name` labels the log lines
# and result files, `detector` is the callable under test, and
# `parameter_spaces` is the list of keyword dicts to evaluate.
isolation_forest_experiment_config = {
    'name': 'isolation_forest',
    'detector': isolation_forest_outlier_detector,
    'parameter_spaces': isolation_forest_parameter_combinations,
}

## Experiment

### Experiment Config

In [9]:
# Feature sets to evaluate. The position in this list is the `feature_idx`
# that ends up in the name of each result file, so keep the order stable.
features = [
    ['VV_diff', 'VH_diff'],                        # feature 0
    ['VV_ratio', 'VH_ratio'],                      # feature 1
    ['VV_ratio', 'VH_ratio', 'VH_VV_ratio_diff'],  # feature 2
]

# Detectors to run for every feature set, in execution order.
model_configs = [
    dbscan_experiment_config,            # density-based clustering
    mahalanobis_experiment_config,       # Mahalanobis distance
    isolation_forest_experiment_config,  # isolation forest
]

### Run Experiment

In [10]:
# Create the output directory *before* starting the experiments: each model
# run takes minutes, and without this a missing `results/` folder would only
# surface as a FileNotFoundError after the first run had already finished,
# discarding its results.
os.makedirs('results', exist_ok=True)

# Evaluate every detector on every feature set using the training split, and
# persist each (model, feature set) result as its own JSON file.
for feature_idx, feature in enumerate(features):
    print(f'Running feature {feature_idx}: {feature}')
    for model_config in model_configs:
        print(f'Running {model_config["name"]}')
        results = run_experiment(
            training_df,
            train_tillage_df,
            training_interval_df,
            feature,
            model_config['detector'],
            model_config['parameter_spaces']
        )
        # The file name encodes the model name and the index of the feature
        # set within `features`.
        result_path = f'results/{model_config["name"]}_train_result_feature_{feature_idx}.json'
        # Write-only access is enough; stream the JSON straight to the file.
        with open(result_path, 'w') as f:
            json.dump(results, f, indent=2)

Running feature 0: ['VV_diff', 'VH_diff']
Running dbscan


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [14:09<00:00,  6.80s/it]


Finish with 00:14:09
Running mahalanobis


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.80s/it]


Finish with 00:00:11
Running isolation_forest


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [07:36<00:00, 19.02s/it]


Finish with 00:07:36
Running feature 1: ['VV_ratio', 'VH_ratio']
Running dbscan


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [14:10<00:00,  6.80s/it]


Finish with 00:14:10
Running mahalanobis


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.71s/it]


Finish with 00:00:11
Running isolation_forest


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [07:38<00:00, 19.11s/it]


Finish with 00:07:38
Running feature 2: ['VV_ratio', 'VH_ratio', 'VH_VV_ratio_diff']
Running dbscan


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [14:13<00:00,  6.83s/it]


Finish with 00:14:13
Running mahalanobis


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.03s/it]


Finish with 00:00:09
Running isolation_forest


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [07:38<00:00, 19.10s/it]

Finish with 00:07:38



