In [64]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import pickle

import geoviews as gv
from geoviews import opts, tile_sources as gvts

gv.extension("bokeh", "matplotlib")

### Prepare Downloaded Data

In [2]:
shots = pd.concat(
    pd.read_csv("../data/" + g) for g in os.listdir("../data") if g.endswith(".csv")
)

Drop unwanted shots, for instance, unlabeled shots.

In [3]:
shots = shots[shots["Class"] != "Not Classified"]
n_shots, _ = shots.shape
print(n_shots, "shots remain.")

737847 shots remain.


Randomly permute the shots to guarantee that the test shots will not differ in distribution from the training shots.

In [4]:
shots = shots.sample(frac=1).reset_index(drop=True)
shots.head()

Unnamed: 0.1,Unnamed: 0,Class,Group,beam,channel,lat_lowestmode,lon_lowestmode,elev_lowestmode,delta_time,rh,land_cover_data/landsat_water_persistence,land_cover_data/landsat_treecover,land_cover_data/region_class,land_cover_data/urban_proportion,land_cover_data/urban_focal_window_size,shot_number
0,593,Evergreen Forest,Forest,3,1,43.770251,-74.149947,518.363,101200200.0,[-2.320e+00 -1.340e+00 -5.600e-01 3.000e-02 ...,0,87.0,7,0,3,128030300300251923
1,742,Cultivated Crops,Planted/Cultivated,5,2,42.257794,-79.430706,381.42648,124981100.0,[-4.34 -4. -3.7 -3.48 -3.25 -3.06 -2.91 -2....,0,0.0,7,0,3,170690500200277409
2,1510,Deciduous Forest,Forest,6,3,42.81552,-77.784368,207.14969,53007250.0,[-3.67 -2.84 -2.24 -1.76 -1.38 -1.01 -0.71 -0....,0,0.0,7,0,3,41550600300391215
3,28,Emergent Herbaceous Wetlands,Wetlands,2,1,44.510707,-75.444823,64.90311,55045720.0,[-4.15 -3.74 -3.44 -3.21 -3.03 -2.84 -2.69 -2....,3,0.0,7,0,3,45210200300211870
4,403,Deciduous Forest,Forest,3,1,43.62836,-75.833698,385.6232,129655900.0,[-3.85 -3.55 -3.29 -3.1 -2.88 -2.73 -2.58 -2....,0,85.0,7,0,3,179080300200120673


Extract rh profiles and coverage type labels.

In [5]:
def str2np(rhstr):
    return np.fromstring(rhstr[1:-1], sep=' ')
    
rh = np.array([str2np(r) for r in shots["rh"]])
rh.shape

(737847, 101)

In [6]:
classes = np.array(shots["Class"])
groups = np.array(shots["Group"])

Perform train/test split.

In [7]:
split = int(n_shots * 0.7)
print(f"training on {split} shots, testing on {n_shots - split} shots")
class_train, classes_test = classes[:split], classes[split:]
group_train, group_test = groups[:split], groups[split:]
rh_train, rh_test = rh[:split], rh[split:]

training on 516492 shots, testing on 221355 shots


In [8]:
def test_model(model, n_test):
    pred = pipe.predict(rh_test[:n_test])
    acc = balanced_accuracy_score(group_test[:n_test], pred)
    return acc, pred

### Classification Pipelines
We demonstrate KNN classification on three different embeddings of the data: first, downsampling rh profiles followed by robust scaling, second, PCA, and third, downsampling with no normalization whatsoever. Balanced classification accuracy is similar across embeddings.

In [9]:
class Downsampler:
    """Downsamples input data according to the sklearn transform interface."""

    def __init__(self, keep):
        """
        keep should index the features *not* to be dropped. For instance, if only the middle two
        of four features are of interest, keep should be [1, 2] or [False, True, True, False].
        """
        self.keep_ = keep

    def fit(self, X, y=None):
        """Does nothing; provided for interface consistency."""
        return self

    def transform(self, X, y=None):
        """Transform a data matrix of shape (n_samples, n_features) by dropping unwanted features."""
        return np.copy(X[:, self.keep_])
    

In [10]:
n_test = 10000    # don't use the full test set to speed things up

# this loop is somewhat wasteful, since it repeats building and querying the Ball/KD tree.
for k in [10, 20, 30, 50, 80]:
    pipe = Pipeline([
        ("downsampler", Downsampler(range(0, 101, 10))),
        ("scaler", RobustScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=k))
    ]).fit(rh_train, group_train)
    acc, _ = test_model(pipe, n_test)
    print(f"k = {k} balanced accuracy of {acc}")

k = 10 balanced accuracy of 0.3566081406371947
k = 20 balanced accuracy of 0.35623104626495156
k = 30 balanced accuracy of 0.35511336299477564
k = 50 balanced accuracy of 0.3542144187772743
k = 80 balanced accuracy of 0.35686419227587096


In [11]:
for k in [10, 20, 30, 50, 80]:
    pipe = Pipeline([
        ("pca", PCA(n_components=6)),
        ("classifier", KNeighborsClassifier(n_neighbors=k))
    ]).fit(rh_train, group_train)
    acc, _ = test_model(pipe, n_test)
    print(f"k = {k} balanced accuracy of {acc}")

k = 10 balanced accuracy of 0.3512684471939765
k = 20 balanced accuracy of 0.3580523281697129
k = 30 balanced accuracy of 0.35697949906445475
k = 50 balanced accuracy of 0.3547602697869164
k = 80 balanced accuracy of 0.35321456256789935


In [12]:
for k in [10, 20, 30, 50, 80]:
    pipe = Pipeline([
        ("downsampler", Downsampler(range(0, 101, 10))),
        ("classifier", KNeighborsClassifier(n_neighbors=k))
    ]).fit(rh_train, group_train)
    acc, _ = test_model(pipe, n_test)
    print(f"k = {k} balanced accuracy of {acc}")

k = 10 balanced accuracy of 0.35019350218501694
k = 20 balanced accuracy of 0.35590277212003324
k = 30 balanced accuracy of 0.357474574734065
k = 50 balanced accuracy of 0.3571711308883035
k = 80 balanced accuracy of 0.3557297597831745


### Visualize Results
Let's scatterplot the test points by true and predicted group labels. The next cell defines a color for each group.

In [13]:
group_color = {
    "Water": "#bad8ea",
    "Developed": "#b50000",
    "Barren": "#b2ada3",
    "Forest": "#b5c98e",
    "Scrubland": "#ccba7c",
    "Planted/Cultivated": "#aa7028",
    "Wetlands": "#70a3ba"
}

In [14]:
with open("../data/nys_simple_boundaries.pickle", "rb") as file:
    bounds = pickle.load(file)

In [15]:
model = Pipeline([
    ("downsampler", Downsampler(range(0, 101, 10))),
    ("classifier", KNeighborsClassifier(n_neighbors=30))
]).fit(rh_train, group_train)
acc, pred = test_model(model, n_test)
print(f"Balanced accuracy of {acc}")

Balanced accuracy of 0.3557297597831745


In [60]:
lon_test = shots["lon_lowestmode"][split:]
lat_test = shots["lat_lowestmode"][split:]

plotdf = pd.DataFrame({
    "Longitude": lon_test[:n_test],
    "Latitude": lat_test[:n_test],
    "pred": [group_color[g] for g in pred],
    "true": [group_color[g] for g in group_test[:n_test]]
})

In the plot below, each shot is colored according to its predicted group, with the smaller dot inside indicating the true group. Therefore misclassified shots appear as concentric circles of different color.

In [62]:
gvts.EsriImagery \
* gv.Polygons(bounds).opts(color=None, height=480, width=640) \
* gv.Points(plotdf).opts(color="pred", size=10) \
* gv.Points(plotdf).opts(color="true", size=5)