In [1]:
import os
import cv2
import json
import numpy as np
import pandas
import skimage
import shapefile
import matplotlib.pyplot as plt

import modules

### load data

In [2]:
kenya_osm, kenya_sf = modules.data.load_shapefile("kenya")
kenya_geo = modules.data.load_geodata("kenya")
kenya_dat = pandas.DataFrame.merge(kenya_geo, kenya_osm, on="index")

kenya_filenames = set(modules.data.util.load_image_filenames("kenya", D=128))

In [9]:
N = 300

kenya_dat[kenya_dat["class"] == "major"]
kenya_dat[kenya_dat["class"] == "minor"]
kenya_dat[kenya_dat["class"] == "two-track"]
kenya_dat["valid"] = [f"{int(index)}_{int(road_id)}.npy" in kenya_filenames for index, road_id in enumerate(kenya_dat.values[:, 0])]

major = kenya_dat.iloc[np.random.choice(kenya_dat[np.logical_and(kenya_dat["class"] == "major", kenya_dat["valid"] == True)].index, size=N, replace=False)]
minor = kenya_dat.iloc[np.random.choice(kenya_dat[np.logical_and(kenya_dat["class"] == "minor", kenya_dat["valid"] == True)].index, size=N, replace=False)]
two_track = kenya_dat.iloc[np.random.choice(kenya_dat[np.logical_and(kenya_dat["class"] == "two-track", kenya_dat["valid"] == True)].index, size=N, replace=False)]

filenames = [f"{idx}_{int(df.loc[idx]['id'])}.npy" for df in [major, minor, two_track] for idx in df.index]
permutation = np.arange(len(filenames))
np.random.shuffle(permutation)

images = [np.load(os.path.join(modules.data.util.root(), "kenya", "kenya_128x128_images", filenames[i])) for i in permutation]
images = np.array(images)
labels = permutation // N
             

### featurize

In [14]:
def _SIFT(image, sift, plot=False):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    keypoints = sift.detect(image, None)
    if plot:
        plt.imshow(cv2.drawKeypoints(image, keypoints, outImage=np.array([])))
    return keypoints

def SIFT(images):
    keypoints = []
    sift = cv2.xfeatures2d.SIFT_create()
    for image in images:
        kpts = _SIFT(image, sift)
        keypoints.append(kpts)
    return keypoints

sift = SIFT(images[:5])

In [15]:
def HOG(images):
    features = []
    hog = cv2.HOGDescriptor()
    for image in images:
        features.append(hog.compute(image))
    return features

hog = HOG(images[:5])

In [16]:
def Canny(images):
    channels = []
    for image in images:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        median = np.median(gray)
        channel = cv2.Canny(gray, (1/2) * median, (2) * median, apertureSize=3)[:, :, None]
        channels.append(channel)
    return np.array(channels)

In [17]:
def location(df, fnames):
    locations = []
    for fname in fnames:
        index = fname.split("_")[0]
        locations.append(df.loc[int(index)][["lat", "lon"]].values.astype(np.float64))
    return np.array(locations)

In [18]:
def channel_mean(images):
    return np.mean(images, axis=(1, 2))

In [19]:
def channel_variance(images):
    return np.mean(np.power(images - np.mean(images, axis=(1, 2))[:, None, None, :], 2), axis=(1, 2))

In [20]:
def feature_set(images):
    canny = Canny(images)

    locations = location(kenya_dat, filenames)
    rgb_means = channel_mean(images)
    rgb_variances = channel_variance(canny)
    canny_means = channel_mean(images)
    canny_variances = channel_variance(canny)
    
    return np.concatenate([
        locations,
        rgb_means,
        rgb_variances,
        canny_means,
        canny_variances
    ], axis=-1)

In [22]:
features = feature_set(images)
for i in range(0+1, 128-1, 128//3):
    for j in range(0+1, 128-1, 128//3):
        print(f"Patch ({i}, {j})")
        patches = images[:, i:i+128//3, j:j+128//3, :]
        features = np.concatenate([features, feature_set(patches)], axis=-1)

Patch (1, 1)
Patch (1, 43)
Patch (1, 85)
Patch (43, 1)
Patch (43, 43)
Patch (43, 85)
Patch (85, 1)
Patch (85, 43)
Patch (85, 85)


### Train

In [73]:
from sklearn.preprocessing import MinMaxScaler

from sklearn import linear_model, svm
from sklearn import tree
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import r2_score, accuracy_score, f1_score, log_loss

In [75]:
def test(classifier, X_train, y_train, X_test, y_test, verbose=True):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    scores = [r2_score(y_test, y_pred), 
              accuracy_score(y_test, y_pred), 
              f1_score(y_test, y_pred, average=None),
              log_loss(y_test, y_pred)
             ]
    if verbose:
        print(f"r^2: {scores[0]}")
        print(f"accuracy: {scores[1]}")
        print(f"f1: {scores[2]}")
    return classifier, scores

In [87]:
cutoff = features.shape[0] * 9 // 10

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features[:cutoff])

X_train = scaled_features[:cutoff]
y_train = labels[:cutoff]
X_test = scaled_features[cutoff:]
y_test = labels[cutoff:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((810, 100), (810,), (0, 100), (90,))

In [82]:
# Various classifiers of interest implemented by SKLearn
log_classifier = linear_model.LogisticRegression(solver="lbfgs", max_iter=1000)
svm_classifier = svm.SVC(kernel='rbf', gamma="auto")
k_classifier = KNeighborsClassifier(n_neighbors=6)
tree_classifier = tree.DecisionTreeClassifier()
rf_classifier = ensemble.RandomForestClassifier(n_estimators=500)
boost_classifier = ensemble.GradientBoostingClassifier()

In [83]:
_, _ = test(log_classifier, X_train, y_train, X_test, y_test)



ValueError: Found array with 0 sample(s) (shape=(0, 100)) while a minimum of 1 is required.

In [66]:
_, _ = test(svm_classifier, X_train, y_train, X_test, y_test)

r^2: -1.6315789473684212
accuracy: 0.3111111111111111
f1: [0.         0.         0.47457627]


  'precision', 'predicted', average, warn_for)


In [67]:
_, _ = test(k_classifier, X_train, y_train, X_test, y_test)

r^2: -0.7826825127334467
accuracy: 0.4
f1: [0.41935484 0.37288136 0.40677966]


In [68]:
_, _ = test(tree_classifier, X_train, y_train, X_test, y_test)

r^2: -0.12054329371816652
accuracy: 0.4666666666666667
f1: [0.62068966 0.3        0.48387097]


In [71]:
_, _ = test(rf_classifier, X_train, y_train, X_test, y_test)

r^2: -0.2903225806451615
accuracy: 0.4888888888888889
f1: [0.57142857 0.4        0.5       ]


In [72]:
_, _ = test(boost_classifier, X_train, y_train, X_test, y_test)

r^2: -0.5959252971137523
accuracy: 0.4222222222222222
f1: [0.53571429 0.32142857 0.41176471]
