## Multi-label prediction in Kenya

- The purpose is to train a neural network (NN) to assign multiple labels to planet imagery, using labels collected on high-resolution images.
- Predict the coverage percentage of each land-cover type, based on visual interpretation.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import rasterio as rio
from rasterio.plot import show
from config import conf
import numpy as np
import cv2

In [None]:
# Load the coded label table for Kenya.
raw_data = pd.read_csv(conf.label_path / "Kenya_final_data_coded.csv", low_memory=False)
# Remove undesired rows: those whose id contains the literal text "CHANGE".
# `na=False` keeps the mask strictly boolean, so rows with a missing id are
# retained instead of making the `~` negation fail on NaN entries;
# `regex=False` matches "CHANGE" as plain text rather than as a pattern.
is_change_row = raw_data["id"].str.contains("CHANGE", regex=False, na=False)
raw_data = raw_data[~is_change_row]

In [None]:
# Columns of interest in the raw table: the land-use labels, the
# `degraded_forest` column, one `topography<element>_coverage` column per
# topography element, and `land_use_multiple`.
_topo_elements = (
    "road",
    "river",
    "lake",
    "house",
    "crops",
    "trees",
    "grass",
    "bush",
    "bareSoil",
)
target_cols = [
    "land_use_category_label",
    "land_use_subcategory_label",
    "land_use_subdivision_label",
    "degraded_forest",
    *(f"topography{element}_coverage" for element in _topo_elements),
    "land_use_multiple",
]

In [None]:
# Names of the target columns that hold topography coverage values
# (every target column whose name contains "topo").
topo_col_names = [name for name in target_cols if "topo" in name]
topo_cols = np.array(topo_col_names, dtype=np.str_)

# Short element labels: the same names with the "topography" prefix and the
# "_coverage" suffix stripped, in the same order as `topo_cols`.
topo_labels = np.array(
    [
        name.replace("topography", "").replace("_coverage", "")
        for name in topo_col_names
    ]
)
# The trailing semicolon suppresses the notebook output of this expression.
topo_labels;

In [None]:
# Get all the different ranges from all the topography tags.

# Per-column unique values (kept under its original name for inspection).
unique_ranges = raw_data[topo_cols].apply(lambda x: x.unique().tolist())

# Flatten the cell values themselves instead of the `apply` result: depending
# on whether the per-column lists have equal lengths, `apply` returns either a
# DataFrame or a Series of lists, and squeezing a one-row/one-column result
# would collapse a dimension so that the strings get iterated character by
# character. Ravelling the raw values has a single, predictable shape.
topo_ranges = sorted(pd.unique(raw_data[topo_cols].to_numpy().ravel()).tolist())
# The trailing semicolon suppresses the notebook output of this expression.
topo_ranges;

In [None]:
# Coverage ranges from which a topography element is considered
# representative of the plot: every ten-wide range from "40-49" up to
# "80-89", plus the final "90-100" range.
topo_thres = [f"{low}-{low + 9}" for low in range(40, 90, 10)]
topo_thres.append("90-100")

In [None]:
# Display the short topography labels (the topography column names without
# the "topography" prefix and "_coverage" suffix).
topo_labels

In [None]:
# Merge all topography tags into one cell per row, based on the topo
# threshold: a label is kept when the row's value in the matching coverage
# column is one of the ranges listed in `topo_thres`; the kept labels are
# joined with commas (an empty string when none qualifies).
#
# `isin` builds the whole boolean mask in one vectorized call, replacing the
# per-row `True if ... else False` membership test.
representative = raw_data[topo_cols].isin(topo_thres).to_numpy()
raw_data["topo_tags"] = [
    ",".join(topo_labels[row_mask]) for row_mask in representative
]

In [None]:
# Build the final label table and write it to disk.
#
# `raw_data["topo_tags"]` is already populated by the preceding cell; the
# identical recomputation that was repeated at the top of this cell was
# redundant and has been removed.

# Shorter names for the label columns that are kept.
column_aliases = {
    "land_use_category_label": "lc_tags",
    "land_use_subdivision_label": "lc_sub_tags",
    "land_use_multiple": "multiple",
}

# Columns retained in the exported table, in output order.
kept_columns = [
    "id",
    "multiple",
    "lc_tags",
    "lc_sub_tags",
    "degraded_forest",
    "topo_tags",
]

# Rename, keep only the selected columns, and renumber the rows from zero.
data_df = raw_data.rename(columns=column_aliases)[kept_columns].reset_index(drop=True)

# `to_csv` is called with its defaults, so the fresh integer index is written
# as the first (unnamed) column of the file.
data_df.to_csv(conf.data_file)

In [None]:
# Display the path that `data_df` was written to.
conf.data_file

In [None]:
# Display the resulting label table.
data_df