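# Sample VIIRS Data

Bins the 2014 median VIIRS radiance into k-means classes on a log scale, draws an equally sized random sample from each class, and exports the sample as a pickle.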

## Setup

In [1]:
### Libraries ###
import os, datetime
import numpy as np
import pandas as pd
import geopandas as gpd
import json
import rasterio
from rasterio.plot import show

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

import logging, os 

### User Defined Libraries ###
import config as cf
import feature_extraction as fe

### Set Seeds ###
# One seed value is shared by every generator seeded below.
seed_value = 42
# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only affects child processes launched afterwards, not
# string hashing in the already-running kernel -- confirm that is sufficient.
os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set the `numpy` pseudo-random generator at a fixed value
# (legacy global NumPy state; it is not passed explicitly to any call in the
# cells shown here)
np.random.seed(seed_value)

### Parameters / Paths ###
# Local aliases for values defined in the project `config` module.
# NOTE(review): FINAL_TARGET_NAME and DTL_DIRECTORY are not referenced in the
# cells visible here -- confirm they are still needed.
FINAL_TARGET_NAME = 'ntl_bins'
VIIRS_GDF_FILEPATH = cf.VIIRS_GDF_FILEPATH
DTL_DIRECTORY = cf.DTL_DIRECTORY

## Functions

In [2]:
def transform_target(values, n_bins):
    '''
    Log-transform the NTL values and bin them into n_bins ordinal classes
    using k-means clustering.

    values : pandas Series of raw values (must support `.to_numpy()`).
    n_bins : number of classes requested from the discretizer.

    Returns the array produced by `KBinsDiscretizer.fit_transform`, i.e. one
    ordinal bin code per input row.
    '''

    # log(x + 1) keeps zero-valued observations inside the log's domain
    logged = np.log(values + 1)

    # The discretizer is fed a single-feature column: shape (n_samples, 1)
    column = logged.to_numpy().reshape(-1, 1)

    binner = KBinsDiscretizer(
        n_bins=n_bins,
        encode='ordinal',
        strategy='kmeans',
    )
    bin_codes = binner.fit_transform(column)
    return bin_codes

def sample_by_target(input_gdf, target_col_name, n, random_state=1):
    '''
    Create a sample dataframe containing up to n observations from each
    target bin.

    Parameters
    ----------
    input_gdf : DataFrame or GeoDataFrame that holds the target column.
    target_col_name : name of the column whose distinct values define the bins.
    n : maximum number of rows drawn per bin; a bin with fewer than n rows
        contributes all of its rows.
    random_state : seed forwarded to `DataFrame.sample` for every bin.
        Defaults to 1, the value that used to be hard-coded.

    Returns
    -------
    A frame built with `pd.concat`, so it has the same type as `input_gdf`.
    Bins appear in order of first occurrence and original index labels are
    kept. If nothing can be sampled (empty input, or n == 0) an empty slice
    of `input_gdf` with the same columns is returned.
    '''

    samples = []
    for bin_value in input_gdf[target_col_name].unique():
        bin_gdf = input_gdf[input_gdf[target_col_name] == bin_value]

        # Never ask for more rows than the bin holds; sample() would raise.
        n_use = min(len(bin_gdf), n)
        if n_use == 0:
            # Nothing to draw (e.g. a NaN "bin", which never equals itself).
            continue

        samples.append(bin_gdf.sample(n=n_use, random_state=random_state))

    if not samples:
        return input_gdf.iloc[0:0].copy()

    # One concat instead of DataFrame.append inside the loop: append was
    # removed in pandas 2.0 and re-copied the growing frame on every pass.
    return pd.concat(samples)

## Sample

In [4]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
viirs = pd.read_pickle(VIIRS_GDF_FILEPATH)
viirs_gdf = gpd.GeoDataFrame(viirs, geometry='geometry')

# Keep only rows that have a tile id. `.notna()` also copes with object or
# nullable dtypes, where np.isnan raises. `.copy()` makes the filtered frame
# independent, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
viirs_gdf = viirs_gdf[viirs_gdf['tile_id'].notna()].copy()

In [7]:
# Derive one binned target column per requested class count from the same
# 2014 median radiance column.
bin_columns = {
    'median_rad_2014_3bin': 3,
    'median_rad_2014_5bin': 5,
}
for bin_column, bin_total in bin_columns.items():
    viirs_gdf[bin_column] = transform_target(viirs_gdf['median_rad_2014'], bin_total)

In [8]:
# Size of the least-populated class in the 3-bin target; using it as the
# per-class sample size yields the same number of rows for every class.
bin_sizes = viirs_gdf['median_rad_2014_3bin'].value_counts()
min_bin_count = min(bin_sizes)

gdf = sample_by_target(
    viirs_gdf,
    'median_rad_2014_3bin',
    min_bin_count,
)

In [9]:
# Rows per class in the sample.
# NOTE(review): this column was built with n_bins=3, but the recorded output
# lists four classes (0.0-3.0) -- the saved output looks stale; re-run to confirm.
gdf['median_rad_2014_3bin'].value_counts()

3.0    9468
2.0    9468
1.0    9468
0.0    9468
Name: median_rad_2014_3bin, dtype: int64

## Export

In [12]:
# Write the sampled frame to the shared project folder as a pickle.
sample_path = os.path.join(
    cf.DROPBOX_DIRECTORY,
    'Data',
    'VIIRS',
    'FinalData',
    'random_samples',
    'viirs_random_sample.pkl',
)
gdf.to_pickle(sample_path)