In [1]:
!pip install tdqm geopy

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import numpy as np
import boto3
from tqdm.notebook import trange, tqdm
from math import cos, radians, nan
import warnings
import geopy.distance

In [6]:
# Load the tile metadata CSV from S3 into a DataFrame.
unlabeled_data = pd.read_csv("s3://w210-poverty-mapper/modeling/metadata/total_meta_data_full_updated_density.csv")

In [7]:
# Keep only the rows whose "Density" column equals 1.
unlabeled_data = unlabeled_data.loc[unlabeled_data["Density"].eq(1)]

In [8]:
# Preview the first rows of the density-filtered metadata.
unlabeled_data.head()

Unnamed: 0,filename,zone,center,lat_lon_bounds,utm_bounds,countries,partial_updated,Density
0,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.78771480363916, 37.195334792066234)","[(67.77493453609193, 37.205298500604044), (67....","BoundingBox(left=391290.0, bottom=4116110.0, r...",['Tajikistan'],False,1
1,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.78803746297999, 37.175147632248574)","[(67.77526071406628, 37.18511147083712), (67.7...","BoundingBox(left=391290.0, bottom=4113870.0, r...",['Tajikistan'],False,1
2,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.7883598008825, 37.15496040109096)","[(67.77558656709091, 37.16492436967268), (67.7...","BoundingBox(left=391290.0, bottom=4111630.0, r...",['Tajikistan'],False,1
3,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.78868181768202, 37.134773098609735)","[(67.77591209550492, 37.14473719712704), (67.7...","BoundingBox(left=391290.0, bottom=4109390.0, r...",['Tajikistan'],False,1
4,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.7890035137133, 37.1145857248212)","[(67.7762372996468, 37.124549953216665), (67.7...","BoundingBox(left=391290.0, bottom=4107150.0, r...",['Tajikistan'],False,1


In [9]:
# Number of rows left after the Density filter.
len(unlabeled_data)

135347

In [10]:
# Load the DHS wealth-index CSV; the path is relative to the notebook's working directory.
dhs_data = pd.read_csv("../dhs_data/dhs_wealth_index_boundaries.csv")

In [11]:
# Keep only the survey rows flagged as lying inside the boundaries.
dhs_data = dhs_data.loc[dhs_data["inside_boundaries"].eq(True)]

In [14]:
def km_to_lat_lon_displacement(km, origin_latitude):
    """Convert a distance in kilometres into latitude/longitude offsets in degrees.

    Uses a fixed 111.111 km per degree; the longitude offset is additionally
    scaled by the cosine of ``origin_latitude`` (given in degrees).

    Returns a ``(latitude_offset, longitude_offset)`` tuple.
    """
    km_per_degree = 111.111
    latitude_scale = cos(radians(origin_latitude))
    return km / km_per_degree, km / (km_per_degree * latitude_scale)


def calculate_weighted_index(center, data, radius):
    """Compute the inverse-distance-weighted wealth index around a tile centre.

    Survey points are first narrowed down with a cheap latitude/longitude
    bounding box (``radius + 1`` km), then filtered by geodesic distance.
    Every remaining point is weighted by ``radius / distance``.

    Parameters
    ----------
    center : str
        Tile centre formatted as ``"(lon, lat)"`` -- longitude first.
    data : pandas.DataFrame
        Survey points; must provide ``lat``, ``lon`` and ``wealth_index``
        columns. The frame passed in is not modified.
    radius : float
        Search radius in kilometres.

    Returns
    -------
    tuple
        ``(weighted_index, point_count, index_range)`` where ``index_range``
        is the max minus the min wealth index of the points used. When
        ``center`` cannot be parsed, or no survey point lies strictly within
        ``radius``, the result is ``(nan, 0, nan)``.
    """
    no_match = (nan, 0, nan)
    # Distances below this floor (km) are raised to it so that a survey point
    # on, or almost on, the tile centre cannot produce an unbounded weight.
    min_distance_km = 0.01

    try:
        lon, lat = (float(part) for part in center.strip("()").split(","))
    except (AttributeError, TypeError, ValueError):
        # Previously execution continued past this message and failed on the
        # unassigned coordinates; treat the row as having no survey points.
        print("ERROR: unable to extract latitude and logitude data from the center column")
        return no_match

    lat_offset, lon_offset = km_to_lat_lon_displacement(radius + 1, lat)

    # Cheap rectangular pre-filter before the costly geodesic computation.
    candidates = data[
        ((lat - lat_offset) < data['lat']) & (data['lat'] < (lat + lat_offset)) &
        ((lon - lon_offset) < data['lon']) & (data['lon'] < (lon + lon_offset))
    ]
    if len(candidates) == 0:
        return no_match

    # .assign returns a new frame, so neither the caller's data nor a filtered
    # view of it is written to.
    candidates = candidates.assign(
        distance=[
            geopy.distance.distance((lat, lon), point).km
            for point in zip(candidates["lat"], candidates["lon"])
        ]
    )
    inside_radius = candidates[candidates["distance"] < radius]
    if len(inside_radius) == 0:
        return no_match

    wealth = inside_radius["wealth_index"]
    index_range = wealth.max() - wealth.min()
    point_count = len(inside_radius)

    # Apply the floor to every near-zero distance, not only when some point
    # sits at exactly 0.0 km.
    weights = radius / inside_radius["distance"].clip(lower=min_distance_km)

    # Scoped replacement for the former global np.seterr call: floating-point
    # warnings are silenced only around this arithmetic.
    with np.errstate(divide='ignore', invalid='ignore'):
        weighted_index = (wealth * weights).sum() / weights.sum()

    return weighted_index, point_count, index_range
    
    

In [15]:
# Search radii (km) to evaluate. The "radious" spelling is kept because the
# labelling loop refers to this name.
radious_list = [10]

In [16]:
# For every radius: attach the weighted wealth index, the number of survey
# points used and their index range to each tile, print a coverage summary and
# write the result to CSV. After the loop, working_df holds the frame for the
# last radius in radious_list.
for radious in radious_list:
    working_df = unlabeled_data.copy()
    # Silence warnings only while the row-wise calculation runs;
    # catch_warnings() restores the previous filters on exit, so warnings from
    # later cells are no longer hidden for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tqdm.pandas(desc="Weighted Calculation for Radious " + str(radious))
        working_df[["weighted_index", "point_count", "index_range"]] = working_df.progress_apply(
            lambda row: calculate_weighted_index(row["center"], dhs_data, radious),
            axis=1,
            result_type='expand',
        )
    labeled_points = len(working_df[working_df["point_count"] > 0])
    unlabeled_points = len(working_df[working_df["point_count"] < 1])
    total_points = len(working_df)
    print("Radius {r}km - Labeled Points: {l} Unlabeled Points: {u} Percent Labeled: {p:.2f}% ".format(r=radious, l=labeled_points, u=unlabeled_points, p =100*labeled_points/total_points))
    print(working_df["point_count"].describe())
    working_df.to_csv("metadata_with_labels_{r}km_radious.csv".format(r=radious), index=False)

Weighted Calculation for Radious 10:   0%|          | 0/135347 [00:00<?, ?it/s]

Radius 10km - Labeled Points: 70099 Unlabeled Points: 65248 Percent Labeled: 51.79% 
count    135347.000000
mean          1.267099
std           3.014475
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          82.000000
Name: point_count, dtype: float64


In [17]:
# Keep only the tiles that received at least one survey point. The explicit
# copy makes the result an independent frame, so the column assignments in the
# following cells do not write to a slice of the unfiltered data.
working_df = working_df[working_df.point_count > 0].copy()

In [18]:
# Bin weighted_index into five quantile-based classes.
# label_name stores the interval of each bin; label stores the integer bin
# indicator (labels=False). Both calls must use the same q to stay aligned.
working_df['label_name'] = pd.qcut(working_df['weighted_index'], q=5)
working_df['label'] = pd.qcut(working_df['weighted_index'], q=5, labels=False)

In [19]:
# Preview the labelled tiles, including the new label_name and label columns.
working_df.head()

Unnamed: 0,filename,zone,center,lat_lon_bounds,utm_bounds,countries,partial_updated,Density,weighted_index,point_count,index_range,label_name,label
175,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.96131227363449, 37.419078384074126)","[(67.94851577927135, 37.429060237865194), (67....","BoundingBox(left=406970.0, bottom=4140750.0, r...",['Tajikistan'],False,1,41.409552,1.0,0.0,"(38.967, 51.183]",1
176,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.96159097900866, 37.3988907846013)","[(67.94879801958126, 37.408872755004325), (67....","BoundingBox(left=406970.0, bottom=4138510.0, r...",['Tajikistan'],False,1,41.409552,1.0,0.0,"(38.967, 51.183]",1
177,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.96186940656095, 37.3787031142581)","[(67.94907997853929, 37.38868520122264), (67.9...","BoundingBox(left=406970.0, bottom=4136270.0, r...",['Tajikistan'],False,1,41.409552,1.0,0.0,"(38.967, 51.183]",1
202,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.98388902125784, 37.62117176996337)","[(67.97106027372105, 37.63115526460104), (67.9...","BoundingBox(left=409210.0, bottom=4163150.0, r...",['Tajikistan'],False,1,42.701955,1.0,0.0,"(38.967, 51.183]",1
203,sentinel2_composite/transformed_data/42S/341-0...,42S,"(67.9841636642865, 37.600984719821135)","[(67.97133848392146, 37.61096832955942), (67.9...","BoundingBox(left=409210.0, bottom=4160910.0, r...",['Tajikistan'],False,1,42.701955,1.0,0.0,"(38.967, 51.183]",1


In [20]:
# Drop the storage prefix from every filename. regex=False forces a literal
# substring replacement, independent of the pandas version's default for regex.
working_df['filename'] = working_df['filename'].str.replace("sentinel2_composite/transformed_data/", "", regex=False)

In [21]:
# Distinct values of the countries column among the labelled tiles.
working_df.countries.unique()

array(["['Tajikistan']", "['Nepal']", "['Bangladesh']", "['Philippines']",
       "['Timor Leste']"], dtype=object)

In [22]:
# Distinct wealth-index intervals assigned by the quantile binning.
working_df.label_name.unique()

[(38.967, 51.183], (51.183, 62.681], (62.681, 72.68], (72.68, 95.092], (8.201, 38.967]]
Categories (5, interval[float64]): [(8.201, 38.967] < (38.967, 51.183] < (51.183, 62.681] < (62.681, 72.68] < (72.68, 95.092]]

In [23]:
# Count the tiles per (countries, label_name) combination.
distribution = (
    working_df
    .groupby(by=['countries', 'label_name'])[["weighted_index"]]
    .count()
)

In [24]:
# Show the per-country class counts.
distribution

Unnamed: 0_level_0,Unnamed: 1_level_0,weighted_index
countries,label_name,Unnamed: 2_level_1
['Bangladesh'],"(8.201, 38.967]",9809
['Bangladesh'],"(38.967, 51.183]",5448
['Bangladesh'],"(51.183, 62.681]",2600
['Bangladesh'],"(62.681, 72.68]",942
['Bangladesh'],"(72.68, 95.092]",459
['Nepal'],"(8.201, 38.967]",1956
['Nepal'],"(38.967, 51.183]",4651
['Nepal'],"(51.183, 62.681]",3409
['Nepal'],"(62.681, 72.68]",1486
['Nepal'],"(72.68, 95.092]",883


In [25]:
# Build a lookup table of the five quantile intervals by re-running the same
# qcut used for the label columns and taking its categories.
labels = pd.DataFrame(pd.qcut(working_df['weighted_index'], q=5).cat.categories.values, columns=["label_name"])

In [26]:
# Use the row position of each interval as its integer label.
# NOTE(review): presumably meant to match the codes stored in working_df['label'] -- confirm.
labels['label'] = labels.index

In [27]:
# Show the interval-to-label lookup table.
labels

Unnamed: 0,label_name,label
0,"(8.201, 38.967]",0
1,"(38.967, 51.183]",1
2,"(51.183, 62.681]",2
3,"(62.681, 72.68]",3
4,"(72.68, 95.092]",4


In [28]:
# Country held out as the test set. The value is compared against the countries
# column as a whole string, so it must match the stored list-style text exactly.
test_country = "['Nepal']"

In [29]:
# Boolean mask selecting the rows whose countries value equals test_country.
test_mask = working_df["countries"].eq(test_country)

In [30]:
# Test split: filename and label of every row selected by test_mask.
test_data = working_df.loc[test_mask, ['filename', 'label']]

In [31]:
# Show the test split.
test_data

Unnamed: 0,filename,label
30427,44R/403-0000023296-0000000000/44R_403-00000232...,1
30428,44R/403-0000023296-0000000000/44R_403-00000232...,1
30429,44R/403-0000023296-0000000000/44R_403-00000232...,1
30430,44R/403-0000023296-0000000000/44R_403-00000232...,1
30431,44R/403-0000023296-0000000000/44R_403-00000232...,1
...,...,...
72305,45R/404-0000046592-0000000000/45R_404-00000465...,1
72324,45R/404-0000046592-0000000000/45R_404-00000465...,1
72325,45R/404-0000046592-0000000000/45R_404-00000465...,1
72326,45R/404-0000046592-0000000000/45R_404-00000465...,1


In [32]:
# Training split: filename and label of every row not selected by test_mask.
train_data = working_df.loc[~test_mask, ['filename', 'label']]

In [33]:
# Show the training split.
train_data

Unnamed: 0,filename,label
175,42S/341-0000046592-0000000000/42S_341-00000465...,1
176,42S/341-0000046592-0000000000/42S_341-00000465...,1
177,42S/341-0000046592-0000000000/42S_341-00000465...,1
202,42S/341-0000046592-0000000000/42S_341-00000465...,1
203,42S/341-0000046592-0000000000/42S_341-00000465...,1
...,...,...
156636,52P/531-0000069888-0000000000/52P_531-00000698...,4
156637,52P/531-0000069888-0000000000/52P_531-00000698...,4
156638,52P/531-0000069888-0000000000/52P_531-00000698...,4
156647,52P/531-0000069888-0000000000/52P_531-00000698...,4


In [34]:
# Write the label lookup table and the two splits to CSV in the working directory.
# NOTE(review): the file names hard-code "nepal" while the split itself is driven
# by test_country -- keep the two in sync when changing the held-out country.
labels.to_csv("metadata.csv", index=False)
test_data.to_csv("test.nepal.csv",index=False)
train_data.to_csv("train.nepal.csv",index=False)