In [1]:
#!pip install tqdm geopy

In [2]:
import pandas as pd
import numpy as np
import boto3
from tqdm.notebook import trange, tqdm
from math import cos, radians, nan
import geopy.distance

from add_labels import add_bins, add_labels

In [3]:
# Source tables live under the project's S3 bucket; name the locations first
# so the two reads below are easy to scan.
meta_data_path = "s3://w210-poverty-mapper/modeling/metadata/source_data/meta_data_full_updated_density_new_full_value_LZ.csv"
dhs_data_path = "s3://w210-poverty-mapper/modeling/metadata/source_data/dhs_wealth_index_boundaries.csv"

# Image metadata (filtered later on its "Density" and "partial_updated" columns)
meta_data = pd.read_csv(meta_data_path)

# Survey data (filtered later on its "inside_boundaries" column)
dhs_data = pd.read_csv(dhs_data_path)

In [4]:
# Build one subset per minimum-density threshold, dropping partial images.
# The comparison against False is kept explicit (rather than `~`) so rows
# whose flag is missing or non-boolean are treated exactly as before.
not_partial = meta_data["partial_updated"] == False
density = meta_data["Density"]

meta_data_1d = meta_data[(density >= 1) & not_partial]
meta_data_50d = meta_data[(density >= 50) & not_partial]
meta_data_100d = meta_data[(density >= 100) & not_partial]

In [5]:
# Keep only the survey observations flagged as lying inside country boundaries
inside_mask = dhs_data["inside_boundaries"] == True
dhs_data_subset = dhs_data.loc[inside_mask]

In [6]:
# Add weighted average wealth index
#
# For every (distance, density subset) pair, label the metadata via
# `add_labels` and write the result to S3 as
#   labeled_data_<distance>k_<density>.csv
# e.g. labeled_data_2_5k_1d.csv, labeled_data_10k_100d.csv.
# The "k" suffix is required: the cell that loads these files back reads
# names of exactly this form, and without it a fresh run would write files
# that are never read.
distance_list = [2.5, 5, 10]  # presumably kilometres (hence the "k" tag) -- TODO confirm against add_labels
add_labels_dict = {"1d": meta_data_1d,
                   "50d": meta_data_50d,
                   "100d": meta_data_100d}

for distance in distance_list:
    # 2.5 -> "2_5k", 5 -> "5k", 10 -> "10k" (dots are not used in the file stem)
    distance_tag = str(distance).replace(".", "_") + "k"

    for key, value in add_labels_dict.items():
        labeled_data = add_labels(distance, value, dhs_data_subset)

        # Write full labeled data to s3
        filename = "labeled_data_" + distance_tag + "_" + key + ".csv"
        labeled_data.to_csv("s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/" + filename, index=False)

In [7]:
# Load labeled data from s3
# Paths are listed in (density, distance) order; the frames are read in that
# same order and bound to one name per file below.
labeled_data_paths = [
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_2_5k_1d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_5k_1d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_10k_1d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_2_5k_50d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_5k_50d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_10k_50d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_2_5k_100d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_5k_100d.csv",
    "s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/labeled_data_10k_100d.csv",
]

(
    labeled_data_2_5k_1d,
    labeled_data_5k_1d,
    labeled_data_10k_1d,
    labeled_data_2_5k_50d,
    labeled_data_5k_50d,
    labeled_data_10k_50d,
    labeled_data_2_5k_100d,
    labeled_data_5k_100d,
    labeled_data_10k_100d,
) = [pd.read_csv(path) for path in labeled_data_paths]

In [8]:
# Add bins
#
# For every bin type, class count and labeled dataset, call `add_bins` and
# write the result to S3 as
#   labels_only_<dataset>_<bin_type>_<num>[_<quantile>].csv
# The quantile suffix (the middle cut point, e.g. "0.2") is only present for
# the two-class case, which is the only case that passes a quantile list.
bin_types = ["across", "within"]
num_classes = [2, 5]
quantile_lists = [[0, 0.20, 1], [0, 0.50, 1]]

labeled_data_dict = {"2_5k_1d": labeled_data_2_5k_1d,
                     "5k_1d": labeled_data_5k_1d,
                     "10k_1d": labeled_data_10k_1d,
                     "2_5k_50d": labeled_data_2_5k_50d,
                     "5k_50d": labeled_data_5k_50d,
                     "10k_50d": labeled_data_10k_50d,
                     "2_5k_100d": labeled_data_2_5k_100d,
                     "5k_100d": labeled_data_5k_100d,
                     "10k_100d": labeled_data_10k_100d}

for bin_type in bin_types:
    for num in num_classes:
        # Only the two-class binning depends on the quantile list. For any
        # other class count `add_bins` is called with quantile_list=False and
        # the filename carries no quantile, so iterate exactly once instead of
        # recomputing and re-uploading identical files per quantile list.
        # NOTE(review): assumes add_bins does not modify its input frame
        # between calls -- confirm in add_labels.py.
        quantile_options = quantile_lists if num == 2 else [False]

        for quantile in quantile_options:
            for key, value in labeled_data_dict.items():
                binned_data = add_bins(value, bin_type, num, quantile_list=quantile)

                filename = "labels_only_" + key + "_" + bin_type + "_" + str(num)
                if num == 2:
                    filename += "_" + str(quantile[1])
                filename += ".csv"

                # Write labeled data with bins to s3
                binned_data.to_csv("s3://w210-poverty-mapper/modeling/metadata/labeled_metadata/" + filename, index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_only["label_name"] = pd.qcut(labels_only["weighted_index"], q=quantile_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_only ["label"] = pd.qcut(labels_only["weighted_index"], q=quantile_list, labels=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_only["label_name"] =