# Merge OPM Survey Data with CNN Features

## Setup

In [9]:
# Load packages
library(tidyverse)

# Set up sagemaker
library(reticulate)
# S3 bucket that holds the project data and the local working directory
# used for downloaded and exported files.
bucket <- "worldbank-pakistan-data"
local_dir <- "/home/ec2-user/SageMaker/"

# Import the Python `sagemaker` module through reticulate and open a session;
# the session object is used for all S3 transfers below.
sagemaker <- import("sagemaker")
session <- sagemaker$Session()

## Load Data

In [10]:
# Copy the socioeconomic survey file from S3 into the local working directory.
survey_key <- "OPM/FinalData/Individual Datasets/bisp_socioeconomic.Rds"
session$download_data(path = local_dir, bucket = bucket, key_prefix = survey_key)

In [11]:
# Read the downloaded survey data from the local working directory.
survey_path <- file.path(local_dir, "bisp_socioeconomic.Rds")
opm_df <- readRDS(survey_path)

In [12]:
# Parameter names that identify each of the five CNN feature files.
# NOTE(review): the minNTLbinCount values differ between entries
# (16861, 16814, 1861) -- confirm they match the actual S3 file names.
param_names <- c(
  "Nbands3_nNtlBins3_minNTLbinCount16861",
  "Band1_nNtlBins3_minNTLbinCount16814",
  "Band5_nNtlBins3_minNTLbinCount1861",
  "Band6_nNtlBins3_minNTLbinCount1861",
  "Band7_nNtlBins3_minNTLbinCount1861"
)

# Read the bisp_cnn_features_all file for each parameter name straight from S3
# and collect the resulting data frames in a list, in the same order as
# `param_names`. lapply() builds the list in one step instead of growing it
# inside an index loop.
cnn_df <- lapply(param_names, function(param_name) {
  s3_key <- file.path(
    "OPM", "FinalData", "Individual Datasets",
    paste0("bisp_cnn_features_all_", param_name, "_2014.csv")
  )
  # Warnings from read_csv are deliberately silenced, as in the original code.
  suppressWarnings(read_csv(session$read_s3_file(bucket, s3_key)))
})

In [13]:
# Drop the row-names column (X1) from each dataset and keep each one in its
# own separate data frame.
df_RGB   <- select(cnn_df[[1]], -X1)
df_Band1 <- select(cnn_df[[2]], -X1)
df_Band5 <- select(cnn_df[[3]], -X1)
df_Band6 <- select(cnn_df[[4]], -X1)
df_Band7 <- select(cnn_df[[5]], -X1)

In [14]:
# Suffixes appended to the feature column names so that the identically named
# feature columns of the five models can be told apart after merging.
suffix <- c("_Nbands3", "_Band1", "_Band5", "_Band6", "_Band7")

# Feature tables in the same order as `suffix`, starting with the RGB features.
feature_dfs <- list(df_RGB, df_Band1, df_Band5, df_Band6, df_Band7)
stopifnot(length(feature_dfs) == length(suffix))

# Append `sfx` to every column of `df` except the merge key "uid".
add_suffix <- function(df, sfx) {
  is_feature <- colnames(df) != "uid"
  colnames(df)[is_feature] <- paste0(colnames(df)[is_feature], sfx)
  df
}

# Tag the columns *before* merging. This avoids relying on merge() to suffix
# only the clashing names and on a hard-coded column range (402:501) to patch
# up the last table, so the result stays correct even if the number of
# feature columns per table changes.
feature_dfs <- Map(add_suffix, feature_dfs, suffix)

# Merge the tables one by one on "uid"; only uids present in all five tables
# are kept (merge() defaults to an inner join).
cnn_df <- Reduce(function(x, y) merge(x, y, by = "uid"), feature_dfs)

## Merge

In [15]:
# Coerce the merge key to numeric in both tables so the key types agree
# (presumably the two sources store uid with different types -- verify).
opm_df$uid <- as.numeric(opm_df$uid)
cnn_df$uid <- as.numeric(cnn_df$uid)

# Left join: keep every survey row and attach CNN features where the uid matches.
opm_df <- merge(opm_df, cnn_df, by = "uid", all.x = TRUE, all.y = FALSE)

# Keep only 2014 survey rows that have a non-missing value for the
# cnn_feat_1_Nbands3 feature column.
opm_df <- opm_df %>%
  filter(!is.na(cnn_feat_1_Nbands3), year %in% 2014)

## Export

In [16]:
# Write the merged dataset to the local working directory (without row names)
# and upload that file to the "Merged Datasets" prefix of the S3 bucket.
merged_csv_path <- file.path(local_dir, "cnn_merge.csv")

write.csv(opm_df, merged_csv_path, row.names = FALSE)

session$upload_data(
  path       = merged_csv_path,
  bucket     = bucket,
  key_prefix = file.path("OPM", "FinalData", "Merged Datasets")
)