# Merge OPM Survey Data with CNN Features

## Setup

In [97]:
# Attach packages: tidyverse for data wrangling, reticulate for the Python bridge
library(tidyverse)
library(reticulate)

# Open a SageMaker session through the Python SDK; it handles the S3 transfers
sagemaker <- import("sagemaker")
session <- sagemaker$Session()

# S3 bucket that holds the project data, and the local working directory
bucket <- "worldbank-pakistan-data"
local_dir <- "/home/ec2-user/SageMaker/"

## Load Data

In [98]:
# Fetch the OPM socioeconomic survey file from S3 into the local directory
session$download_data(
  path = local_dir,
  bucket = bucket,
  key_prefix = "OPM/FinalData/Individual Datasets/bisp_socioeconomic.Rds"
)

In [99]:
# Read the downloaded survey data from the local copy
opm_df <- file.path(local_dir, "bisp_socioeconomic.Rds") %>%
  readRDS()

In [100]:
# Read the CNN feature file directly from S3 and parse its contents as CSV
cnn_df <- session$read_s3_file(
  bucket,
  file.path(
    "OPM", "FinalData", "Individual Datasets",
    "bisp_cnn_features_all_Nbands3_nNtlBins3_minNTLbinCount16861.csv"
  )
) %>%
  read_csv()

“Missing column names filled in: 'X1' [1]”


## Merge

In [101]:
# Drop the unnamed index column that read_csv auto-named on import.
# readr 1.x names it `X1`; readr >= 2.0 names it `...1`. Handle both so the
# cell does not error when `X1` is absent.
index_cols <- intersect(c("X1", "...1"), names(cnn_df))
cnn_df <- cnn_df[, setdiff(names(cnn_df), index_cols), drop = FALSE]

# Coerce the join key to numeric on both sides so the types match.
# NOTE(review): if `uid` is stored as a factor in either table, as.numeric()
# returns the level codes rather than the ids -- confirm the column types.
opm_df$uid <- as.numeric(opm_df$uid)
cnn_df$uid <- as.numeric(cnn_df$uid)

# Left join: keep every survey row and attach CNN features where a uid matches.
# TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
opm_df <- merge(opm_df, cnn_df, by = "uid", all.x = TRUE, all.y = FALSE)

# Keep only rows that have CNN features (cnn_feat_1 not missing) and that
# belong to the 2014 round.
opm_df <- opm_df %>%
  filter(
    !is.na(cnn_feat_1),
    year %in% 2014
  )

## Export

In [102]:
# Build the output path once so the file written and the file uploaded are
# guaranteed to be the same.
merged_csv_path <- file.path(local_dir, "cnn_merge.csv")

# Write the merged table locally without row names (FALSE, not the
# reassignable shorthand F).
write.csv(opm_df, merged_csv_path, row.names = FALSE)

# Upload the CSV to the merged-datasets prefix in the project bucket
session$upload_data(
  path = merged_csv_path,
  bucket = bucket,
  key_prefix = file.path("OPM", "FinalData", "Merged Datasets")
)
