## Check TF Record Full

Check that the TFRecord files are full: compare the number of observations stored in each file with the number of observations the survey data says it should contain. If a file has too few observations, delete it so that it can be re-extracted.

### Setup

In [1]:
import numpy as np
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert
import time
import glob

# Survey being checked; used below as a folder name when building data paths
SURVEY_NAME = 'DHS'
# Root directories, read from the local `config` module
DROPBOX_DIR = cf.DROPBOX_DIRECTORY
GOOGLEDRIVE_DIR = cf.GOOGLEDRIVE_DIRECTORY

  data = yaml.load(f.read()) or {}
  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
  defaults = yaml.load(f)


In [2]:
# Relies on VIIRs Values
# Load the survey observations; the cells below use its `uid` and
# `tfrecord_name` columns.
survey_df = pd.read_csv(
    os.path.join(
        DROPBOX_DIR,
        'Data',
        SURVEY_NAME,
        'FinalData',
        'Individual Datasets',
        'data_for_cnn.csv',
    )
)

In [3]:
# Folder holding the TFRecord files that are checked below
tf_path = os.path.join(
    GOOGLEDRIVE_DIR,
    'Data',
    SURVEY_NAME,
    'FinalData',
    'Individual Datasets',
    'cnn_s2',
    'tfrecords',
)

In [4]:
# Remove observations that had issues
#survey_df = survey_df[survey_df['GID_2'] != "IND.18.1_1"]

# uids to drop from the survey data before counting. Kept as one list so a
# new problem observation is a one-line addition and the frame is filtered in
# a single pass instead of once per uid.
UIDS_WITH_ISSUES = [
    # Colombia
    'CO201000003682',
    'CO201000003683',
    'CO201000004885',
    'CO201000002924',
    'CO201000002524',
    'CO201000001170',
    'CO201000003554',
    # Gabon / Guyana / Namibia
    'GA201200000310',
    'GY200900000293',
    'NM201300000001',
    # India
    'IA201400010076',
    'IA201400010147',
    'IA201400130192',
    'IA201400140845',
    'IA201400180011',
    'IA201400180012',
    'IA201400180027',
    'IA201400180028',
    'IA201400180030',
    'IA201400180040',
    'IA201400180047',
    'IA201400180048',
    'IA201400180050',
    'IA201400180052',
    'IA201400180055',
    'IA201400180058',
    'IA201400180062',
    'IA201400180064',
    'IA201400180072',
    'IA201400180079',
    'IA201400180080',
    'IA201400180081',
    'IA201400180083',
    'IA201400180086',
    'IA201400180091',
    'IA201400180100',
    'IA201400180104',
    'IA201400180112',
    'IA201400180116',
    'IA201400180123',
    'IA201400180133',
    'IA201400180138',
    'IA201400180140',
    'IA201400300002',
    'IA201400300032',
    # Uganda
    'UG201800000321',
    'UG201800000322',
    'UG201800000323',
    'UG201800000324',
    'UG201800000325',
    'UG201800000326',
    'UG201800000327',
    'UG201800000328',
    'UG201800000329',
    'UG201800000330',
    'UG201800000331',
    'UG201800000332',
    'UG201800000333',
    'UG201800000334',
    'UG201800000335',
    'UG201800000336',
    'UG201800000337',
    'UG201800000338',
    'UG201800000339',
    'UG201800000340',
    'UG201800000341',
    'UG201800000342',
]

# Keep every row whose uid is not listed above. Rows with a missing uid are
# kept, exactly as with the former chain of `!=` comparisons.
survey_df = survey_df[~survey_df['uid'].isin(UIDS_WITH_ISSUES)]


In [5]:
def decode_fn_uid(record_bytes):
    """Parse one serialized example, reading only its ``uid`` feature.

    Args:
        record_bytes: A single serialized example, as yielded by a
            ``tf.data.TFRecordDataset``.

    Returns:
        The result of ``tf.io.parse_single_example`` for a schema containing
        only ``"uid"`` (a scalar string feature); callers index it with
        ``['uid']``.
    """
    # Only the uid is needed to count observations, so no other feature is
    # declared in the schema.
    uid_schema = {"uid": tf.io.FixedLenFeature([], dtype=tf.string)}
    return tf.io.parse_single_example(record_bytes, uid_schema)

def extract_uid(TF_FILES):
    """Collect the ``uid`` of every example stored in a TFRecord file.

    Args:
        TF_FILES: Path of the TFRecord file to read. It is wrapped in a
            one-element list before being handed to ``TFRecordDataset``.

    Returns:
        A list with one entry per example: the ``.numpy()`` value of its
        ``uid`` feature (presumably ``bytes`` for a string feature - confirm).
        Its length is the number of examples in the file.
    """
    uid_dataset = tf.data.TFRecordDataset([TF_FILES]).map(decode_fn_uid)
    return [example['uid'].numpy() for example in uid_dataset]


In [6]:
# For every TFRecord named in the survey data that exists on disk, compare
# the number of survey rows assigned to it (n_survey) with the number of
# examples actually stored in the file (n_record).
COUNT_COLUMNS = ['tfrecord', 'n_survey', 'n_record']

# Rows are collected in a plain list and turned into a DataFrame once:
# growing a DataFrame inside the loop is quadratic, and DataFrame.append no
# longer exists in pandas >= 2.0.
count_rows = []

for record_i in survey_df.tfrecord_name.unique():

    record_path = os.path.join(tf_path, record_i)

    # Records that have not been extracted yet are skipped, not flagged
    if not os.path.isfile(record_path):
        continue

    n_survey = int((survey_df.tfrecord_name == record_i).sum())
    n_record = len(extract_uid(record_path))

    count_rows.append([record_i, n_survey, n_record])

# Passing the columns explicitly keeps `n_survey` / `n_record` available even
# when no TFRecord file was found, so the comparison in the next cell yields
# an empty frame instead of raising AttributeError.
df_all = pd.DataFrame(count_rows, columns = COUNT_COLUMNS)

In [7]:
# Flag every TFRecord whose stored count differs from the survey count
# (any mismatch is flagged, not only records with too few examples).
df_all_delete = df_all.loc[df_all['n_survey'].ne(df_all['n_record'])]

In [8]:
# Show the TFRecords whose counts disagree; these are the files removed in the "Delete Files" section
df_all_delete

Unnamed: 0,tfrecord,n_survey,n_record


### Delete Files

In [79]:
# Delete each flagged TFRecord from disk so it can be extracted again.
# This is destructive: review `df_all_delete` before running this cell.
for flagged_name in df_all_delete.tfrecord:
    flagged_path = os.path.join(tf_path, flagged_name)
    os.remove(flagged_path)

'/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_s2/tfrecords/nocnn_KM_1_1_all.tfrecord'