In [1]:
import os
import pandas as pd

In [2]:
# Paths of the DHS label CSV and RGB image folder for every dataset split.
# Keys are '<split>_csv' and '<split>_img'.
DHS_DATA_DIR = {}
for split in ('train', 'val', 'test'):
    DHS_DATA_DIR.update({
        '%s_csv' % split: 'extracted/dhs_wealth_index_%s.csv' % split,
        '%s_img' % split: 'imgs/dhs_%s' % split,
    })

# Show the resolved paths, one "key : path" line per entry.
for key, path in DHS_DATA_DIR.items():
    print('%-16s:' % key, path)

train_csv       : extracted/dhs_wealth_index_train.csv
train_img       : imgs/dhs_train
val_csv         : extracted/dhs_wealth_index_val.csv
val_img         : imgs/dhs_val
test_csv        : extracted/dhs_wealth_index_test.csv
test_img        : imgs/dhs_test


## Cleaning up DHS data
We manually inspected the images in `dhs_train`, `dhs_val` and `dhs_test` and deleted those that were very noisy. The following code removes the corresponding samples from the infrared data folders and from the CSV files.

In [3]:
def clean_infra_folder(datafolder):
    """Delete infrared images whose RGB counterpart is no longer present.

    The infrared folder is found by appending ``'infra'`` directly to
    ``datafolder`` (no path separator is inserted), e.g. ``imgs/dhs_train``
    is paired with ``imgs/dhs_traininfra``. For each entry of the infrared
    folder, the expected RGB name is the entry name with every ``"_infra"``
    occurrence stripped. If ``datafolder`` has no entry with that name, the
    infrared file is removed from disk.

    Args:
        datafolder: Path of the RGB image folder.

    Side effects:
        Removes files from the infrared folder and prints a one-line summary
        with the number of entries kept and the number deleted.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.remove`` (for example
            ``FileNotFoundError`` when either folder does not exist).
    """
    match, notmatch = 0, 0
    infra_folder = datafolder + 'infra'
    # A set gives O(1) membership tests; scanning the plain listing for every
    # infrared file would make the whole pass quadratic in the folder size.
    rgb_names = set(os.listdir(datafolder))
    for x in os.listdir(infra_folder):
        rgb_name = x.replace("_infra", "")
        if rgb_name in rgb_names:
            match += 1
        else:
            notmatch += 1
            os.remove(os.path.join(infra_folder, x))
    print('match:%s. Deleted %s'%(str(match),str(notmatch)))

def clean_csv(split,dataname='dhs'):
    """Drop CSV rows whose image file no longer exists, rewriting the CSV in place.

    Reads ``extracted/<dataname>_wealth_index_<split>.csv`` and, for every
    row, checks whether ``imgs/<dataname>_<split>/<id>.png`` exists. Rows
    without an image are dropped and the CSV is overwritten with the rest.
    Both paths are relative to the current working directory.

    Args:
        split: Dataset split name used to build both paths
            (the notebook passes 'train', 'val' and 'test').
        dataname: Dataset prefix used to build both paths (default 'dhs').

    Side effects:
        Overwrites the CSV file and prints a one-line summary with the
        number of rows kept and the number deleted.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV has no ``id`` column.
    """
    csv_dir = 'extracted/%s_wealth_index_%s.csv'%(str(dataname),str(split))
    img_folder = os.path.join('imgs','%s_%s'%(str(dataname),str(split)))
    df = pd.read_csv(csv_dir, index_col=False)

    # pandas parses purely numeric ids as numbers; cast to str so building the
    # file name does not fail with a TypeError on `number + '.png'`.
    filenames = df['id'].astype(str) + '.png'

    # Boolean mask aligned with df: True where the row's image is on disk.
    keep = pd.Series(
        [os.path.exists(os.path.join(img_folder, name)) for name in filenames],
        index=df.index,
        dtype=bool,
    )
    match = int(keep.sum())
    notmatch = len(df) - match

    df.loc[keep].to_csv(csv_dir, index=False)
    print('match:%s. Deleted %s'%(str(match),str(notmatch)))

In [4]:
# Re-running this cell is harmless: a second pass simply reports "Deleted 0".
for split in ('train', 'val', 'test'):
    print('cleaning up infrared data DHS [%s]' % split)
    rgb_dir = DHS_DATA_DIR['%s_img' % split]
    clean_infra_folder(rgb_dir)

cleaning up infrared data DHS [train]
match:4335. Deleted 0
cleaning up infrared data DHS [val]
match:1835. Deleted 0
cleaning up infrared data DHS [test]
match:4370. Deleted 0


In [5]:
# Drop the DHS CSV rows whose image was deleted, one split at a time.
for split in ('train', 'val', 'test'):
    print('cleaning up csv data DHS [%s]' % split)
    clean_csv(split)

cleaning up csv data DHS [train]
match:4335. Deleted 0
cleaning up csv data DHS [val]
match:1835. Deleted 0
cleaning up csv data DHS [test]
match:4370. Deleted 0


## LSMS data clean up is similar

In [6]:
# Paths of the LSMS label CSV and RGB image folder for every dataset split.
# Keys are '<split>_csv' and '<split>_img'.
LSMS_DATA_DIR = {}
for split in ('train', 'val', 'test'):
    LSMS_DATA_DIR.update({
        '%s_csv' % split: 'extracted/lsms_wealth_index_%s.csv' % split,
        '%s_img' % split: 'imgs/lsms_%s' % split,
    })

# Show the resolved paths, one "key : path" line per entry.
for key, path in LSMS_DATA_DIR.items():
    print('%-16s:' % key, path)

train_csv       : extracted/lsms_wealth_index_train.csv
train_img       : imgs/lsms_train
val_csv         : extracted/lsms_wealth_index_val.csv
val_img         : imgs/lsms_val
test_csv        : extracted/lsms_wealth_index_test.csv
test_img        : imgs/lsms_test


In [7]:
# Remove LSMS infrared images whose RGB counterpart was deleted, per split.
for split in ('train', 'val', 'test'):
    print('cleaning up infrared data LSMS [%s]' % split)
    rgb_dir = LSMS_DATA_DIR['%s_img' % split]
    clean_infra_folder(rgb_dir)

cleaning up infrared data LSMS [train]
match:666. Deleted 0
cleaning up infrared data LSMS [val]
match:297. Deleted 0
cleaning up infrared data LSMS [test]
match:102. Deleted 0


In [8]:
# Drop the LSMS CSV rows whose image was deleted, one split at a time.
for split in ('train', 'val', 'test'):
    print('cleaning up csv data LSMS [%s]' % split)
    clean_csv(split, dataname='lsms')

cleaning up csv data LSMS [train]
match:666. Deleted 0
cleaning up csv data LSMS [val]
match:297. Deleted 0
cleaning up csv data LSMS [test]
match:102. Deleted 0
