This file extracts features from the images using the CNN and computes cluster-level aggregations.
<br>
<br>
Written by Jatin Mathur
<br>
5/2020

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
from PIL import Image

BASE_DIR = '..'
import sys
sys.path.append(BASE_DIR)
from config import TRAINING_CONFIG, RANDOM_SEED

In [2]:
COUNTRIES_DIR = os.path.join(BASE_DIR, 'data', 'countries')
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
RESULTS_DIR = os.path.join(BASE_DIR, 'results')

TYPE = TRAINING_CONFIG['TYPE']
COUNTRY = TRAINING_CONFIG['COUNTRY']
METRIC = TRAINING_CONFIG['METRIC']

CNN_TRAIN_IMAGE_DIR = os.path.join(BASE_DIR, 'data', 'cnn_images', TYPE, COUNTRY, METRIC)
CNN_SAVE_DIR = os.path.join(BASE_DIR, 'models', TYPE, COUNTRY, METRIC)

In [3]:
assert TYPE in ['single_country', 'country_held_out']
assert COUNTRY in ['malawi_2016', 'ethiopia_2015']
assert METRIC in ['house_has_cellphone', 'est_monthly_phone_cost_pc']

In [4]:
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(os.path.join(RESULTS_DIR, TYPE, COUNTRY), exist_ok=True)

# Feature extract with CNN

In [5]:
def load_country(country, metric):
    filepath = os.path.join(PROCESSED_DIR, TYPE, COUNTRY, f'{metric}.csv')
    df_images = pd.read_csv(filepath)
    return df_images

In [6]:
df_images = load_country(COUNTRY, METRIC)

In [7]:
df_images.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,house_has_cellphone,est_monthly_phone_cost_pc,country,nightlights,is_train,bin,near_lower,near_upper
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.09515,35.217213,0.5,0.819316,malawi_2016,0.025206,False,1,False,True
1,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.09515,35.217213,0.5,0.819316,malawi_2016,0.025206,False,1,False,True
2,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.09515,35.217213,0.5,0.819316,malawi_2016,0.025206,False,1,False,True
3,-17.050234235794026_35.17229723579403_-17.0951...,-17.050234,35.172297,-17.09515,35.217213,0.5,0.819316,malawi_2016,0.025206,False,1,False,True
4,-17.140065764205975_35.20224107859801_-17.0951...,-17.140066,35.202241,-17.09515,35.217213,0.5,0.819316,malawi_2016,0.025206,False,1,False,True


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} as backend')
model = torch.load(os.path.join(CNN_SAVE_DIR, f'trained_model_{METRIC}.pt'), map_location=device)

Using cuda as backend


In [9]:
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=4, bias=True)
)

In [10]:
# remove the final layers
model.classifier = model.classifier[:4]

In [11]:
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
)

In [12]:
input_size = 224
transformer = transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

# custom dataset for fast image loading and processing
# does not follow the usual style of folder -> folder for each class -> image
# we just want one folder with images
class ForwardPassDataset(torch.utils.data.Dataset):
    def __init__(self, image_dir, transformer):
        self.image_dir = image_dir
        self.image_list = os.listdir(self.image_dir)
        self.transformer = transformer

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, index):
        image_name = self.image_list[index]

        # Load image
        X = self.filename_to_im_tensor(self.image_dir + '/' + image_name)
        
        # dataloaders need to return a label, but for the forward pass we don't really care
        return X, -1
    
    def filename_to_im_tensor(self, file):
        im = (plt.imread(file)[:,:,:3] * 256).astype(np.uint8)
        im = Image.fromarray(im)
        im = self.transformer(im)
        return im
    
def run_forward_pass(model, df_images, mode):
    assert mode in ['train', 'valid']
    model.eval()  
    # shape of final array will be (num_images, 4096)
    # we also want to record the image each index represents
    feats = None
    if mode == 'train':
        feats = np.zeros(((df_images['is_train']).sum(), 4096))
    else:
        feats = np.zeros(((~df_images['is_train']).sum(), 4096))
    image_order = []
    i = 0
    # use the validation images to do the forward pass
    dataset = ForwardPassDataset(os.path.join(CNN_TRAIN_IMAGE_DIR, mode), transformer)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)
    image_order += dataset.image_list
    # forward pass for this class
    for inputs, _ in tqdm(dataloader):
        inputs = inputs.to(device)
        outputs = model(inputs)
        feats[i:i+len(inputs),:] = outputs.cpu().detach().numpy()
        i += len(inputs)
            
    forward_pass_df = pd.DataFrame.from_dict({'image_name': image_order, 'feat_index': np.arange(len(image_order))})
    return feats, forward_pass_df

In [13]:
feats_train, df_train = run_forward_pass(model, df_images, 'train')


HBox(children=(FloatProgress(value=0.0, max=1323.0), HTML(value='')))




In [14]:
feats_valid, df_valid = run_forward_pass(model, df_images, 'valid')


HBox(children=(FloatProgress(value=0.0, max=1950.0), HTML(value='')))




In [15]:
df_train = pd.merge(left=df_images, right=df_train, on='image_name', how='inner')

In [16]:
df_valid = pd.merge(left=df_images, right=df_valid, on='image_name', how='inner')

In [17]:
# have we maintained all images?
assert len(df_train) == (df_images['is_train']).sum()
assert len(df_valid) == (~df_images['is_train']).sum()

In [18]:
df_train

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,house_has_cellphone,est_monthly_phone_cost_pc,country,nightlights,is_train,bin,near_lower,near_upper,feat_index
0,3.410784802784024_39.47107816189403_3.45570056...,3.410785,39.471078,3.455701,39.515994,0.200000,0.528631,ethiopia_2015,0.0,True,0,False,False,8316
1,3.440728645588008_39.47107816189403_3.45570056...,3.440729,39.471078,3.455701,39.515994,0.200000,0.528631,ethiopia_2015,0.0,True,0,False,False,8464
2,3.470672488391992_39.47107816189403_3.45570056...,3.470672,39.471078,3.455701,39.515994,0.200000,0.528631,ethiopia_2015,0.0,True,0,False,False,2268
3,3.500616331195976_39.47107816189403_3.45570056...,3.500616,39.471078,3.455701,39.515994,0.200000,0.528631,ethiopia_2015,0.0,True,0,False,False,9020
4,3.410784802784024_39.50102200469801_3.45570056...,3.410785,39.501022,3.455701,39.515994,0.200000,0.528631,ethiopia_2015,0.0,True,0,False,False,9476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10572,14.500877358605976_39.52399813830597_14.455961...,14.500877,39.523998,14.455962,39.479082,0.818182,0.965952,ethiopia_2015,0.0,True,3,True,False,7076
10573,14.425024871568414_39.509897542288606_14.45596...,14.425025,39.509898,14.455962,39.479082,0.818182,0.965952,ethiopia_2015,0.0,True,3,True,False,10037
10574,14.452820255860756_39.5191250998834_14.4559615...,14.452820,39.519125,14.455962,39.479082,0.818182,0.965952,ethiopia_2015,0.0,True,3,True,False,8506
10575,14.489543167421289_39.52144552690058_14.455961...,14.489543,39.521446,14.455962,39.479082,0.818182,0.965952,ethiopia_2015,0.0,True,3,True,False,3193


In [19]:
df_valid

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,house_has_cellphone,est_monthly_phone_cost_pc,country,nightlights,is_train,bin,near_lower,near_upper,feat_index
0,-17.140065764205975_35.17229723579403_-17.0951...,-17.140066,35.172297,-17.095150,35.217213,0.5000,0.819316,malawi_2016,0.025206,False,1,False,True,1764
1,-17.11012192140199_35.17229723579403_-17.09515...,-17.110122,35.172297,-17.095150,35.217213,0.5000,0.819316,malawi_2016,0.025206,False,1,False,True,2962
2,-17.08017807859801_35.17229723579403_-17.09515...,-17.080178,35.172297,-17.095150,35.217213,0.5000,0.819316,malawi_2016,0.025206,False,1,False,True,11448
3,-17.050234235794026_35.17229723579403_-17.0951...,-17.050234,35.172297,-17.095150,35.217213,0.5000,0.819316,malawi_2016,0.025206,False,1,False,True,3699
4,-17.140065764205975_35.20224107859801_-17.0951...,-17.140066,35.202241,-17.095150,35.217213,0.5000,0.819316,malawi_2016,0.025206,False,1,False,True,15075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15595,-9.384751235794024_33.06703376420597_-9.429667...,-9.384751,33.067034,-9.429667,33.022118,0.4375,0.329179,malawi_2016,0.000448,False,1,False,False,1847
15596,-9.440666684984109_33.05955288189787_-9.429667...,-9.440667,33.059553,-9.429667,33.022118,0.4375,0.329179,malawi_2016,0.000448,False,1,False,False,197
15597,-9.395455352187438_33.0480824893378_-9.429667_...,-9.395455,33.048082,-9.429667,33.022118,0.4375,0.329179,malawi_2016,0.000448,False,1,False,False,5394
15598,-9.41656749551162_33.039123764843154_-9.429667...,-9.416567,33.039124,-9.429667,33.022118,0.4375,0.329179,malawi_2016,0.000448,False,1,False,False,11564


## Aggregate Features

In [20]:
def aggregate_features(df, feats, country, mode):
    assert mode in ['train', 'valid']
    group = df.groupby(['cluster_lat', 'cluster_lon'])
    x = np.zeros((len(group), 4096))
    cluster_list = [] # the corresponding clusters (lat, lon) to the x aggregate feature array
    for i, g in enumerate(group):
        lat, lon = g[0]
        im_sub = df[(df['cluster_lat'] == lat) & (df['cluster_lon'] == lon)].reset_index(drop=True)
        agg_feats = np.zeros((len(im_sub), 4096))
        for j, d in im_sub.iterrows():
            agg_feats[j,:] = feats[d.feat_index]
        agg_feats = agg_feats.mean(axis=0) # averages the features across all images in the cluster

        x[i,:] = agg_feats
        cluster_list.append([lat, lon])
        
    # save to the correct directory
    save_dir = os.path.join(RESULTS_DIR, TYPE, country, METRIC, 'cnn')
    os.makedirs(save_dir, exist_ok=True)
    print(f'saving to {save_dir}')
    np.save(os.path.join(save_dir, f'cluster_feats_{mode}_{METRIC}.npy'), x)
    pickle.dump(cluster_list, open(os.path.join(save_dir, f'cluster_order_{mode}_{METRIC}.pkl'), 'wb'))

In [21]:
aggregate_features(df_train, feats_train, COUNTRY, 'train')

saving to ../results/country_held_out/malawi_2016/house_has_cellphone/cnn


In [22]:
aggregate_features(df_valid, feats_valid, COUNTRY, 'valid')

saving to ../results/country_held_out/malawi_2016/house_has_cellphone/cnn
