Script to download "2016" (really 2018-2019 but using the images determined by the script that was run on the 2016 data) Malawi images

In [44]:
import pandas as pd
import numpy as np

In [45]:
!ls output/LSMS/malawi_2016/

candidate_download_locs.txt  lons.npy
consumptions.npy	     Malawi-2016-LSMS-Cluster.csv
households.npy		     Malawi-2016-LSMS-Household.csv
lats.npy		     nightlights.npy


In [46]:
df_mw = pd.read_csv('output/LSMS/malawi-2016/candidate_download_locs.txt', sep=' ', header=None, names=['im_lat', 'im_lon', 'clust_lat', 'clust_lon'])

In [47]:
df_mw.shape

(90943, 4)

# Create Datasets

Each country in this folder should have:
1. 'nightlights.npy'
2. 'consumptions.npy'
These are aggregated at a cluster level.

This function will add in these values at the cluster level for each image. That is, we now get a dataframe that has cluster nightlight and consumption values for each image.

In [48]:
def create_df(country, df_orig):
    c_nightlight = np.load('output/LSMS/{}/nightlights.npy'.format(country))
    c_consumption = np.load('output/LSMS/{}/consumptions.npy'.format(country))
    c_groups = df_orig.groupby(['clust_lat', 'clust_lon'])
    counts = c_groups.count()
    counts['nightlight'] = c_nightlight
    counts['consumption'] = c_consumption
    counts = counts.reset_index().drop(['im_lat', 'im_lon'], axis=1)
    df_c = pd.merge(left=df_orig, right=counts, on=['clust_lat', 'clust_lon'])
    return df_c

In [49]:
df_mw = create_df('malawi-2016', df_mw)

In [50]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption
0,-17.05,35.174999,-17.09515,35.217213,0.0,2.039307
1,-17.058333,35.174999,-17.09515,35.217213,0.0,2.039307
2,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307
3,-17.075,35.174999,-17.09515,35.217213,0.0,2.039307
4,-17.083333,35.174999,-17.09515,35.217213,0.0,2.039307


In [51]:
df_mw.shape

(90943, 6)

In [52]:
# most nightlights are 0
# let's download images that have nonzero nightlights to induce variety into the model
print((df_mw['nightlight'] == 0).mean())

0.4980042444168325


In [53]:
# let's drop 75% of 0 nightlight images...
def drop_0s(df, frac=0.9):
    z_inds = np.argwhere(df['nightlight'].values == 0).reshape(-1)
    drop = np.random.choice(z_inds, int(frac*len(z_inds))).reshape(-1)
    return df.drop(df.index[drop])

In [54]:
df_mw = drop_0s(df_mw, frac=0.9)

In [55]:
df_mw.shape, (df_mw['nightlight'] == 0).mean()

((63960, 6), 0.2862257661038149)

In [56]:
df_mw = drop_0s(df_mw, frac=0.9)
df_mw.shape, (df_mw['nightlight'] == 0).mean() # let's get it under 10%

((53042, 6), 0.1393047019343162)

In [57]:
df_mw = drop_0s(df_mw, frac=0.6)
df_mw.shape, (df_mw['nightlight'] == 0).mean() # this seems like a better ratio

((49697, 6), 0.08137312111395054)

In [58]:
(df_mw['nightlight'] < 1).mean() # still most data is under 1

0.4916393343662595

In [59]:
def drop_under(df, cutoff=1, frac=0.5):
    z_inds = np.argwhere(df['nightlight'].values <= cutoff).reshape(-1)
    drop = np.random.choice(z_inds, int(frac*len(z_inds))).reshape(-1)
    return df.drop(df.index[drop])

In [60]:
df_mw = drop_under(df_mw, cutoff=1, frac=0.3)

In [61]:
from sklearn.mixture import GaussianMixture as GMM
X = df_mw['nightlight'].values.reshape(-1,1)
gmm = GMM(n_components=3).fit(X)
labels = gmm.predict(df_mw['nightlight'].values.reshape(-1,1))

In [62]:
(labels==0).mean(), (labels==1).mean(), (labels==2).mean()

(0.5492977236559884, 0.0710348485896815, 0.3796674277543301)

In [63]:
df_mw['nightlight'][labels==0].max(), df_mw['nightlight'][labels==1].max(), df_mw['nightlight'][labels==2].max()

(1.9142857142857144, 48.41322314049588, 14.809917355371907)

In [64]:
# let's smudge these numbers a little to raise the percentage in class 2
# we need the distribution to be somewhat even
(df_mw['nightlight'] < 1.5).mean(), \
((df_mw['nightlight'] >= 1.5) & (df_mw['nightlight'] < 11)).mean(), \
(df_mw['nightlight'] >= 11).mean()

(0.4922161488964229, 0.4024539311330981, 0.10532991997047902)

In [65]:
df_mw['nightlight_bin'] = (df_mw['nightlight'] < 1.5)*1 + \
                    ((df_mw['nightlight'] >= 1.5) & (df_mw['nightlight'] < 11))*2 + \
                    (df_mw['nightlight'] >= 11)*3

In [66]:
df_mw.shape

(43359, 7)

In [67]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin
2,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307,1
8,-17.116666,35.174999,-17.09515,35.217213,0.0,2.039307,1
17,-17.1,35.183332,-17.09515,35.217213,0.0,2.039307,1
42,-17.125,35.199999,-17.09515,35.217213,0.0,2.039307,1
43,-17.133333,35.199999,-17.09515,35.217213,0.0,2.039307,1


In [68]:
df_mw.to_csv('mw_2016_guide.csv', index=False)

In [69]:
# we don't need to download repeat images, as that reduces our download size significantly
# some images can belong to 2+ clusters
print(df_mw.drop_duplicates(['im_lat', 'im_lon']).shape)

(23373, 7)


In [70]:
mw_download = df_mw.drop_duplicates(['im_lat', 'im_lon'])

In [71]:
mw_download.to_csv('mw_2016_download.csv', index=False)

# Download Images

Now we actually download images

In [72]:
"""Interface for downloading aerial imagery from Google Static Maps API.
- Get an API key at https://developers.google.com/maps/documentation/maps-static/intro
"""

import requests
from PIL import Image
from io import BytesIO

class ImageryDownloader:
    def __init__(self, access_token):
        """Initializes the object with an access token"""
        self.access_token = access_token
        self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}'
    
    def download(self, lat, long, zoom):
        """Downloads lat long
        """
        res = requests.get(self.url.format(lat, long, zoom, self.access_token))
        image = Image.open(BytesIO(res.content))

        return image

In [73]:
access = None
with open('api_key.txt', 'r') as f:
    access = f.readlines()[0]
    
im_downloader = ImageryDownloader(access)

In [74]:
os.makedirs('ims_malawi_2016', exist_ok=False)

In [None]:
im_names = []
zoom = 16
for i, r in mw_download.iterrows():
    lat = r.im_lat
    long = r.im_lon
    try:
        im = im_downloader.download(lat, long, zoom)
        name = str(lat) + '_' + str(long)
        im.save('ims_malawi_2016/{}.png'.format(name))
        im_names.append(name + '.png')
    except:
        im_names.append(np.nan)
    if i % 100 == 0:
        # the counting is off because the indices from mw_download aren't continuous because we modified the dataframe
        print(i, end=', ')
        
mw_download['images'] = im_names
mw_download.to_csv('mw_2016_download_info.csv', index=False)

100, 300, 400, 