Script to download "2016" (really 2018-2019 but using the images determined by the script that was run on the 2016 data) Malawi images

In [148]:
import pandas as pd
import numpy as np

In [149]:
!ls output/LSMS/malawi-2016/

Malawi-2016-LSMS-Cluster.csv   households.npy
Malawi-2016-LSMS-Household.csv lats.npy
candidate_download_locs.txt    lons.npy
consumptions.npy               nightlights.npy


In [150]:
df_mw = pd.read_csv('output/LSMS/malawi-2016/candidate_download_locs.txt', sep=' ', header=None, names=['im_lat', 'im_lon', 'clust_lat', 'clust_lon'])

In [151]:
df_mw.shape

(90943, 4)

# Create Datasets

Each country in this folder should have:
1. 'nightlights.npy'
2. 'consumptions.npy'
These are aggregated at a cluster level.

This function will add in these values at the cluster level for each image. That is, we now get a dataframe that has cluster nightlight and consumption values for each image.

In [152]:
def create_df(country, df_orig):
    c_nightlight = np.load('output/LSMS/{}/nightlights.npy'.format(country))
    c_consumption = np.load('output/LSMS/{}/consumptions.npy'.format(country))
    c_groups = df_orig.groupby(['clust_lat', 'clust_lon'])
    counts = c_groups.count()
    counts['nightlight'] = c_nightlight
    counts['consumption'] = c_consumption
    counts = counts.reset_index().drop(['im_lat', 'im_lon'], axis=1)
    df_c = pd.merge(left=df_orig, right=counts, on=['clust_lat', 'clust_lon'])
    return df_c

In [153]:
df_mw = create_df('malawi-2016', df_mw)

In [154]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption
0,-17.05,35.174999,-17.09515,35.217213,0.0,2.039307
1,-17.058333,35.174999,-17.09515,35.217213,0.0,2.039307
2,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307
3,-17.075,35.174999,-17.09515,35.217213,0.0,2.039307
4,-17.083333,35.174999,-17.09515,35.217213,0.0,2.039307


In [155]:
df_mw.shape

(90943, 6)

In [156]:
# most nightlights are 0
# let's download images that have nonzero nightlights to induce variety into the model
print((df_mw['nightlight'] == 0).mean())

0.4980042444168325


In [157]:
# let's drop 75% of 0 nightlight images...
def drop_0s(df, frac=0.9):
    z_inds = np.argwhere(df['nightlight'].values == 0).reshape(-1)
    drop = np.random.choice(z_inds, int(frac*len(z_inds))).reshape(-1)
    return df.drop(df.index[drop])

In [158]:
df_mw = drop_0s(df_mw, frac=0.9)

In [159]:
df_mw.shape, (df_mw['nightlight'] == 0).mean()

((64067, 6), 0.28741785942841086)

In [160]:
df_mw = drop_0s(df_mw, frac=0.9)
df_mw.shape, (df_mw['nightlight'] == 0).mean() # let's get it under 10%

((53140, 6), 0.1408919834399699)

In [161]:
df_mw = drop_0s(df_mw, frac=0.6)
df_mw.shape, (df_mw['nightlight'] == 0).mean() # this seems like a better ratio

((49797, 6), 0.08321786452999176)

In [162]:
(df_mw['nightlight'] < 1).mean() # still most data is under 1

0.4926602004136795

In [163]:
def drop_under(df, cutoff=1, frac=0.5):
    z_inds = np.argwhere(df['nightlight'].values <= cutoff).reshape(-1)
    drop = np.random.choice(z_inds, int(frac*len(z_inds))).reshape(-1)
    return df.drop(df.index[drop])

In [164]:
df_mw = drop_under(df_mw, cutoff=1, frac=0.3)

In [165]:
from sklearn.mixture import GaussianMixture as GMM
X = df_mw['nightlight'].values.reshape(-1,1)
gmm = GMM(n_components=3).fit(X)
labels = gmm.predict(df_mw['nightlight'].values.reshape(-1,1))

In [166]:
(labels==0).mean(), (labels==1).mean(), (labels==2).mean()

(0.5501484772449989, 0.37895076080200735, 0.07090076195299372)

In [167]:
df_mw['nightlight'][labels==0].max(), df_mw['nightlight'][labels==1].max(), df_mw['nightlight'][labels==2].max()

(1.9142857142857144, 14.809917355371907, 48.41322314049588)

In [171]:
# let's smudge these numbers a little to raise the percentage in class 2
# we need the distribution to be somewhat even
(df_mw['nightlight'] < 1.5).mean(), \
((df_mw['nightlight'] >= 1.5) & (df_mw['nightlight'] < 11)).mean(), \
(df_mw['nightlight'] >= 11).mean()

(0.493174650675629, 0.40169425197394165, 0.10513109735042932)

In [172]:
df_mw['nightlight_bin'] = (df_mw['nightlight'] < 1.5)*1 + \
                    ((df_mw['nightlight'] >= 1.5) & (df_mw['nightlight'] < 11))*2 + \
                    (df_mw['nightlight'] >= 11)*3

In [173]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin
9,-17.125,35.174999,-17.09515,35.217213,0.0,2.039307,1
10,-17.133333,35.174999,-17.09515,35.217213,0.0,2.039307,1
24,-17.066666,35.191666,-17.09515,35.217213,0.0,2.039307,1
33,-17.05,35.199999,-17.09515,35.217213,0.0,2.039307,1
39,-17.1,35.199999,-17.09515,35.217213,0.0,2.039307,1


In [174]:
df_mw.shape

(43441, 7)

In [175]:
df_mw.to_csv('mw_2016_guide.csv', index=False)

In [176]:
# we don't need to download repeat images, as that reduces our download size significantly
# some images can belong to 2+ clusters
print(df_mw.drop_duplicates(['im_lat', 'im_lon']).shape)

(23464, 7)


In [177]:
mw_download = df_mw.drop_duplicates(['im_lat', 'im_lon'])

In [178]:
mw_download.to_csv('mw_2016_download.csv', index=False)

# Download Images

Now we actually download images

In [1]:
"""Interface for downloading aerial imagery from Google Static Maps API.
- Get an API key at https://developers.google.com/maps/documentation/maps-static/intro
"""

import requests
from PIL import Image
from io import BytesIO

class ImageryDownloader:
    def __init__(self, access_token):
        """Initializes the object with an access token"""
        self.access_token = access_token
        self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}'
    
    def download_tile(self, lat, long, zoom):
        """Downloads lat long
        """
        res = requests.get(self.url.format(lat, long, zoom, self.access_token))
        image = Image.open(BytesIO(res.content))

        return image

In [6]:
access = None
with open('api_key.txt', 'r') as f:
    access = f.readlines()[0]
    
im_downloader = ImageryDownloader(access)

In [30]:
!mkdir ims_malawi_2016

In [None]:
im_names = []
zoom = 16
for i, r in mw_download.iterrows():
    lat = r.im_lat
    long = r.im_lon
    xtile, ytile = deg_to_tile(lat, long, zoom) # max zoom
    try:
        im = im_downloader.download_tile(xtile, ytile, zoom)
        name = str(lat) + '_' + str(long)
        im.save('ims_malawi_2016/{}.png'.format(name))
        im_names.append(name + '.png')
    except:
        im_names.append(np.nan)
    if i % 100 == 0:
        # the counting is kind of off for some reason
        print(i, end=', ')
        
mw_download['images'] = im_names
mw_download.to_csv('mw_2016_download_info.csv', index=False)

0, 200, 400, 800, 900, 1600, 1800, 2500, 3000, 3200, 3600, 3700, 4600, 4800, 5400, 6000, 6200, 6500, 6900, 7000, 7100, 7200, 7700, 7800, 8300, 8400, 8800, 9000, 9200, 9300, 10000, 10500, 11100, 11200, 11300, 11400, 11500, 11700, 11800, 11900, 