This notebook uses the files created by the scripts in `data` and sets up everything the training script needs: the renamed image copies and the train/valid folders.

In [1]:
import pandas as pd
import numpy as np
import os

# load the three CSVs produced by the scripts in `data`, in a fixed order
df_mw_download, df_mw_download_info, df_mw_guide = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/mw_2016_download.csv',
        'data/mw_2016_download_info.csv',
        'data/mw_2016_guide.csv',
    )
)

In [None]:
def _image_name_from_coords(row):
    # per the author's note, images were named "<im_lat>_<im_lon>.png" when downloaded
    return '_'.join([str(row.im_lat), str(row.im_lon)]) + '.png'


if len(df_mw_download) == len(df_mw_download_info):
    # lengths agree, so the info frame is used as the download frame
    df_mw_download = df_mw_download_info
else:
    # this means the download script broke and was restarted,
    # so the image names are rebuilt from the coordinates instead
    df_mw_download['images'] = df_mw_download.apply(_image_name_from_coords, axis=1)

In [2]:
# The download frame has fewer rows than the guide because duplicate images
# were not downloaded.
tuple(frame.shape for frame in (df_mw_download, df_mw_guide))

((23373, 8), (43359, 7))

In [3]:
df_mw_guide.groupby(['clust_lat', 'clust_lon']).ngroups  # number of distinct (clust_lat, clust_lon) clusters

780

In [4]:
# attach the downloaded image filename to every guide row
# (inner join on the image coordinates)
df_sub = df_mw_download.loc[:, ['im_lat', 'im_lon', 'images']]
df_mw = df_mw_guide.merge(df_sub, on=['im_lat', 'im_lon'], how='inner')

In [5]:
# preview the merged frame: the guide columns plus the `images` filename column
df_mw.head(n=5)

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin,images
0,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.066666_35.174999.png
1,-17.116666,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.116666_35.174999.png
2,-17.1,35.183332,-17.09515,35.217213,0.0,2.039307,1,-17.1_35.183332.png
3,-17.125,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.125_35.199999.png
4,-17.133333,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.133333_35.199999.png


In [6]:
# Shapes before and after the merge. The merge is an inner join, so guide rows
# without a downloaded image would be dropped (the author notes that not every
# image was downloaded and that repeats were removed); in the recorded output
# the row counts match and only the `images` column was added.
tuple(frame.shape for frame in (df_mw_guide, df_mw))

((43359, 7), (43359, 8))

In [7]:
# build a lookup table with one row per (clust_lat, clust_lon) pair and give
# each pair a consecutive integer "cluster number" starting at 0
cluster_keys = ['clust_lat', 'clust_lon']
clust_group = df_mw.groupby(cluster_keys).first().reset_index()[cluster_keys]
clust_numbers = np.arange(len(clust_group))
clust_group = clust_group.assign(clust_num=clust_numbers)

In [8]:
# preview the cluster lookup table
clust_group.head(n=5)

Unnamed: 0,clust_lat,clust_lon,clust_num
0,-17.09515,35.217213,0
1,-17.092351,35.114643,1
2,-17.016698,35.079629,2
3,-16.977243,35.205706,3
4,-16.956385,35.168967,4


In [9]:
# attach each row's cluster number by joining on the cluster coordinates
df_mw = df_mw.merge(clust_group, on=['clust_lat', 'clust_lon'])
df_mw.head(n=5)

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin,images,clust_num
0,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.066666_35.174999.png,0
1,-17.116666,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.116666_35.174999.png,0
2,-17.1,35.183332,-17.09515,35.217213,0.0,2.039307,1,-17.1_35.183332.png,0
3,-17.125,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.125_35.199999.png,0
4,-17.133333,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.133333_35.199999.png,0


In [10]:
# (rows, columns) of the frame after adding `clust_num`
(len(df_mw.index), len(df_mw.columns))

(43359, 9)

I'm going to append `_<cluster number>` to each image's filename to show which cluster it comes from.

In [11]:
# Build "<image stem>_<clust_num>.png" with vectorized string operations instead
# of a row-wise apply. `.str[:-4]` drops the last four characters (the ".png"
# extension) exactly as the per-row slice did.
df_mw['images_renamed'] = (
    df_mw['images'].str[:-4] + '_' + df_mw['clust_num'].astype(str) + '.png'
)

In [12]:
# check that `images_renamed` is the image name with the cluster number appended
df_mw.head(n=5)

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin,images,clust_num,images_renamed
0,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.066666_35.174999.png,0,-17.066666_35.174999_0.png
1,-17.116666,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.116666_35.174999.png,0,-17.116666_35.174999_0.png
2,-17.1,35.183332,-17.09515,35.217213,0.0,2.039307,1,-17.1_35.183332.png,0,-17.1_35.183332_0.png
3,-17.125,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.125_35.199999.png,0,-17.125_35.199999_0.png
4,-17.133333,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.133333_35.199999.png,0,-17.133333_35.199999_0.png


In [13]:
# you can add other countries here under the structure images/<COUNTRY>/
# exist_ok=True lets the notebook be re-run without failing when the folder already exists
os.makedirs('../images/ims_mw/ims', exist_ok=True)

In [14]:
import shutil

In [15]:
# The download folder holds each image only once. This helper copies an image
# into the `ims` folder under its cluster-specific name, so an image listed for
# several rows ends up as several distinctly named files.
def create_im_renamed(x):
    """Copy one row's image into ../images/ims_mw/ims under its renamed filename.

    x: row-like object exposing `images` (source file name inside
       data/ims_malawi_2016) and `images_renamed` (target file name).
    Returns None.
    """
    source_path = 'data/ims_malawi_2016/{}'.format(x.images)
    target_path = '../images/ims_mw/ims/{}'.format(x.images_renamed)
    shutil.copy(source_path, target_path)

In [16]:
# copy one file per row; the displayed Series of None is just the helper's return values
df_mw.apply(create_im_renamed, axis='columns')

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
43329    None
43330    None
43331    None
43332    None
43333    None
43334    None
43335    None
43336    None
43337    None
43338    None
43339    None
43340    None
43341    None
43342    None
43343    None
43344    None
43345    None
43346    None
43347    None
43348    None
43349    None
43350    None
43351    None
43352    None
43353    None
43354    None
43355    None
43356    None
43357    None
43358    None
Length: 43359, dtype: object

In [17]:
# preview df_mw once more before it is written to disk
df_mw.head(n=5)

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin,images,clust_num,images_renamed
0,-17.125,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.125_35.174999.png,0,-17.125_35.174999_0.png
1,-17.133333,35.174999,-17.09515,35.217213,0.0,2.039307,1,-17.133333_35.174999.png,0,-17.133333_35.174999_0.png
2,-17.066666,35.191666,-17.09515,35.217213,0.0,2.039307,1,-17.066666_35.191666.png,0,-17.066666_35.191666_0.png
3,-17.05,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.05_35.199999.png,0,-17.05_35.199999_0.png
4,-17.1,35.199999,-17.09515,35.217213,0.0,2.039307,1,-17.1_35.199999.png,0,-17.1_35.199999_0.png


In [18]:
# save the full guide (cluster numbers and renamed image names included), without the index column
guide_csv_path = 'mw_full_guide.csv'
df_mw.to_csv(guide_csv_path, index=False)

Create train/valid folders

The model will use these directly.

In [19]:
import os, shutil
import numpy as np

In [20]:
# one entry per row of df_mw; 80% of the rows will be used for training
pic_list = list(df_mw['images'].values)
to_pick = int(len(pic_list) * 0.8)
to_pick

34687

In [21]:
# Fixed seed so that Restart & Run All reproduces the same train/valid split
# (the original draw was unseeded, so the split changed on every run).
SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)

inds = np.arange(len(pic_list))
# sample `to_pick` distinct row positions for training ...
train_ind = split_rng.choice(inds, to_pick, replace=False)
# ... and keep every remaining position for validation
valid_ind = np.delete(inds, train_ind)
# NOTE(review): the split is per row of df_mw. The same source image can be listed
# for more than one row (it is copied under several cluster-specific names), so
# copies of one image may land in both train and valid -- confirm this is acceptable.

In [22]:
# turn the name list into an array so both index sets can be applied directly
pic_list = np.asarray(pic_list)
train_im, valid_im = pic_list[train_ind], pic_list[valid_ind]

In [23]:
# exist_ok=False: this raises if a split folder already exists
# (presumably so files from an earlier split are never mixed in -- confirm)
for split_dir in ('../images/ims_mw/train', '../images/ims_mw/valid'):
    os.makedirs(split_dir, exist_ok=False)

In [24]:
# positional row selections of df_mw for the training (t) and validation (v) sets
t, v = df_mw.iloc[train_ind], df_mw.iloc[valid_ind]

In [25]:
def _copy_split(frame, split_dir):
    """Copy every image listed in `frame` into `split_dir/<nightlight_bin>/`.

    frame: DataFrame with `images_renamed` (file name inside ../images/ims_mw/ims)
           and `nightlight_bin` (used as the label sub-folder name).
    split_dir: destination root for this split.
    """
    # create each label folder once instead of once per copied image
    for label in frame['nightlight_bin'].unique():
        os.makedirs('{}/{}'.format(split_dir, label), exist_ok=True)
    for image_name, label in zip(frame['images_renamed'], frame['nightlight_bin']):
        shutil.copy('../images/ims_mw/ims/{}'.format(image_name), '{}/{}'.format(split_dir, label))


# same copy logic for both splits, so it lives in one helper
_copy_split(t, '../images/ims_mw/train')
_copy_split(v, '../images/ims_mw/valid')

In [26]:
# number of files in each label folder (1, 2, 3) under train
for bin_label in (1, 2, 3):
    bin_dir = '../images/ims_mw/train/{}'.format(bin_label)
    print(len(os.listdir(bin_dir)))

17175
13893
3619


In [27]:
# number of files in each label folder (1, 2, 3) under valid
for bin_label in (1, 2, 3):
    bin_dir = '../images/ims_mw/valid/{}'.format(bin_label)
    print(len(os.listdir(bin_dir)))

4167
3557
948
