In [31]:
import pandas as pd
import numpy as np
import os
import geoio

In [32]:
data = pd.read_csv('../data/data.csv')

df_clusters = data
df_clusters.head()

Unnamed: 0,lon,lat,emission,filename
0,121.0875,18.6125,7.467103,18.6125_121.0875.jpg
1,120.779167,18.604167,6.376585,18.604166666666664_120.77916666666668.jpg
2,121.054167,18.604167,7.586672,18.604166666666664_121.05416666666667.jpg
3,121.0625,18.604167,8.018453,18.604166666666664_121.0625.jpg
4,121.070833,18.604167,7.548463,18.604166666666664_121.07083333333334.jpg


In [33]:
#img = geoio.GeoImage('../../data/ntl.tif')
img = geoio.GeoImage('../data/odiac2019_1km_excl_intl_1812.tif') ###THIS SHOULD BE THE NIGHTLIGHTS FILE, MAKE SURE TO CHANGE

# pass lon then lat
xPixel, yPixel = img.proj_to_raster(18.612500, 121.087500)

In [34]:
xPixel, yPixel

(-9932.499920536, -11060.499911512)

In [35]:
im_array = np.squeeze(img.get_data())
im_array.shape

(18000, 28800)

In [36]:
im_array[int(yPixel),int(xPixel)] # this is the nightlight value at the given coordinate

0.0

In [37]:
import math

def create_space(lat, lon):
    # these are pulled from the paper to make the 10km^2 area
    return lat - (180/math.pi)*(5000/6378137), lon - (180/math.pi)*(5000/6378137)/math.cos(lat), \
            lat + (180/math.pi)*(5000/6378137), lon + (180/math.pi)*(5000/6378137)/math.cos(lat)

In [38]:
cluster_nightlights = []
for i,r in df_clusters.iterrows():
    min_lat, min_lon, max_lat, max_lon = create_space(r.lat, r.lon)
    xminPixel, yminPixel = img.proj_to_raster(min_lon, min_lat)
    xmaxPixel, ymaxPixel = img.proj_to_raster(max_lon, max_lat)
    
    xminPixel, xmaxPixel = min(xminPixel, xmaxPixel), max(xminPixel, xmaxPixel)
    yminPixel, ymaxPixel = min(yminPixel, ymaxPixel), max(yminPixel, ymaxPixel)
    
    xminPixel, yminPixel, xmaxPixel, ymaxPixel = int(xminPixel), int(yminPixel), int(xmaxPixel), int(ymaxPixel)
    cluster_nightlights.append(im_array[yminPixel:ymaxPixel,xminPixel:xmaxPixel].mean())

In [39]:
df_clusters['nightlights'] = cluster_nightlights
df_clusters.head()

Unnamed: 0,lon,lat,emission,filename,nightlights
0,121.0875,18.6125,7.467103,18.6125_121.0875.jpg,0.05144
1,120.779167,18.604167,6.376585,18.604166666666664_120.77916666666668.jpg,0.030283
2,121.054167,18.604167,7.586672,18.604166666666664_121.05416666666667.jpg,0.050283
3,121.0625,18.604167,8.018453,18.604166666666664_121.0625.jpg,0.050607
4,121.070833,18.604167,7.548463,18.604166666666664_121.07083333333334.jpg,0.051067


In [40]:
df_clusters.to_csv ('../data/PH_data.csv', index = False, header=True)

In [41]:
test = df_clusters

In [44]:
from sklearn.mixture import GaussianMixture as GMM
X = test['nightlights'].values.reshape(-1,1)
gmm = GMM(n_components=3).fit(X)
labels = gmm.predict(test['nightlights'].values.reshape(-1,1))

In [45]:
(labels==0).mean(), (labels==1).mean(), (labels==2).mean()

(0.5825702992094466, 0.09060092064445112, 0.3268287801461023)

In [24]:
# these are the cutoff for the labels identified by the Guassian Mixture Model
label0_max = test['nightlights'][labels==0].max()
label1_max = test['nightlights'][labels==1].max()
label2_max = test['nightlights'][labels==2].max()

label0_max, label1_max, label2_max

(29.603918075561523, 3439.44189453125, 1809.2613525390625)

In [25]:
# I am going to hand reassign these to have better representation among all three classes
# these are not ideal distributions obviously but the model should still be able to learn
# something like a quantile cut might work better and be less arbitrary, but for reproducability 
# purposes I'll stick to the GMM-based approach
label0_max = 0.05
label1_max = 5
label2_max = 70

In [26]:
def query_df(df, lower, upper):
    return df[((lower <= df['nightlights']) & (df['nightlights'] < upper))]

print(len(query_df(test, 0, label0_max)) / len(test))
print(len(query_df(test, label0_max, label1_max)) / len(test))
print(len(query_df(test, label1_max, label2_max)) / len(test))

0.00020014009806864806
0.329743320324227
0.6055488842189533


In [27]:
def create_nightlights_bin(df, cutoffs):
    assert len(cutoffs) >= 2, print('need at least 2 bins')
    cutoffs = sorted(cutoffs, reverse=True)
    labels = list(range(len(cutoffs)))[::-1]
    df['nightlights_bin'] = len(cutoffs)
    for cutoff, label in zip(cutoffs, labels):
        df['nightlights_bin'].loc[df['nightlights'] <= cutoff] = label

df_download = test.copy()
create_nightlights_bin(df_download, cutoffs=[label0_max, label1_max, label2_max])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [28]:
# these should match above
(df_download['nightlights_bin']==0).mean(), (df_download['nightlights_bin']==1).mean(), (df_download['nightlights_bin']==2).mean()

(0.00020014009806864806, 0.329743320324227, 0.6055488842189533)

In [29]:
df_download['filename'] = data['filename']
df_download

Unnamed: 0,lon,lat,emission,filename,nightlights,nightlights_bin
0,121.087500,18.612500,7.467103,18.6125_121.0875.jpg,0.577604,1
1,120.779167,18.604167,6.376585,18.604166666666664_120.77916666666668.jpg,0.566483,1
2,121.054167,18.604167,7.586672,18.604166666666664_121.05416666666667.jpg,0.577604,1
3,121.062500,18.604167,8.018453,18.604166666666664_121.0625.jpg,0.577604,1
4,121.070833,18.604167,7.548463,18.604166666666664_121.07083333333334.jpg,0.577604,1
...,...,...,...,...,...,...
79939,119.770833,5.020833,9.871109,5.020833333333332_119.77083333333331.jpg,1.029708,1
79940,119.745833,5.012500,8.536774,5.012499999999998_119.74583333333334.jpg,0.933903,1
79941,119.754167,5.012500,9.345279,5.012499999999998_119.75416666666666.jpg,0.933903,1
79942,119.762500,5.012500,8.501098,5.012499999999998_119.7625.jpg,0.933903,1


In [30]:
df_download.to_csv('../data/image_download_locs.csv')