In [59]:
import geopandas as gpd
import pandas as pd
import numpy as np

In [60]:
# Load the grid shapefile; the rows' geometries are tested against cluster points in find_idx.
df_geo = gpd.read_file('../data/malawi/grid/grid.shp')

In [61]:
# Load the cluster table; later cells read its 'lat', 'lon' and 'cluster_hh_f34' columns.
clusters = pd.read_csv('cluster_data.csv')

In [62]:
def find_idx(point, polygons=None):
    """Return the position of the first polygon that contains ``point``.

    Parameters
    ----------
    point
        Object handed to each polygon's ``contains`` method.
    polygons : iterable, optional
        Objects exposing ``contains(point)``. When omitted, the module-level
        ``df_geo['geometry']`` is used, so existing one-argument calls such as
        ``points.apply(find_idx)`` behave exactly as before.

    Returns
    -------
    int or float
        Zero-based position (in iteration order) of the first polygon whose
        ``contains(point)`` is truthy, or ``np.nan`` when none matches.
    """
    if polygons is None:
        # Resolved at call time so the function always sees the current df_geo.
        polygons = df_geo['geometry']
    # next() stops at the first hit; np.nan is the "no containing polygon" marker.
    return next(
        (i for i, poly in enumerate(polygons) if poly.contains(point)),
        np.nan,
    )

from shapely.geometry.point import Point
def _row_to_point(row):
    """Build a shapely Point from one cluster row, passing lon first and lat second."""
    return Point(row.lon, row.lat)


# One Point per cluster row, aligned with clusters' index.
points = clusters.apply(_row_to_point, axis=1)

# Tag every cluster with the value find_idx returns for its point
# (np.nan when no polygon contains it).
clusters['geo_idx'] = points.apply(find_idx)

In [63]:
# Preview the first rows, including the 'geo_idx' column added above.
clusters.head()

Unnamed: 0,lat,lon,cluster_cons,cluster_phone_cons,cluster_hh_f34,cluster_hh_f35,cluster_phone_na,cluster_hh_f34_na,cluster_hh_f35_na,cluster_count,nightlights,geo_idx
0,-17.09515,35.217213,965.3362,235.16063,0.875,36.388459,0.0,0.0,0.5,16,0.0,1039.0
1,-17.092351,35.114643,1048.0758,13.954665,0.125,9.803922,0.0,0.0,0.875,16,0.0,1001.0
2,-17.016698,35.079629,1096.287,9.645964,0.125,6.449948,0.0,0.0,0.875,16,0.0,960.0
3,-16.977243,35.205706,1213.1265,35.603516,0.1875,15.766541,0.0,0.0,0.8125,16,0.121212,1038.0
4,-16.956385,35.168967,862.61523,19.372486,0.25,6.449948,0.0,0.0,0.75,16,0.502674,1000.0


In [64]:
# Load the predictions CSV; later cells use its 'centroid_lat', 'centroid_lon'
# and 'predicted_phone_density' columns.
preds = pd.read_csv('../results/malawi/ridge_phone_density/predictions.csv')

In [65]:
# Positional id per grid row; find_idx returns positions in the same order.
df_geo['geo_idx'] = np.arange(len(df_geo))
df_geo['centroid'] = df_geo['geometry'].centroid

# Split each centroid into float64 coordinate columns (y -> lat, x -> lon).
df_geo['centroid_lat'] = df_geo['centroid'].apply(lambda pt: pt.y).astype(np.float64)
df_geo['centroid_lon'] = df_geo['centroid'].apply(lambda pt: pt.x).astype(np.float64)

# Force the same dtype on the predictions side.
for coord_col in ('centroid_lat', 'centroid_lon'):
    preds[coord_col] = preds[coord_col].astype(np.float64)

# Join keys must be ints: scale the coordinates by 10000 and cast.
# NOTE(review): astype(int) truncates toward zero rather than rounding, so two
# values that straddle an integer after scaling get different keys -- confirm
# this is acceptable for the centroids stored in preds.
for frame in (df_geo, preds):
    frame['merge_lat'] = (10000 * frame['centroid_lat']).astype(int)
    frame['merge_lon'] = (10000 * frame['centroid_lon']).astype(int)

df_geo.shape, preds.shape

((1203, 17), (1203, 5))

In [66]:
# Attach predictions to grid cells through the integer centroid keys.
# preds' float centroid columns are dropped first so the result keeps a single
# centroid_lat/centroid_lon pair (from df_geo) instead of _x/_y duplicates.
# validate='one_to_one' makes pandas raise MergeError if either side carries
# duplicate (merge_lat, merge_lon) keys instead of silently multiplying rows.
merged = pd.merge(
    df_geo,
    preds.drop(columns=['centroid_lat', 'centroid_lon']),
    on=['merge_lat', 'merge_lon'],
    how='inner',
    validate='one_to_one',
)

In [67]:
# Check the row count after the join: pd.merge defaults to an inner join, so
# grid rows without a matching key in preds would be missing here.
merged.shape

(1203, 18)

In [68]:
# Keep only the columns needed downstream, drop clusters that fell outside
# every polygon (geo_idx is NaN), then cast geo_idx to int so it can be joined
# on. The cast is part of the chain rather than a separate column assignment,
# which avoids writing into a frame derived from a slice
# (SettingWithCopyWarning on some pandas versions).
clusters = (
    clusters[['lat', 'lon', 'cluster_hh_f34', 'geo_idx']]
    .dropna(subset=['geo_idx'])
    .astype({'geo_idx': int})
)

In [69]:
# Number of clusters left after dropping rows without a geo_idx.
clusters.shape

(773, 4)

In [70]:
# Attach cluster rows to their grid cell (inner join on 'geo_idx'); a cell can
# match several clusters, so this is one-to-many.
# This cell rebinds `merged` from itself, so it is not idempotent: on a second
# run `merged` already has repeated geo_idx values. validate='one_to_many'
# makes that re-run raise MergeError instead of silently producing a
# many-to-many join.
merged = pd.merge(
    merged,
    clusters[['lat', 'lon', 'cluster_hh_f34', 'geo_idx']],
    on='geo_idx',
    how='inner',
    validate='one_to_many',
)

In [71]:
# Row count after joining clusters onto grid cells (inner join on 'geo_idx').
merged.shape

(773, 21)

In [72]:
# Preview the combined grid / prediction / cluster rows.
merged.head()

Unnamed: 0,Shape_Leng,Shape_Area,ADM0_EN,ADM0_PCODE,ADM0_REF,ADM0ALT1EN,ADM0ALT2EN,date,validOn,validTo,...,geo_idx,centroid,centroid_lat,centroid_lon,merge_lat,merge_lon,predicted_phone_density,lat,lon,cluster_hh_f34
0,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,10,POINT (32.89619 -13.53158),-13.531576,32.896189,-135315,328961,0.436867,-13.572201,32.859582,0.375
1,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,10,POINT (32.89619 -13.53158),-13.531576,32.896189,-135315,328961,0.436867,-13.557205,32.897253,0.75
2,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,13,POINT (32.90195 -13.78705),-13.787053,32.901946,-137870,329019,0.533423,-13.803408,32.901011,1.6875
3,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,13,POINT (32.90195 -13.78705),-13.787053,32.901946,-137870,329019,0.533423,-13.785994,32.860543,0.9375
4,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,28,POINT (32.98602 -13.53158),-13.531576,32.98602,-135315,329860,0.510487,-13.543769,32.993232,0.625


In [84]:
# Predictions ordered by centroid latitude, then longitude (ascending).
(
    merged[['centroid_lat', 'centroid_lon', 'predicted_phone_density']]
    .sort_values(by=['centroid_lat', 'centroid_lon'])
)

Unnamed: 0,centroid_lat,centroid_lon,predicted_phone_density
604,-17.083036,35.142291,0.462886
645,-17.082668,35.231750,0.910419
566,-17.013450,35.078743,0.249965
603,-17.001294,35.144901,-0.012614
643,-16.997621,35.231649,0.948566
...,...,...,...
28,-9.660247,33.261537,0.260305
25,-9.571847,33.255562,0.524360
9,-9.571341,33.075232,0.583145
24,-9.509371,33.257111,0.204284


In [73]:
# Side-by-side view of the predicted value and the cluster-level
# 'cluster_hh_f34' value it is scored against below.
merged[['predicted_phone_density', 'cluster_hh_f34']]

Unnamed: 0,predicted_phone_density,cluster_hh_f34
0,0.436867,0.3750
1,0.436867,0.7500
2,0.533423,1.6875
3,0.533423,0.9375
4,0.510487,0.6250
...,...,...
768,0.983924,0.4375
769,0.189803,0.4375
770,0.999779,0.5000
771,0.474291,0.0625


In [76]:
from sklearn.metrics import r2_score

In [78]:
# R^2 with 'cluster_hh_f34' as y_true and 'predicted_phone_density' as y_pred.
# R^2 compares the raw values, so it can be near zero or negative even when
# the two columns are correlated (e.g. when they differ in scale or offset).
r2_score(merged['cluster_hh_f34'], merged['predicted_phone_density'])

-0.0011573543329623792

In [80]:
# Pearson correlation (the default method of Series.corr) between the same two columns.
merged['predicted_phone_density'].corr(merged['cluster_hh_f34'])

0.41017174906174