This notebook takes the LSMS survey clusters and finds the grid cell that each cluster falls in. It then compares the model's prediction for that grid cell with the known LSMS value for the cluster. Keep in mind that grid cells are 10km x 10km, so the centroid used to represent a grid cell may not be close to the cluster. The fact that a correlation is still present (about 0.42 in the last cell) suggests that the model fitted the grids reasonably well.

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point

In [2]:
# Load the grid shapefile; its 'geometry' column is what clusters are matched
# against in the point-in-polygon lookup further down.
df_geo = gpd.read_file('../data/malawi/grid/grid.shp')

In [3]:
# Load the per-cluster LSMS aggregates (cluster_lat / cluster_lon plus the
# surveyed values such as cluster_cellphones_pc).
clusters = pd.read_csv('../LSMS/output/malawi/cluster_data.csv')

In [7]:
def find_idx(point, polygons=None):
    """Return the position of the first polygon that contains ``point``.

    Parameters
    ----------
    point : object
        Geometry handed to each polygon's ``contains`` method (a shapely
        ``Point`` in this notebook).
    polygons : iterable, optional
        Polygons to search, in order. When omitted, ``df_geo['geometry']`` is
        used, so the returned position lines up with the row order of
        ``df_geo``.

    Returns
    -------
    int or float
        Zero-based position of the first containing polygon, or ``np.nan``
        when no polygon contains the point.
    """
    if polygons is None:
        # Fall back to the notebook-level grid so existing one-argument calls
        # (e.g. ``points.apply(find_idx)``) behave exactly as before.
        polygons = df_geo['geometry']

    # Linear scan; the first match wins.
    for i, poly in enumerate(polygons):
        if poly.contains(point):
            return i
    return np.nan

# One shapely Point per cluster; Point takes (x, y), so longitude comes first.
# (Point is imported in the notebook's import cell.)
points = clusters.apply(lambda row: Point(row.cluster_lon, row.cluster_lat), axis=1)

# Position of the grid cell containing each cluster; NaN where find_idx finds
# no containing cell.
clusters['geo_idx'] = points.apply(find_idx)

In [8]:
# Preview the clusters; geo_idx (added above) shows up as the last column.
clusters.head()

Unnamed: 0,cluster_lat,cluster_lon,cluster_persons_surveyed,cluster_annual_consumption_pc,cluster_annual_phone_consumption_pc,cluster_cellphones_pc,cluster_estimated_annual_phone_cost_pc,cluster_nightlights,geo_idx
0,-17.09515,35.217213,79,961.328026,47.627469,0.177215,428.481013,0.0,1039.0
1,-17.092351,35.114643,70,855.258482,3.189638,0.028571,32.571429,0.0,1001.0
2,-17.016698,35.079629,78,1058.34345,1.978659,0.025641,19.230769,0.0,960.0
3,-16.977243,35.205706,66,1127.493134,8.631155,0.045455,83.333333,0.121212,1038.0
4,-16.956385,35.168967,61,736.167585,5.081308,0.065574,49.180328,0.502674,1000.0


In [9]:
# Load the model's predictions; rows carry centroid_lat / centroid_lon, which
# are used below to match each prediction to a grid cell.
preds = pd.read_csv('../results/malawi/ridge_phone_density/predictions.csv')

In [10]:
# Positional id for every grid cell (matches the index returned by find_idx)
# and the cell's centroid.
df_geo['geo_idx'] = np.arange(len(df_geo))
df_geo['centroid'] = df_geo['geometry'].centroid

# Split the centroid into latitude (y) and longitude (x) columns.
centroids = df_geo['centroid']
df_geo['centroid_lat'] = centroids.apply(lambda pt: pt.y)
df_geo['centroid_lon'] = centroids.apply(lambda pt: pt.x)

# Make sure both frames hold the centroid coordinates as float64.
for frame in (preds, df_geo):
    for axis_name in ('lat', 'lon'):
        col = 'centroid_' + axis_name
        frame[col] = frame[col].astype(np.float64)

# Merging on raw floats is unreliable, so build integer keys instead:
# scale by 10000 (four decimal places) and cast to int, which drops the
# remaining fraction.
for frame in (df_geo, preds):
    for axis_name in ('lat', 'lon'):
        frame['merge_' + axis_name] = (10000 * frame['centroid_' + axis_name]).astype(int)

# Both frames are expected to have the same number of rows.
df_geo.shape, preds.shape

((1203, 17), (1203, 5))

In [11]:
# Attach each grid cell's prediction using the integer centroid keys. The
# float centroid columns of preds are dropped so they do not clash with the
# ones already on df_geo.
# validate='one_to_one' makes pandas raise a MergeError if the truncated keys
# are duplicated on either side, instead of silently multiplying rows.
merged = pd.merge(
    df_geo,
    preds.drop(['centroid_lat', 'centroid_lon'], axis=1),
    on=['merge_lat', 'merge_lon'],
    validate='one_to_one',
)

In [12]:
# Sanity check: the row count should still equal the number of grid cells,
# otherwise some grid cells found no prediction with matching keys.
merged.shape

(1203, 18)

In [15]:
# Keep only the columns needed for the comparison, discard clusters that were
# not matched to any grid cell (geo_idx is NaN), and turn geo_idx back into an
# integer so it can be joined against the grid's geo_idx.
clusters = (
    clusters[['cluster_lat', 'cluster_lon', 'cluster_cellphones_pc', 'geo_idx']]
    .dropna(subset=['geo_idx'])
    .astype({'geo_idx': int})
)

In [16]:
# Number of clusters that were matched to a grid cell.
clusters.shape

(773, 4)

In [17]:
# Attach the surveyed clusters to their grid cell; several clusters may share
# one cell, so each grid row can repeat.
# validate='one_to_many' requires geo_idx to be unique in `merged` before the
# join. Because this cell overwrites `merged`, re-running it would otherwise
# silently duplicate rows and create _x/_y columns; with the check, a re-run
# raises a MergeError as soon as any grid cell already holds several clusters.
merged = pd.merge(
    merged,
    clusters[['cluster_lat', 'cluster_lon', 'cluster_cellphones_pc', 'geo_idx']],
    on='geo_idx',
    validate='one_to_many',
)

In [18]:
# The row count should now equal the number of matched clusters.
merged.shape

(773, 21)

In [19]:
# Inspect grid attributes and predictions side by side with the cluster values.
merged.head()

Unnamed: 0,Shape_Leng,Shape_Area,ADM0_EN,ADM0_PCODE,ADM0_REF,ADM0ALT1EN,ADM0ALT2EN,date,validOn,validTo,...,geo_idx,centroid,centroid_lat,centroid_lon,merge_lat,merge_lon,predicted_phone_density_pc,cluster_lat,cluster_lon,cluster_cellphones_pc
0,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,10,POINT (32.89619 -13.53158),-13.531576,32.896189,-135315,328961,0.146435,-13.572201,32.859582,0.088235
1,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,10,POINT (32.89619 -13.53158),-13.531576,32.896189,-135315,328961,0.146435,-13.557205,32.897253,0.166667
2,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,13,POINT (32.90195 -13.78705),-13.787053,32.901946,-137870,329019,0.133649,-13.803408,32.901011,0.385714
3,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,13,POINT (32.90195 -13.78705),-13.787053,32.901946,-137870,329019,0.133649,-13.785994,32.860543,0.241935
4,31.909743,8.001588,Malawi,MW,,,,2018-09-03,2018-10-16,,...,28,POINT (32.98602 -13.53158),-13.531576,32.98602,-135315,329860,0.104337,-13.543769,32.993232,0.188679


In [23]:
# Correlation between the grid-level prediction and the surveyed cellphones
# per capita of the clusters inside that grid cell (Series.corr uses Pearson
# unless another method is passed).
merged['predicted_phone_density_pc'].corr(merged['cluster_cellphones_pc'])

0.422089201723246