## Import Libraries ##

In [17]:
import os
import pandas as pd
import numpy as np
import rasterio
from shapely.geometry import Point
import geopandas as gpd
from tqdm import tqdm
from rasterio.features import geometry_window
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

## Define Filepaths ##

In [3]:
BASE_DIR = 'C:\\Users\\chris\\Desktop\\use-of-AI-to-eradicate-extreme-poverty'
SETTLEMENT_DIRS = [
    os.path.join(BASE_DIR, 'data/GlobalHumanSettlement/GHS_BUILT_S_E2020_GLOBE_R2023A_54009_100_V1_0_R11_C22.tif'),
    os.path.join(BASE_DIR, 'data/GlobalHumanSettlement/GHS_BUILT_S_E2020_GLOBE_R2023A_54009_100_V1_0_R12_C22.tif')
]
NIGHTLIGHTS_DIR = os.path.join(BASE_DIR, 'data', 'nightlights', 'viirs_2020_00N060W.tif')
ROADS_DIR = os.path.join(BASE_DIR, 'data', 'roads', 'GRIP4_density_total','grip4_total_dens_m_km2.asc')
HEALTH_DIR = os.path.join(BASE_DIR, 'data', 'health_facilities', 'healthcare_2020.shp')
LSMS_DIR = os.path.join(BASE_DIR, 'data', 'LSMS_2019')
csv_file_path = os.path.join(LSMS_DIR, 'HouseholdGeovariables_csv', 'householdgeovariables_ihs5.csv')

## Processs LSMS data and incoorporate information for roads, buildup areas and nightlights ##

In [5]:
def process_malawi():
    # Define paths to data
    lsms_dir = os.path.join(BASE_DIR, 'data', 'LSMS_2019')
    consumption_file = os.path.join(lsms_dir, 'consumption_aggregate', 'ihs5_consumption_aggregate.csv')
    geovariables_file = os.path.join(lsms_dir, 'HouseholdGeovariables_csv', 'householdgeovariables_ihs5.csv')
    
    # load consumption data
    df = pd.read_csv(consumption_file)
    df = df[['case_id', 'gap_poor']]  # Include only relevant columns

    # load geovariables data
    df_geo = pd.read_csv(geovariables_file)
    df_geo = df_geo[['case_id', 'ea_lat_mod', 'ea_lon_mod']]
    df_geo.rename(columns={'ea_lat_mod': 'cluster_lat', 'ea_lon_mod': 'cluster_lon'}, inplace=True)

    # mwrge datasets on 'case_id'
    df_combined = pd.merge(df, df_geo, on='case_id')

    # check for duplicate case_ids before dropping it
    duplicates = df_combined['case_id'].duplicated().sum()
    print(f"Number of duplicates in case_id: {duplicates}")

    # drop 'case_id' after checking for duplicates
    df_combined.drop('case_id', axis=1, inplace=True)
    df_combined.dropna(inplace=True)

    # aggregate data by cluster
    aggregation_functions = {
        'gap_poor': 'mean'
    }
    df_clusters = df_combined.groupby(['cluster_lat', 'cluster_lon']).agg(aggregation_functions).reset_index()

    return df_clusters[['cluster_lat', 'cluster_lon', 'gap_poor']]

df_mw = process_malawi()
print(df_mw)

Number of duplicates in case_id: 0
     cluster_lat  cluster_lon   gap_poor
0     -17.093531    35.253139  16.549712
1     -17.065680    35.166790  14.785578
2     -17.028139    35.241661  12.760279
3     -17.005220    35.082470  26.924220
4     -16.964531    35.208618  16.837666
..           ...          ...        ...
704    -9.524120    33.278580   8.104608
705    -9.514320    33.220600   3.447501
706    -9.510990    33.137482  11.820492
707    -9.398440    33.015339   8.899074
708     0.000000     0.000000   6.327292

[709 rows x 3 columns]


In [18]:
# function to create a buffer corcle around each point that corresponds to the aggregated questionnaire geolocations

def circle_from_point(lat, lon, distance_km=5):
    # calculate degrees from distance
    earth_radius = 6371  # Radius of the Earth in kilometers
    dLat = (distance_km / earth_radius) * (180 / math.pi)
    dLon = dLat / math.cos(math.radians(lat))

    # create a circle polygon for the given radius
    return shapely.geometry.Point(lon, lat).buffer(dLat)  # Returns a circle around the point

In [None]:
import os
import numpy as np
import rasterio
from rasterio.features import geometry_window
from shapely.geometry import Point
import geopandas as gpd
from tqdm import tqdm

import geopandas as gpd
from shapely.geometry import Point

def circle_from_point(lat, lon, radius_km, target_crs='EPSG:4326', buffer_crs=None):
    # Create a point GeoDataFrame
    point = Point(lon, lat)
    point_gdf = gpd.GeoDataFrame(index=[0], crs=target_crs, geometry=[point])
    
    # Determine an appropriate CRS for buffering if not provided
    if not buffer_crs:
        # Automatically determine UTM zone based on longitude
        utm_zone = int(1 + (lon + 180.0) / 6.0)
        hemi = 'N' if lat >= 0 else 'S'
        buffer_crs = f'EPSG:326{utm_zone}' if hemi == 'N' else f'EPSG:327{utm_zone}'
    
    # Convert to a projected CRS to perform the buffering
    point_gdf = point_gdf.to_crs(buffer_crs)
    
    # Perform the buffer in meters
    buffer = point_gdf.buffer(radius_km * 1000)  # Buffer radius in meters
    
    # Convert the buffer back to the original CRS
    buffer = buffer.to_crs(target_crs)
    
    return buffer.geometry[0]

# Example usage within your main function:
# Assuming you determine the radius and lat, lon as inputs somewhere in your code
# circle = circle_from_point(lat, lon, 2.5)


def calculate_average_value(subset, no_data_value):
    subset = np.ma.masked_equal(subset, no_data_value)
    if np.ma.is_masked(subset) and np.ma.count(subset) == 0:
        return no_data_value
    else:
        return np.ma.mean(subset)

def add_buildings_nightlights_roads_healthcare_values(df, buildings_tif_path, nightlights_tif_path, roads_tif_path, healthcare_tif_path, output_polygon_shapefile, output_point_shapefile):
    with rasterio.open(buildings_tif_path) as buildings_raster, \
         rasterio.open(nightlights_tif_path) as nightlights_raster, \
         rasterio.open(roads_tif_path) as roads_raster, \
         rasterio.open(healthcare_tif_path) as healthcare_raster:

        buildings_array = buildings_raster.read(1)
        nightlights_array = nightlights_raster.read(1)
        roads_array = roads_raster.read(1)
        healthcare_array = healthcare_raster.read(1)

        buildings_nodata = buildings_raster.nodata
        nightlights_nodata = nightlights_raster.nodata
        roads_nodata = roads_raster.nodata
        healthcare_nodata = healthcare_raster.nodata

        circles = []
        points = []
        for lat, lon in zip(df['cluster_lat'], df['cluster_lon']):
            circle = circle_from_point(lat, lon, 2.5)  # 2.5 km radius circle
            point = Point(lon, lat)
            circles.append(circle)
            points.append(point)

        gdf_polygons = gpd.GeoDataFrame(df, geometry=circles, crs="EPSG:4326")
        gdf_points = gpd.GeoDataFrame(df, geometry=points, crs="EPSG:4326")

        builtup_area = []
        nightlights_intensity = []
        road_density = []
        healthcare_access = []

        for geometry in tqdm(gdf_polygons.geometry, desc="Processing circles"):
            for raster, array, results, no_data_value in [
                (buildings_raster, buildings_array, builtup_area, buildings_nodata),
                (nightlights_raster, nightlights_array, nightlights_intensity, nightlights_nodata),
                (roads_raster, roads_array, road_density, roads_nodata),
                (healthcare_raster, healthcare_array, healthcare_access, healthcare_nodata)]:

                window = geometry_window(raster, [geometry], pad_x=0, pad_y=0, boundless=True)
                if window.width > 0 and window.height > 0:
                    subset = array[window.row_off:window.row_off + window.height, window.col_off:window.col_off + window.width]
                    average_value = calculate_average_value(subset, no_data_value)
                    results.append(average_value if average_value != no_data_value else 0)
                else:
                    results.append(0)

        gdf_polygons['builtup_area'] = builtup_area
        gdf_polygons['nightlights_intensity'] = nightlights_intensity
        gdf_polygons['road_density'] = road_density
        gdf_polygons['healthcare_access'] = healthcare_access

        gdf_points['builtup_area'] = builtup_area
        gdf_points['nightlights_intensity'] = nightlights_intensity
        gdf_points['road_density'] = road_density
        gdf_points['healthcare_access'] = healthcare_access

        gdf_polygons.to_file(output_polygon_shapefile)
        gdf_points.to_file(output_point_shapefile)

    return gdf_polygons, gdf_points

# Sample data paths
BASE_DIR = r'C:\Users\chris\Desktop\use-of-AI-to-eradicate-extreme-poverty'
buildings_tif_path = os.path.join(BASE_DIR, 'results', 'rasters', 'settlements_merged.tif')
nightlights_tif_path = os.path.join(BASE_DIR, 'data', 'nightlights', 'viirs_2020_00N060W.tif')
roads_tif_path = os.path.join(BASE_DIR, 'data', 'roads', 'GRIP4_density_total', 'grip4_total_dens_m_km2_4326.tif')
healthcare_tif_path = os.path.join(BASE_DIR, 'results', 'rasters', 'healthcare_density.tif')
output_point_shapefile = os.path.join(BASE_DIR, 'results', 'shapefiles', 'df_mw_points.shp')
output_polygon_shapefile = os.path.join(BASE_DIR, 'results', 'shapefiles', 'df_mw_polygons.shp')

# Assuming df_mw is already defined and contains necessary data
gdf_circles, df_result = add_buildings_nightlights_roads_healthcare_values(df_mw, buildings_tif_path, nightlights_tif_path, roads_tif_path, healthcare_tif_path, output_polygon_shapefile, output_point_shapefile)

# Print the result to verify
print(df_result.head())

In [23]:
print(df_result[['cluster_lat', 'cluster_lon', 'builtup_area', 'nightlights_intensity', 'road_density', 'gap_poor', 'healthcare_access']])

     cluster_lat  cluster_lon  builtup_area  nightlights_intensity  \
0     -17.093531    35.253139      3.753977               0.373688   
1     -17.065680    35.166790     51.892308               0.411656   
2     -17.028139    35.241661    147.284231               0.408638   
3     -17.005220    35.082470     46.378846               0.586200   
4     -16.964531    35.208618    255.686923               0.417670   
..           ...          ...           ...                    ...   
704    -9.524120    33.278580    199.118824               0.356244   
705    -9.514320    33.220600    243.123922               0.366759   
706    -9.510990    33.137482    154.129566               0.376157   
707    -9.398440    33.015339    155.272203               0.372930   
708     0.000000     0.000000           NaN                    NaN   

     road_density   gap_poor  healthcare_access  
0            83.5  16.549712           1.442405  
1            83.5  14.785578           2.068308  
2        

In [24]:
df_result.to_csv(os.path.join(BASE_DIR, 'results', 'dataframes', 'df_result.csv'), index=False)

In [25]:
# function to calculate a correlation matrix

def analyze_relationships(df):

    features = ['gap_poor', 'builtup_area', 'nightlights_intensity', 'road_density', 'healthcare_access']
    df_features = df[features]

    # calculate the correlation matrix
    correlation_matrix = df_features.corr()

    print("Correlation Matrix:")
    print(correlation_matrix)

    return correlation_matrix

# call function
correlation_matrix = analyze_relationships(df_result)
print(correlation_matrix)

Correlation Matrix:
                       gap_poor  builtup_area  nightlights_intensity  \
gap_poor               1.000000     -0.463781              -0.424101   
builtup_area          -0.463781      1.000000               0.917905   
nightlights_intensity -0.424101      0.917905               1.000000   
road_density          -0.386729      0.773211               0.800203   
healthcare_access     -0.140212      0.242120               0.307479   

                       road_density  healthcare_access  
gap_poor                  -0.386729          -0.140212  
builtup_area               0.773211           0.242120  
nightlights_intensity      0.800203           0.307479  
road_density               1.000000           0.265555  
healthcare_access          0.265555           1.000000  
                       gap_poor  builtup_area  nightlights_intensity  \
gap_poor               1.000000     -0.463781              -0.424101   
builtup_area          -0.463781      1.000000               0