# Exploring different data files to see what they look like

In [3]:
# Standard Library
import warnings
import concurrent
from concurrent.futures import ThreadPoolExecutor
import math

# Third-party Libraries
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from IPython.display import set_matplotlib_formats
from scipy.stats import mode
from scipy.spatial import cKDTree
from geopy.distance import geodesic


# Rasterio
import rasterio
from rasterio.features import shapes
from rasterio.mask import mask

# Shapely
from shapely.geometry import box
from shapely.geometry import Point

# Linear Models
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects


# Ignore warnings
warnings.filterwarnings('ignore')

# set_matplotlib_formats('retina')

In [4]:
# # Remove the rows where urbanization equals 10
# data_filtered = data[data.urbanization != 10]

# # Check if there are any other urbanization categories that need to be filtered out
# # based on the provided list
# urban_categories = [30, 23, 22, 21, 13, 12, 11]
# data_filtered = data_filtered[data_filtered['urbanization'].isin(urban_categories)]

# # Filter out the years 2013 to 2019
# data_filtered = data_filtered[(data_filtered['year'] >= 2013) & (data_filtered['year'] <= 2019)]

# # Before plotting, let's check the structure again to ensure correctness
# data_filtered.head()

In [5]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Set the aesthetic style of the plots
# sns.set_style("whitegrid")

# # Prepare the boxplot with a trend line
# plt.figure(figsize=(14, 8))

# # Create a box plot
# sns.boxplot(x='urbanization', y='log_consumption', data=data_filtered, palette="deep")

# # Calculate the median values for each urbanization category for each year
# median_consumption = data_filtered.groupby(['urbanization', 'year'])['log_consumption'].median().reset_index()

# # Create a line plot with the median consumption trend
# sns.lineplot(x='year', y='log_consumption', data=median_consumption, 
#              palette='bright', marker='o')

# # Enhance the plot
# plt.title('Log of Consumption from 2013 to 2019 Across Different Urbanization Categories')
# plt.xlabel('Urbanization Category')
# plt.ylabel('Log of Residential Median Consumption')
# plt.legend(title='Year')

# # # Save the plot
# # plt_file_path = '/mnt/data/consumption_trends.png'
# # plt.savefig(plt_file_path)

# # Show the plot
# plt.show()

In [6]:
# plt.figure(figsize=(10, 6))
# sns.boxplot(x='urbanization', y='log_consumption', data=data)
# plt.title('Non Residential Consumption by Urbanization Level')
# plt.xlabel('Urbanization Level')
# plt.ylabel('Log of Consumption')
# plt.xticks(rotation=45)  # Rotate x labels for better visibility

# # Saving the figure with high resolution
# plt.savefig('Non Residential_Consumption_by_Urbanization.png', format='png', dpi=300)

# plt.show()


In [11]:
boundary_shp_file = "/gypsum/eguide/projects/ce8760/locations/rwanda_boundary/RWA_adm0.shp"

In [13]:
gpd.read_file(boundary_shp_file).crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

### Helper functions to clip tif files and process data into administrative regions

In [14]:
def get_admin_boundary(admin_level: str) -> str:
    """
    This function takes in the admin_level string and returns the file path to the shape file

    Input:
        - admin_level: A string indicating the admin level (district, sector, cell, village, or boundary)

    Returns:
        - String representing the shape file path
    """

    admin_paths = {
        "district": "/gypsum/eguide/projects/ce8760/locations/district/District.shp",
        "sector": "/gypsum/eguide/projects/ce8760/locations/sector/Sector.shp",
        "cell": "/gypsum/eguide/projects/ce8760/locations/cell/Cell.shp",
        "village": "/gypsum/eguide/projects/ce8760/locations/villages/Village.shp",
        "boundary": "/gypsum/eguide/projects/ce8760/locations/rwanda_boundary/RWA_adm0.shp",
    }

    admin_ids = {
        "district": "Dist_ID",
        "sector": "Sect_ID",
        "cell": "Cell_ID",
        "village": "Village_ID",
        "boundary": None,
    }

    # Use the provided admin_level to directly access the file path from the dictionary
    admin_path = admin_paths.get(admin_level.lower())
    admin_id = admin_ids.get(admin_level.lower())

    if admin_path is None:
        raise ValueError(f"Invalid admin_level: {admin_level}")

    return admin_path, admin_id

In [15]:
def clip_tif_chunk(chunk):
    tif_file, chunk_bbox = chunk

    with rasterio.open(tif_file) as src:
        # Create a GeoDataFrame with the chunk bounding box
        bbox_gdf = gpd.GeoDataFrame(geometry=[box(*chunk_bbox)], crs=src.crs)

        # Open the boundary shapefile
        boundary_gdf = gpd.read_file(get_admin_boundary("boundary")[0])
        boundary_gdf = boundary_gdf.to_crs(src.crs)

        # Intersect the bounding box with the boundary shapefile
        intersection = gpd.overlay(boundary_gdf, bbox_gdf, how='intersection')

        # Check if the intersection is empty
        if intersection.empty:
            print(f"No intersection for chunk: {chunk}")
            return gpd.GeoDataFrame()

        # Clip the raster to the intersection geometry
        clipped, transform = mask(src, shapes=intersection.geometry, crop=True)

        # Create a GeoDataFrame directly from the clipped raster
        shapes_gen = rasterio.features.shapes(clipped, transform=transform)
        features = [{'geometry': geometry, 'properties': {'pixel_value': value}}
                    for (geometry, value) in shapes_gen]
        gdf_clipped = gpd.GeoDataFrame.from_features(features, crs=src.crs)

        gdf_clipped = gdf_clipped.to_crs(("EPSG:4326"))

    return gdf_clipped


def parallel_clip_large_tif(tif_file, num_chunks=4):
    with rasterio.open(tif_file) as src:
        xmin, ymin, xmax, ymax = src.bounds
        chunk_width = (xmax - xmin) / num_chunks

        # Define chunks based on bounding box
        chunks = [(tif_file, (xmin + i * chunk_width, ymin, xmin + (i + 1) * chunk_width, ymax))
                  for i in range(num_chunks)]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(clip_tif_chunk, chunks))

    # Merge the results if needed
    print(results)
    final_result = gpd.GeoDataFrame(pd.concat(results, ignore_index=True), crs='EPSG:4326')

    return final_result

# # Example usage:
# tif_file = '/path/to/large.tif'
# clipped_data = parallel_clip_large_tif(tif_file)


In [16]:
def clip_tif_chunk(chunk):
    tif_file, chunk_bbox = chunk

    with rasterio.open(tif_file) as src:
        # Create a GeoDataFrame with the chunk bounding box
        bbox_gdf = gpd.GeoDataFrame(geometry=[box(*chunk_bbox)], crs=src.crs)

        # Open the boundary shapefile
        boundary_gdf = gpd.read_file(get_admin_boundary("boundary")[0])
        boundary_gdf = boundary_gdf.to_crs(src.crs)

        # Intersect the bounding box with the boundary shapefile
        intersection = gpd.overlay(boundary_gdf, bbox_gdf, how='intersection')

        # Check if the intersection is empty
        if intersection.empty:
            print(f"No intersection for chunk: {chunk}")
            return gpd.GeoDataFrame()

        # Clip the raster to the intersection geometry
        clipped, transform = mask(src, shapes=intersection.geometry, crop=True)

        # Create a GeoDataFrame directly from the clipped raster
        shapes_gen = rasterio.features.shapes(clipped, transform=transform)
        features = [{'geometry': geometry, 'properties': {'pixel_value': value}}
                    for (geometry, value) in shapes_gen]
        gdf_clipped = gpd.GeoDataFrame.from_features(features, crs=src.crs)

        gdf_clipped = gdf_clipped.to_crs(("EPSG:4326"))

    return gdf_clipped

def parallel_clip_large_tif(tif_file, num_chunks=4):
    with rasterio.open(tif_file) as src:
        xmin, ymin, xmax, ymax = src.bounds
        chunk_width = (xmax - xmin) / num_chunks

        # Define chunks based on bounding box
        chunks = [(tif_file, (xmin + i * chunk_width, ymin, xmin + (i + 1) * chunk_width, ymax))
                  for i in range(num_chunks)]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Filter out chunks with no intersection
        valid_chunks = [(tif, bbox) for tif, bbox in chunks if has_intersection(tif, bbox)]
        results = list(executor.map(clip_tif_chunk, valid_chunks))

    # Merge the results if needed
    final_result = gpd.GeoDataFrame(pd.concat(results, ignore_index=True), crs='EPSG:4326')

    return final_result

def has_intersection(tif_file, bbox):
    with rasterio.open(tif_file) as src:
        bbox_gdf = gpd.GeoDataFrame(geometry=[box(*bbox)], crs=src.crs)
        boundary_gdf = gpd.read_file(get_admin_boundary("boundary")[0])
        boundary_gdf = boundary_gdf.to_crs(src.crs)
        intersection = gpd.overlay(boundary_gdf, bbox_gdf, how='intersection')
        return not intersection.empty

### Helper function for computing metrics in administrative boundaries

In [17]:
# def compute_administrative_metric(gdf: gpd.GeoDataFrame, admin_level: str, summary_method: str ="median") -> gpd.GeoDataFrame:
#     """
#     This function takes in a geopandas dataframe of an index spread across pixels, 
#     and localizes it to the region under study (i.e. sector, cell, or village).

#     Inputs:
#         - gdf: Geopandas Geodataframe containing the index being measured
#         - admin_level: This is just a string showing if the admin level is 
#             sector, cell, or village

#     Returns:
#         - gpd.GeoDataFrame with median calculations for 
#     """

#     # Define a helper function for mode calculation
#     def calc_mode(x):
#         # Calculate mode, returns mode value and count. We only need the value here
#         m = mode(x)[0]
#         # Handle potential multiple modes by returning the first one
#         return m[0]

#     # Get the file path and identifier for the specified admin level
#     admin_path, admin_id = get_admin_boundary(admin_level=admin_level)

#     # Read the admin shapefile
#     admin_shp = gpd.read_file(admin_path)

#     # Set CRS of admin_shp to EPSG:4326
#     admin_shp = admin_shp.to_crs("EPSG:4326")

#     # Use GeoPandas sjoin for intersection
#     index_summary = gpd.sjoin(admin_shp, gdf, how="inner", op='intersects')

#     # Define aggregation method dynamically based on input
#     if summary_method == "mode":
#         agg_method = calc_mode
#     else:
#         # Use the string method directly for "median" or "mean"
#         agg_method = summary_method

#     # Calculate median and retain the first geometry in case of multiple intersections
#     index_summary = index_summary.groupby([admin_id]).agg({
#         "pixel_value": agg_method,
#         "geometry": "first" 
#     }).reset_index()

#     # Create GeoDataFrame with necessary columns and CRS
#     index_summary = gpd.GeoDataFrame(index_summary[[admin_id, "pixel_value", "geometry"]],
#                                      geometry="geometry", crs="EPSG:4326")

#     return index_summary


In [18]:
def compute_administrative_metric(gdf: gpd.GeoDataFrame, admin_level: str, summary_method: str = "median") -> gpd.GeoDataFrame:
    """
    This function takes a GeoPandas DataFrame containing an index spread across pixels, 
    and aggregates it to a higher administrative level such as sector, cell, or village.

    Args:
        gdf: GeoPandas GeoDataFrame containing the index values.
        admin_level: A string indicating the administrative level ('sector', 'cell', or 'village').

    Returns:
        GeoDataFrame with aggregated values based on the specified summary method.
    """

    # Define a helper function for mode calculation
    def calc_mode(x):
        # Calculate mode using scipy, but ensure it's treated as array-like
        modes = mode(x, keepdims=True)  # `keepdims` ensures consistency across versions
        if modes.count.size == 0 or modes.mode.size == 0:
            return None  # or np.nan or some fallback value
        
        mode_value = modes.mode[0]
        
        if mode_value == 10:
            # Exclude 10 and try again
            new_modes = mode(x[x != mode_value], keepdims=True)
            return new_modes.mode[0] if new_modes.count.size > 0 else mode_value
        
        return mode_value

    # Get the file path and identifier for the specified admin level
    admin_path, admin_id = get_admin_boundary(admin_level=admin_level)

    # Read the admin shapefile
    admin_shp = gpd.read_file(admin_path)

    # Set CRS of admin_shp to EPSG:4326
    admin_shp = admin_shp.to_crs("EPSG:4326")

    # Use GeoPandas sjoin for spatial join by intersection
    index_summary = gpd.sjoin(admin_shp, gdf, how="inner", predicate='intersects')

    # Define aggregation method dynamically based on input
    if summary_method == "mode":
        agg_method = calc_mode
    else:
        # Use the string method directly for "median" or "mean"
        agg_method = summary_method

    # Aggregate data and retain the first geometry in case of multiple intersections
    index_summary = index_summary.groupby([admin_id]).agg({
        "pixel_value": agg_method,
        "geometry": "first" 
    }).reset_index()

    # Create a new GeoDataFrame with the necessary columns and CRS
    index_summary = gpd.GeoDataFrame(index_summary[[admin_id, "pixel_value", "geometry"]],
                                     geometry="geometry", crs="EPSG:4326")

    return index_summary


### Ploting functions

In [19]:
def plot_geopandas(gdf, column, figsize=(10, 8), cmap="Reds", 
                             colorbar_title="Colorbar Title", plot_title="plot title",
                             boundary_shp_file="/gypsum/eguide/projects/ce8760/locations/rwanda_boundary/RWA_adm0.shp"):
    """
    This function plots a geopandas shapefile to show the disdribution of an attribute

    Parameters:
        - gdf: GeoPandas GeoDataFrame to be plotted.
        - column: Name of the column in the GeoDataFrame to be used for coloring.
        - figsize: Tuple specifying the figure size (default is (10, 8)).
        - cmap: Colormap to be used for coloring (default is "viridis").

    Returns:
        - None
    """
    # Set the style using seaborn
    sns.set_theme(style="ticks", palette="pastel")
    sns.set(font_scale=0.7, rc={"figure.dpi":300, 'savefig.dpi':300})
    # sns.set(font="Verdana", font_scale=0.7, rc={"figure.dpi":300, 'savefig.dpi':300})

    # Load the boundary shapefile and reproject to EPSG:4326
    boundary_gdf = gpd.read_file(boundary_shp_file).to_crs("EPSG:4326")
    
    # Reproject the main GeoDataFrame to EPSG:4326
    gdf = gdf.to_crs("EPSG:4326")

    # Create the plot
    fig, ax = plt.subplots(figsize=figsize)

    # Plot the boundary line (adjust linewidth and color)
    boundary_gdf.boundary.plot(ax=ax, linewidth=0.5, color="black")

    # Plot the GeoDataFrame with the specified column for coloring
    gdf.plot(column=column, cmap=cmap, linewidth=0.2, ax=ax, edgecolor="0.5")

    # Remove axis labels and boundary box
    ax.set_axis_off()

    # Show the colormap legend on the side with 4 decimal places and without the boundary box
    cax = fig.add_axes([0.92, 0.2, 0.05, 0.2])  # Adjust the position as needed
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=gdf[column].min(), vmax=gdf[column].max()))
    sm._A = []  # fake up the array of the scalar mappable
    cbar = fig.colorbar(sm, cax=cax, format="%.1f", drawedges=False)  # format="%.4f" for 4 decimal places

    # Set the title for the colorbar
    cbar.set_label(colorbar_title, rotation=270, labelpad=15)

    # Add title to the bottom of the plot
    # plt.suptitle(plot_title, x=0.5, y=0.05, fontsize=12, fontname="Verdana",
    #              ha='center', va='bottom')
    
    plt.suptitle(plot_title, x=0.5, y=0.05, fontsize=12, ha='center', va='bottom')

    # Show the plot
    plt.show()


In [20]:
def plot_geopandas_grid(gdf, years, figsize=(12, 12), cmap="Reds", colorbar_title="Colorbar Title", 
                        boundary_shp_file="/gypsum/eguide/projects/ce8760/locations/rwanda_boundary/RWA_adm0.shp", filename=None):
    """
    This function plots a geopandas GeoDataFrame on a 3x3 grid, showing the distribution of attributes for the specified years.

    Parameters:
        - gdf: GeoPandas GeoDataFrame with columns ['administrative_boundary', '2012', '2013', ..., '2020', 'geometry'].
        - years: List of years to be plotted (e.g., ['2013', '2014', ..., '2019']).
        - figsize: Tuple specifying the figure size (default is (15, 15)).
        - cmap: Colormap to be used for coloring (default is "Reds").
        - colorbar_title: Title for the colorbar.
        - boundary_shp_file: Path to the shapefile containing the boundary.
        - filename: Optional filename to save the plot as an SVG file.

    Returns:
        - None
    """
    # Load the boundary shapefile and reproject to EPSG:4326
    boundary_gdf = gpd.read_file(boundary_shp_file).to_crs("EPSG:4326")
    
    # Reproject the main GeoDataFrame to EPSG:4326
    gdf = gdf.to_crs("EPSG:4326")

    # Set the style using seaborn
    sns.set_theme(style="ticks", palette="pastel")
    # sns.set(font="Verdana", font_scale=0.7, rc={"figure.dpi": 300, 'savefig.dpi': 300})
    sns.set(font_scale=0.7, rc={"figure.dpi": 300, 'savefig.dpi': 300})

    # Create the subplots
    fig, axes = plt.subplots(3, 3, figsize=figsize, sharex=True, sharey=True)
    
    # Plot each year on the grid
    for i, year in enumerate(years):
        row, col = divmod(i, 3)
        ax = axes[row, col]

        # Plot the boundary line (adjust linewidth and color)
        boundary_gdf.boundary.plot(ax=ax, linewidth=0.5, color="black")

        # Plot the GeoDataFrame with the specified column for coloring
        gdf.plot(column=year, cmap=cmap, linewidth=0.2, ax=ax, edgecolor="0.5", legend=False)

        # Remove axis labels and boundary box
        ax.set_axis_off()

        # Add title
        ax.set_title(year, fontsize=10)

    # Create a common colorbar for the last column
    cax = fig.add_axes([0.95, 0.15, 0.02, 0.7])  # Adjust the position and size as needed
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=gdf[years].min().min(), vmax=gdf[years].max().max()))
    sm._A = []  # fake up the array of the scalar mappable
    cbar = fig.colorbar(sm, cax=cax, orientation="vertical", format="%d", drawedges=False)  # Remove decimals
    cbar.set_label(colorbar_title, labelpad=15)

    # Hide axes for the last row
    for ax in axes[-1, :-2]:
        ax.axis('off')

    # Remove the empty subplot in the last row
    fig.delaxes(axes[-1, -1])
    fig.delaxes(axes[-1, -2])

    # Save or display the plot
    if filename:
        plt.savefig(filename, format="png", bbox_inches="tight")
    else:
        plt.show()

# Example usage
# plot_geopandas_grid(gdf, years, figsize=(10, 10), cmap="Reds", colorbar_title="Colorbar Title", filename="output_plot.svg")
# or
# plot_geopandas_grid(gdf, years, figsize=(10, 10), cmap="Reds", colorbar_title="Colorbar Title")


In [21]:
def plot_geopandas_grid(gdf, years, figsize=(12, 12), cmap="Reds", colorbar_title="Colorbar Title", 
                        boundary_shp_file="/gypsum/eguide/projects/ce8760/locations/rwanda_boundary/RWA_adm0.shp", filename=None):
    """
    This function plots a geopandas GeoDataFrame on a 3x3 grid, showing the distribution of attributes for the specified years.

    Parameters:
        - gdf: GeoPandas GeoDataFrame with columns ['administrative_boundary', '2012', '2013', ..., '2020', 'geometry'].
        - years: List of years to be plotted (e.g., ['2013', '2014', ..., '2019']).
        - figsize: Tuple specifying the figure size (default is (15, 15)).
        - cmap: Colormap to be used for coloring (default is "Reds").
        - colorbar_title: Title for the colorbar.
        - boundary_shp_file: Path to the shapefile containing the boundary.
        - filename: Optional filename to save the plot as an SVG file.

    Returns:
        - None
    """
    # Load the boundary shapefile and reproject to EPSG:4326
    boundary_gdf = gpd.read_file(boundary_shp_file).to_crs("EPSG:4326")
    
    # Reproject the main GeoDataFrame to EPSG:4326
    gdf = gdf.to_crs("EPSG:4326")

    # Set the style using seaborn
    sns.set_theme(style="ticks", palette="pastel")
    sns.set(font_scale=0.7, rc={"figure.dpi": 300, 'savefig.dpi': 300})
    # sns.set(font="Verdana", font_scale=0.7, rc={"figure.dpi": 300, 'savefig.dpi': 300})

    # Create the subplots
    fig, axes = plt.subplots(3, 3, figsize=figsize, sharex=True, sharey=True)
    
    # Plot each year on the grid
    for i, year in enumerate(years):
        row, col = divmod(i, 3)
        ax = axes[row, col]

        # Plot the boundary line (adjust linewidth and color)
        boundary_gdf.boundary.plot(ax=ax, linewidth=0.5, color="black")

        # Plot the GeoDataFrame with the specified column for coloring
        gdf.plot(column=year, cmap=cmap, linewidth=0.2, ax=ax, edgecolor="0.5", legend=False)

        # Remove axis labels and boundary box
        ax.set_axis_off()

        # Add title
        ax.set_title(year, fontsize=10)

    # Create a common colorbar for the last column
    cax = fig.add_axes([0.95, 0.15, 0.02, 0.7])  # Adjust the position and size as needed
    vmin, vmax = gdf[years].min().min(), gdf[years].max().max()
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    sm._A = []  # fake up the array of the scalar mappable
    cbar = fig.colorbar(sm, cax=cax, orientation="vertical", format="%d", drawedges=False)  # Remove decimals
    cbar.set_label(colorbar_title, labelpad=15)

    # Set colorbar ticks
    cbar.set_ticks([vmin, (vmin + vmax) / 2, vmax])

    # Hide axes for the last row
    for ax in axes[-1, :-2]:
        ax.axis('off')

    # Remove the empty subplot in the last row
    fig.delaxes(axes[-1, -1])
    fig.delaxes(axes[-1, -2])

    # Save or display the plot
    if filename:
        plt.savefig(filename, format="png", bbox_inches="tight")
    else:
        plt.show()

# Example usage
# plot_geopandas_grid(gdf, years, figsize=(10, 10), cmap="Reds", colorbar_title="Colorbar Title", filename="output_plot.svg")
# or
# plot_geopandas_grid(gdf, years, figsize=(10, 10), cmap="Reds", colorbar_title="Colorbar Title")
""

''

## Helper functions for calculating distance to amenities

In [22]:
import pandas as pd
from scipy.spatial import cKDTree
from shapely.geometry import Point

def calculate_min_distances(combined_data, marketplace_geojson):
    # Extract relevant columns from combined_data
    meters_data = combined_data[['meter_serial_number', 'geometry', 'transaction_date']].copy()

    # Extract relevant columns from marketplace_geojson
    marketplaces = marketplace_geojson[['id', 'geometry']].copy()

    # Convert polygon geometries to centroids
    marketplaces['geometry'] = marketplaces['geometry'].apply(lambda geom: geom.centroid if geom.type == 'Polygon' else geom)

    # Create KD-Tree for marketplaces
    marketplace_tree = cKDTree(marketplaces['geometry'].apply(lambda geom: (geom.x, geom.y)).tolist())

    # Calculate all distances for each meter
    meter_coords = meters_data['geometry'].apply(lambda geom: (geom.x, geom.y)).tolist()
    _, indices = marketplace_tree.query(meter_coords, k=1)
    distances = marketplaces.loc[indices, 'geometry'].apply(lambda geom: geom.distance(Point(coord)) for coord in meter_coords)

    # Create a dataframe to store meter_serial_number, geometry, and distance_to_marketplace
    distances_df = meters_data[['meter_serial_number', 'geometry']].copy()

    # Create columns for each month in the transaction date
    months = meters_data['transaction_date'].dt.strftime('%Y-%m').unique()
    distances_df = pd.concat([distances_df] + [pd.Series(name=month) for month in months], axis=1)

    # Update distances_df with the calculated distances
    distances_df[months] = distances.values.reshape((-1, 1))

    return distances_df

# # Example usage
# combined_data = pd.read_pickle('path/to/combined_data.pkl')
# marketplace_geojson = gpd.read_file('path/to/marketplace.geojson')
# result_df = calculate_min_distances(combined_data, marketplace_geojson)
# result_df.to_csv('path/to/output.csv', index=False)


## Getting all datasets needed

### Prepaid electricity transactions as consumption data

### Atlas AI asset wealth

In [23]:
# Asset Wealth

asset_wealth_2016_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Asset-Wealth-1912m_0_08_RWA_2016.tif")
asset_wealth_2017_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Asset-Wealth-1912m_0_08_RWA_2017.tif")
asset_wealth_2018_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Asset-Wealth-1912m_0_08_RWA_2018.tif")
asset_wealth_2019_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Asset-Wealth-1912m_0_08_RWA_2019.tif")
asset_wealth_2020_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Asset-Wealth-1912m_0_08_RWA_2020.tif")

asset_wealth_2016_shp = asset_wealth_2016_shp[asset_wealth_2016_shp["pixel_value"]<3]
asset_wealth_2017_shp = asset_wealth_2017_shp[asset_wealth_2017_shp["pixel_value"]<3]
asset_wealth_2018_shp = asset_wealth_2018_shp[asset_wealth_2018_shp["pixel_value"]<3]
asset_wealth_2019_shp = asset_wealth_2019_shp[asset_wealth_2019_shp["pixel_value"]<3]
asset_wealth_2020_shp = asset_wealth_2020_shp[asset_wealth_2020_shp["pixel_value"]<3]

village_asset_wealth_2016 = compute_administrative_metric(asset_wealth_2016_shp, "village")
village_asset_wealth_2017 = compute_administrative_metric(asset_wealth_2017_shp, "village")
village_asset_wealth_2018 = compute_administrative_metric(asset_wealth_2018_shp, "village")
village_asset_wealth_2019 = compute_administrative_metric(asset_wealth_2019_shp, "village")
village_asset_wealth_2020 = compute_administrative_metric(asset_wealth_2020_shp, "village")

### Atlas AI spending

In [24]:
# Spending

spending_2016_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Spending-1912m_0_12_RWA_2016.tif")
spending_2017_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Spending-1912m_0_12_RWA_2017.tif")
spending_2018_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Spending-1912m_0_12_RWA_2018.tif")
spending_2019_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Spending-1912m_0_12_RWA_2019.tif")
spending_2020_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/atlas_ai_files/Rwanda_Spending-1912m_0_12_RWA_2020.tif")

spending_2016_shp = spending_2016_shp[spending_2016_shp["pixel_value"]<1000]
spending_2017_shp = spending_2017_shp[spending_2017_shp["pixel_value"]<1000]
spending_2018_shp = spending_2018_shp[spending_2018_shp["pixel_value"]<1000]
spending_2019_shp = spending_2019_shp[spending_2019_shp["pixel_value"]<1000]
spending_2020_shp = spending_2020_shp[spending_2020_shp["pixel_value"]<1000]

village_spending_2016 = compute_administrative_metric(spending_2016_shp, "village")
village_spending_2017 = compute_administrative_metric(spending_2017_shp, "village")
village_spending_2018 = compute_administrative_metric(spending_2018_shp, "village")
village_spending_2019 = compute_administrative_metric(spending_2019_shp, "village")
village_spending_2020 = compute_administrative_metric(spending_2020_shp, "village")

### Degree of Urbanization

In [25]:
# Degree of Urbanization

urban_2010_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_SMOD_E2010_GLOBE_R2023A_54009_1000_V1_0.tif")
urban_2015_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_SMOD_E2015_GLOBE_R2023A_54009_1000_V1_0.tif")
urban_2020_shp = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V1_0.tif")

urban_2010_shp = urban_2010_shp[urban_2010_shp["pixel_value"]>0]
urban_2015_shp = urban_2015_shp[urban_2015_shp["pixel_value"]>0]
urban_2020_shp = urban_2020_shp[urban_2020_shp["pixel_value"]>0]

village_urban_2010 = compute_administrative_metric(urban_2010_shp, "village", summary_method="mode")
village_urban_2015 = compute_administrative_metric(urban_2015_shp, "village", summary_method="mode")
village_urban_2020 = compute_administrative_metric(urban_2020_shp, "village", summary_method="mode")

### Building heights

In [26]:
building_height_2018 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_built_H_2018.tif")
building_height_2018 = building_height_2018[building_height_2018["pixel_value"]>0]

village_building_height_2018 = compute_administrative_metric(building_height_2018, "village")

### Building Volume

In [27]:
# building_volume_2010 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_Built_V_2010.tif")
# building_volume_2015 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_Built_V_2015.tif")
# building_volume_2020 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/european_commision/GHS_Built_V_2020.tif")


In [28]:
# village_building_volume_2010 = compute_administrative_metric(building_volume_2010, "village")
# village_building_volume_2015 = compute_administrative_metric(building_volume_2015, "village")
# village_building_volume_2020 = compute_administrative_metric(building_volume_2020, "village")

In [29]:
# village_building_volume_2010.to_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2010.pkl")
# village_building_volume_2015.to_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2015.pkl")
# village_building_volume_2020.to_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2020.pkl")

# How to read
# a = pd.read_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2020.pkl")

In [30]:
village_building_volume_2010 = pd.read_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2010.pkl")
village_building_volume_2015 = pd.read_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2015.pkl")
village_building_volume_2020 = pd.read_pickle("/gypsum/eguide/projects/ce8760/european_commision/village_building_volume_2020.pkl")

### Population

In [97]:
# population_grid_2010 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/Rwanda_100m_Population/RWA_pph_2010_adj_v2.tif")
# population_grid_2015 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/Rwanda_100m_Population/RWA_pph_2015_adj_v2.tif")
# population_grid_2020 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/Rwanda_100m_Population/RWA_pph_2020_adj_v2.tif")

In [None]:
# village_population_grid_2010 = compute_administrative_metric(population_grid_2010, "village")
# village_population_grid_2015 = compute_administrative_metric(population_grid_2015, "village")
# village_population_grid_2020 = compute_administrative_metric(population_grid_2020, "village")

In [None]:
# population
population_2012 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2012.tif")
population_2013 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2013.tif")
population_2014 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2014.tif")
population_2015 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2015.tif")
population_2016 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2016.tif")
population_2017 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2017.tif")
population_2018 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2018.tif")
population_2019 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2019.tif")
population_2020 = parallel_clip_large_tif("/gypsum/eguide/projects/ce8760/rwanda_population/rwa_ppp_2020.tif")

In [34]:
# village_population_2012 = compute_administrative_metric(population_2012, "village")
# village_population_2013 = compute_administrative_metric(population_2013, "village")
# village_population_2014 = compute_administrative_metric(population_2014, "village")
# village_population_2015 = compute_administrative_metric(population_2015, "village")
# village_population_2016 = compute_administrative_metric(population_2016, "village")
# village_population_2017 = compute_administrative_metric(population_2017, "village")
# village_population_2018 = compute_administrative_metric(population_2018, "village")
# village_population_2019 = compute_administrative_metric(population_2019, "village")
# village_population_2020 = compute_administrative_metric(population_2020, "village")

In [93]:
# village_population_2012.to_pickle("./population/village_population_2012.pkl")
# village_population_2013.to_pickle("./population/village_population_2013.pkl")
# village_population_2014.to_pickle("./population/village_population_2014.pkl")
# village_population_2015.to_pickle("./population/village_population_2015.pkl")
# village_population_2016.to_pickle("./population/village_population_2016.pkl")
# village_population_2017.to_pickle("./population/village_population_2017.pkl")
# village_population_2018.to_pickle("./population/village_population_2018.pkl")
# village_population_2019.to_pickle("./population/village_population_2019.pkl")
# village_population_2020.to_pickle("./population/village_population_2020.pkl")


village_population_2012 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2012.pkl")
village_population_2013 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2013.pkl")
village_population_2014 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2014.pkl")
village_population_2015 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2015.pkl")
village_population_2016 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2016.pkl")
village_population_2017 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2017.pkl")
village_population_2018 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2018.pkl")
village_population_2019 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2019.pkl")
village_population_2020 = pd.read_pickle("/gypsum/eguide/projects/ce8760/population/village_population_2020.pkl")

TypeError: no default __reduce__ due to non-trivial __cinit__

### Getting Meter location that aline with transactions 

In [48]:
# combined_data = pd.read_pickle("/gypsum/eguide/projects/ce8760/combined_data.pkl")

combined_data = pd.read_pickle("/gypsum/eguide/projects/ce8760/combined_data.pkl")


In [49]:
# combined_data_subset = combined_data[["meter_serial_number", "connection_type", "village_id"]]
# village_urban_2010_subset = village_urban_2010[["Village_ID", "pixel_value"]]

# joined = pd.merge(combined_data_subset, village_urban_2010_subset, how="left", left_on="village_id", right_on="Village_ID")

In [50]:
# len(joined[(joined["connection_type"]=="Residential")&(joined["pixel_value"]>13)].meter_serial_number.unique())

In [51]:
combined_data = combined_data[combined_data["connection_type"] == "Residential"]

In [52]:
combined_data.connection_type.unique()

array(['Residential'], dtype=object)

In [53]:
combined_data['transaction_date'] = pd.to_datetime(combined_data['transaction_date'], errors='coerce')
combined_data['installation_date'] = pd.to_datetime(combined_data['installation_date'], errors='coerce')

In [54]:
# # Calculate the earliest installation date per administrative region
# earliest_dates = combined_data.groupby("village_id")['installation_date'].min()
# combined_data = combined_data.merge(earliest_dates.rename('earliest_installation'), on="village_id")
# combined_data = combined_data[combined_data['installation_date'] > combined_data['earliest_installation'] + pd.DateOffset(years=10)]
# # combined_data = combined_data[(combined_data['installation_date'] > combined_data['earliest_installation'] + pd.DateOffset(years=5)) &
# #         (combined_data['installation_date'] <= combined_data['earliest_installation'] + pd.DateOffset(years=10))]

In [55]:
test_combined = combined_data.groupby([combined_data["meter_serial_number"], combined_data["connection_type"], 
                                       combined_data['transaction_date'].dt.year]).agg({
    "geometry": "first",
    "sector_id": "first",
    "cell_id": "first",
    "village_id": "first"
}).reset_index()

In [56]:
meter_location = test_combined.groupby("meter_serial_number")[["geometry"]].first().reset_index()

### Distance to Primary and secondary routes and trails

In [57]:
def calculate_closest_road_distance(meter_point, roads):
    return roads.distance(meter_point).min()

In [58]:
meters_location = gpd.GeoDataFrame(meter_location, geometry="geometry", crs="EPSG:4326")

In [59]:
rwa_roads = gpd.read_file("/gypsum/eguide/projects/ce8760/rwa_road/rwa_road.shp")

In [60]:
rwa_roads = rwa_roads.to_crs("EPSG:4326")
rwa_roads = rwa_roads[(rwa_roads["type"]=="Primary Route") | (rwa_roads["type"]=="Secondary Route")]

In [61]:
# Function to convert a LineString to equally spaced points
def line_to_points(line, num_points=10):
    return [line.interpolate(i/num_points, normalized=True) for i in range(num_points + 1)]

# Assuming 'rwa_roads' is your GeoDataFrame containing the road geometries
road_points = []
for road in rwa_roads.geometry:
    road_points.extend(line_to_points(road, num_points=1000))  # Adjust num_points as needed

# Convert the list of Points into an array of [x, y] for cKDTree
road_points_array = np.array([[point.x, point.y] for point in road_points])

# Build the cKDTree for efficient nearest-neighbor queries
tree = cKDTree(road_points_array)

# Assuming 'meter_location' is your GeoDataFrame containing the meter point geometries
# Convert meter locations into an array of [x, y] for querying the cKDTree
meter_points_array = np.array([[point.x, point.y] for point in meter_location.geometry])

# Query the cKDTree for the nearest road point to each meter location
distances, indices = tree.query(meter_points_array, k=1)

distances_in_km = []

for distance in distances:
    distances_in_km.append(geodesic((0, 0), (0, distance)).kilometers)
    


# Add the distances as a new column to the meter_location GeoDataFrame
meter_location['distance_to_nearest_road'] = np.array(distances_in_km)

# 'meter_location' now has an additional column 'distance_to_nearest_road' with the distance to the nearest road segment point


In [62]:
meter_location

Unnamed: 0,meter_serial_number,geometry,distance_to_nearest_road
0,1026227452,POINT (30.11994 -1.95422),0.038762
1,1026227460,POINT (30.11185 -1.95839),0.049891
2,1026227478,POINT (30.10217 -1.96356),0.041758
3,1026227494,POINT (30.05101 -1.95306),0.154069
4,1026227502,POINT (30.04592 -1.92407),0.923715
...,...,...,...
351361,94100295024,POINT (30.39184 -1.30941),6.588325
351362,94100295065,POINT (30.31443 -1.39965),2.848789
351363,94100295560,POINT (30.20526 -1.34721),2.743385
351364,94100295594,POINT (30.29934 -1.38348),0.946813


In [63]:
test_combined = pd.merge(test_combined, meter_location[["meter_serial_number", "distance_to_nearest_road"]], 
                   how="left", on="meter_serial_number")

In [64]:
village_distance_to_nearest_road = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_to_nearest_road": "median"
}).reset_index()

In [65]:
village_distance_to_nearest_road

Unnamed: 0,village_id,transaction_date,distance_to_nearest_road
0,11010102,2012,0.248803
1,11010102,2013,0.248803
2,11010102,2014,0.249891
3,11010102,2015,0.249891
4,11010102,2016,0.249891
...,...,...,...
53205,57150505,2016,2.867255
53206,57150505,2017,2.885903
53207,57150505,2018,2.885903
53208,57150505,2019,2.867453


## Amenities

Marketplace

In [70]:
bus_stations = gpd.read_file("/gypsum/eguide/projects/ce8760/open_street_map/bus_station.geojson")

In [69]:
marketplace = gpd.read_file("/gypsum/eguide/projects/ce8760/open_street_map/marketplace.geojson")

In [None]:
# result_df = calculate_min_distances(combined_data, marketplace)

In [72]:
# Convert df2 to EPSG 4326
marketplace = marketplace.to_crs("EPSG:4326")

In [73]:
# Create cKDTree using the shop geometries
tree = cKDTree(marketplace['geometry'].apply(lambda x: x.centroid.coords[0] if x.geom_type == 'Polygon' else x.coords[0]).tolist())

# Function to find the distance to the nearest shop
def find_nearest_shop_distance(row, tree):
    point = row['geometry'].coords[0]
    distance, _ = tree.query(point)

    # Convert distance from degrees to kilometers using Haversine formula
    distance_in_km = geodesic((0, 0), (0, distance)).kilometers
    
    return distance_in_km


# Apply the function to create a new 'distance' column in df1
test_combined['distance_market'] = test_combined.apply(lambda row: find_nearest_shop_distance(row, tree), axis=1)


In [74]:
village_distance_to_market = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_market": "median"
}).reset_index()

In [75]:
bus_tree = cKDTree(bus_stations['geometry'].apply(lambda x: x.centroid.coords[0] if x.geom_type == 'Polygon' else x.coords[0]).tolist())
test_combined["distance_busstation"] = test_combined.apply(lambda row: find_nearest_shop_distance(row, bus_tree), axis=1)

In [76]:
village_distance_to_busstation = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_busstation": "median"
}).reset_index()

In [77]:
bank_locations = gpd.read_file("/gypsum/eguide/projects/ce8760/open_street_map/banks.geojson")
school_locations = gpd.read_file("/gypsum/eguide/projects/ce8760/open_street_map/school_export.geojson")

In [78]:
school_tree = cKDTree(school_locations['geometry'].apply(lambda x: x.centroid.coords[0] if x.geom_type == 'Polygon' else x.coords[0]).tolist())
test_combined["distance_schools"] = test_combined.apply(lambda row: find_nearest_shop_distance(row, school_tree), axis=1)

village_distance_to_school = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_schools": "median"
}).reset_index()

In [79]:
def extract_coords(geom):
    # Check if the geometry is 'Polygon' or 'Point', directly return the coordinates
    if geom.geom_type == 'Polygon':
        return geom.centroid.coords[0]
    elif geom.geom_type == 'Point':
        return geom.coords[0]
    # Check if the geometry is a multi-part geometry
    elif geom.geom_type == 'MultiPolygon':
        # Iterate over each polygon in the MultiPolygon and find the largest by area
        largest_polygon = max(geom.geoms, key=lambda p: p.area)
        return largest_polygon.centroid.coords[0]
    elif geom.geom_type == 'MultiPoint':
        # For MultiPoint, return the coordinates of the first point
        return geom.geoms[0].coords[0]
    else:
        raise ValueError("Unsupported geometry type")

bank_locations = gpd.read_file("/gypsum/eguide/projects/ce8760/open_street_map/banks.geojson")
bank_tree = cKDTree(bank_locations['geometry'].apply(extract_coords).tolist())

In [80]:
test_combined["distance_banks"] = test_combined.apply(lambda row: find_nearest_shop_distance(row, bank_tree), axis=1)

village_distance_to_banks = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_banks": "median"
}).reset_index()

In [81]:
village_distance_to_banks.to_pickle("/gypsum/eguide/projects/ce8760/village_distance_to_banks.pkl")
village_distance_to_school.to_pickle("/gypsum/eguide/projects/ce8760/village_distance_to_school.pkl")

In [82]:
village_distance_to_busstation.to_pickle("/gypsum/eguide/projects/ce8760/village_distance_to_busstation.pkl")
village_distance_to_market.to_pickle("/gypsum/eguide/projects/ce8760/village_distance_to_market.pkl")

In [83]:
village_distance_to_busstation = pd.read_pickle("/gypsum/eguide/projects/ce8760/village_distance_to_busstation.pkl")
village_distance_to_market = pd.read_pickle("/gypsum/eguide/projects/ce8760/village_distance_to_market.pkl")

#### Data preparation

Residential Consumption

In [93]:
def extract_consumption_tariffs(file_path: str, column_name: str):

    df = pd.read_csv(file_path).dropna()
    consumption_df = df.melt(id_vars="administrative_id", var_name="transaction_date", value_name=column_name).sort_values(by=["administrative_id", "transaction_date"])
    consumption_df = consumption_df.reset_index().drop(columns=["index"]).iloc[:-1]
    consumption_df.columns = ["village_id", "date", column_name]
    consumption_df = consumption_df[~consumption_df["date"].isin([2012, 2020, "geometry"])]

    return consumption_df

In [94]:
residential_annual_consumption_path = "../final_data/final_reg_data/annual_residential_consumption_10yr_above/village/median/annual_median.csv"
residential_annual_tariff_path =  "../final_data/final_reg_data/annual_residential_tariffs_10yr_above/village/median/annual_median.csv"
consumption_df = extract_consumption_tariffs(residential_annual_consumption_path, "consumption")
tarrif_df = extract_consumption_tariffs(residential_annual_tariff_path, "tariff")

In [84]:
# consumption_df['date'] = consumption_df['date'].astype('object')
# consumption_df['village_id'] = consumption_df['village_id'].astype('str')
# tarrif_df['date'] = tarrif_df['date'].astype('object')
# tarrif_df['village_id'] = tarrif_df['village_id'].astype('str')
village_distance_to_market['transaction_date'] = village_distance_to_market['transaction_date'].astype('str')
village_distance_to_busstation['transaction_date'] = village_distance_to_busstation['transaction_date'].astype('str')
village_distance_to_nearest_road['transaction_date'] = village_distance_to_nearest_road['transaction_date'].astype('str')
village_distance_to_school['transaction_date'] = village_distance_to_school['transaction_date'].astype('str')
village_distance_to_banks['transaction_date'] = village_distance_to_banks['transaction_date'].astype('str')

Distance to Market, Busstation and Roads

In [None]:
# consumption_df = pd.merge(consumption_df, tarrif_df, right_on=["village_id", "date"], left_on=["village_id", "date"])

In [85]:
# consumption_df

In [86]:
final_df = (
    village_distance_to_market
    .merge(village_distance_to_busstation, on=["village_id", "transaction_date"])
    .merge(village_distance_to_nearest_road, on=["village_id", "transaction_date"])
    .merge(village_distance_to_school, on=["village_id", "transaction_date"])
    .merge(village_distance_to_banks, on=["village_id", "transaction_date"])
)

# Rename 'transaction_date' to 'year' and cast to string
final_df = final_df.rename(columns={"transaction_date": "year"})
final_df["year"] = final_df["year"].astype(str)

In [87]:
final_df

Unnamed: 0,village_id,year,distance_market,distance_busstation,distance_to_nearest_road,distance_schools,distance_banks
0,11010102,2012,0.988930,0.765208,0.248803,0.172175,0.443753
1,11010102,2013,0.988930,0.767398,0.248803,0.172175,0.439669
2,11010102,2014,0.988848,0.765208,0.249891,0.173555,0.438713
3,11010102,2015,0.988191,0.765208,0.249891,0.173555,0.442250
4,11010102,2016,0.988191,0.765208,0.249891,0.173555,0.442250
...,...,...,...,...,...,...,...
53205,57150505,2016,16.181119,17.761216,2.867255,2.918622,17.898980
53206,57150505,2017,16.181119,17.748852,2.885903,2.931577,17.886723
53207,57150505,2018,16.181119,17.748852,2.885903,2.931577,17.886723
53208,57150505,2019,16.177845,17.754593,2.867453,2.925819,17.892246


Asset Wealth

In [88]:
# merge with asset wealth
village_asset_wealth_2016["date"] = "2016"
village_asset_wealth_2017["date"] = "2017"
village_asset_wealth_2018["date"] = "2018"
village_asset_wealth_2019["date"] = "2019"
village_asset_wealth_2020["date"] = "2020"

village_asset_wealth_2016.drop(columns = ["geometry"], inplace=True)
village_asset_wealth_2017.drop(columns = ["geometry"], inplace=True)
village_asset_wealth_2018.drop(columns = ["geometry"], inplace=True)
village_asset_wealth_2019.drop(columns = ["geometry"], inplace=True)
village_asset_wealth_2020.drop(columns = ["geometry"], inplace=True)

village_asset_wealth = pd.merge(pd.merge(pd.merge(pd.merge(village_asset_wealth_2016, 
                               village_asset_wealth_2017, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_asset_wealth_2018, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_asset_wealth_2019, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_asset_wealth_2020, on=["Village_ID", "date", "pixel_value"],how="outer"
)
village_asset_wealth.columns = ["village_id", "asset_wealth", "year"]

village_asset_wealth["year"] = village_asset_wealth["year"].astype("str")

In [89]:
final_df = pd.merge(final_df, village_asset_wealth, how="left", on = ["village_id", "year"])

Spending

In [90]:
# merge with asset wealth
village_spending_2016["date"] = "2016"
village_spending_2017["date"] = "2017"
village_spending_2018["date"] = "2018"
village_spending_2019["date"] = "2019"
village_spending_2020["date"] = "2020"

village_spending_2016.drop(columns = ["geometry"], inplace=True)
village_spending_2017.drop(columns = ["geometry"], inplace=True)
village_spending_2018.drop(columns = ["geometry"], inplace=True)
village_spending_2019.drop(columns = ["geometry"], inplace=True)
village_spending_2020.drop(columns = ["geometry"], inplace=True)

village_spending = pd.merge(pd.merge(pd.merge(pd.merge(village_spending_2016, 
                               village_spending_2017, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_spending_2018, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_spending_2019, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_spending_2020, on=["Village_ID", "date", "pixel_value"],how="outer"
)
village_spending.columns = ["village_id", "spending", "year"]

village_spending["year"] = village_spending["year"].astype("str")

In [91]:
final_df = pd.merge(final_df, village_spending, how="left", on = ["village_id", "year"])

Population

In [92]:
village_population_2012["date"] = "2012"
village_population_2013["date"] = "2013"
village_population_2014["date"] = "2014"
village_population_2015["date"] = "2015"
village_population_2016["date"] = "2016"
village_population_2017["date"] = "2017"
village_population_2018["date"] = "2018"
village_population_2019["date"] = "2019"
village_population_2020["date"] = "2020"

village_population_2012.drop(columns = ["geometry"], inplace=True)
village_population_2013.drop(columns = ["geometry"], inplace=True)
village_population_2014.drop(columns = ["geometry"], inplace=True)
village_population_2015.drop(columns = ["geometry"], inplace=True)
village_population_2016.drop(columns = ["geometry"], inplace=True)
village_population_2017.drop(columns = ["geometry"], inplace=True)
village_population_2018.drop(columns = ["geometry"], inplace=True)
village_population_2019.drop(columns = ["geometry"], inplace=True)
village_population_2020.drop(columns = ["geometry"], inplace=True)

village_population = pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(village_population_2012, 
                               village_population_2013, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2014, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2015, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2016, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2017, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2018, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2019, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_population_2020, on=["Village_ID", "date", "pixel_value"],how="outer"
)
village_population.columns = ["village_id", "population", "year"]

village_population["year"] = village_population["year"].astype("str")

NameError: name 'village_population_2012' is not defined

In [105]:
final_df = pd.merge(final_df, village_population, how="outer", on = ["village_id", "year"])

Urbanization

In [106]:
village_urban_2010["date"] = "2010"
village_urban_2015["date"] = "2015"
village_urban_2020["date"] = "2020"

village_urban_2010.drop(columns = ["geometry"], inplace=True)
village_urban_2015.drop(columns = ["geometry"], inplace=True)
village_urban_2020.drop(columns = ["geometry"], inplace=True)

urbanization = pd.merge(pd.merge(village_urban_2010, 
                               village_urban_2015, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_urban_2020, on=["Village_ID", "date", "pixel_value"],how="outer",
)
urbanization.columns = ["village_id", "urbanization", "year"]

urbanization["year"] = urbanization["year"].astype("str")

In [107]:
final_df = pd.merge(final_df, urbanization, how="outer", on = ["village_id", "year"])

Building volume

In [108]:
village_building_volume_2010["date"] = "2010"
village_building_volume_2015["date"] = "2015"
village_building_volume_2020["date"] = "2020"

village_building_volume_2010.drop(columns = ["geometry"], inplace=True)
village_building_volume_2015.drop(columns = ["geometry"], inplace=True)
village_building_volume_2020.drop(columns = ["geometry"], inplace=True)

building_volume = pd.merge(pd.merge(village_building_volume_2010, 
                               village_building_volume_2015, on=["Village_ID", "date", "pixel_value"],how="outer"),
                               village_building_volume_2020, on=["Village_ID", "date", "pixel_value"],how="outer",
)
building_volume.columns = ["village_id", "building_volume", "year"]

building_volume["year"] = building_volume["year"].astype("str")

In [109]:
final_df = pd.merge(final_df, building_volume, how="outer", on = ["village_id", "year"])

Building Height

In [110]:
village_building_height_2018.drop(columns = ["geometry"], inplace=True)
village_building_height_2018.columns = ["village_id", "building_height"]
final_df = pd.merge(final_df, village_building_height_2018, how="left", on = ["village_id"])

Landcover

In [None]:
landcover = pd.read_csv("/gypsum/eguide/projects/ce8760/landcover/landcover.csv")
landcover["year"] = landcover["year"].astype(str)
landcover["village_id"] = landcover["village_id"].astype(str)

In [112]:
landcover_2013 = pd.read_pickle("../final_data/landcover/df_2013.pkl")
landcover_2014 = pd.read_pickle("../final_data/landcover/df_2014.pkl")
landcover_2015 = pd.read_pickle("../final_data/landcover/df_2015.pkl")
landcover_2016 = pd.read_pickle("../final_data/landcover/df_2016.pkl")

In [113]:
landcover_2013.columns = landcover.columns
landcover_2014.columns = landcover.columns
landcover_2015.columns = landcover.columns
landcover_2016.columns = landcover.columns

In [114]:
landcover = pd.concat([landcover_2013, landcover_2014, landcover_2015, landcover_2016, landcover])

In [115]:
landcover.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148150 entries, 0 to 88889
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   village_id            148150 non-null  object 
 1   cropland_proportion   148150 non-null  float64
 2   builtarea_proportion  148150 non-null  float64
 3   rangeland_proportion  148150 non-null  float64
 4   year                  148150 non-null  object 
dtypes: float64(3), object(2)
memory usage: 6.8+ MB


In [116]:
landcover["year"] = landcover["year"].astype("str")
landcover["village_id"] = landcover["village_id"].astype("str")

In [117]:
# final_df.drop(columns = ["cropland_proportion", "builtarea_proportion", "rangeland_proportion"], inplace = True)

In [118]:
final_df["year"] = final_df["year"].astype("str")
final_df["village_id"] = final_df["village_id"].astype("str")

In [119]:
final_df = pd.merge(final_df, landcover, how="outer", on = ["village_id", "year"])

In [120]:
final_df

Unnamed: 0,village_id,year,consumption,tariff,distance_market,distance_busstation,distance_to_nearest_road,distance_schools,distance_banks,asset_wealth,spending,population,urbanization,building_volume,building_height,cropland_proportion,builtarea_proportion,rangeland_proportion
0,11010102,2010,,,,,,,,,,,30.0,50264.0,5.085509,,,
1,11010102,2012,94.7,12624.0,0.980011,0.753800,0.259899,0.183702,0.444702,,,256.252640,,,5.085509,,,
2,11010102,2013,230.5,30921.0,0.976696,0.754501,0.255193,0.180088,0.444300,,,257.667900,,,5.085509,0.000000,1.000000,0.000000
3,11010102,2014,267.0,35742.0,0.985199,0.754160,0.257589,0.179813,0.438713,,,265.239548,,,5.085509,0.000000,1.000000,0.000000
4,11010102,2015,264.0,39803.0,0.980011,0.754160,0.257589,0.179813,0.442797,,,251.027534,30.0,50853.0,5.085509,0.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177775,57150505,2018,,,,,,,,,,4.322918,,,0.081097,0.002182,0.067683,0.412780
177776,57150505,2019,,,,,,,,,,4.540082,,,0.081097,0.000750,0.080469,0.408790
177777,57150505,2020,,,,,,,,,,4.584210,12.0,900.0,0.081097,0.001739,0.086027,0.418167
177778,57150505,2021,,,,,,,,,,,,,,0.002578,0.083333,0.416090


In [121]:
final_df.to_pickle("residential_customers_panel_10yr_above.pkl")

### Non-Residential Consumption

In [122]:
nonresidential_annual_consumption_path = "../final_data/final_reg_data/annual_nonresidential_consumption_10yr_above/village/median/annual_median.csv"
noresidential_annual_tariff_path =  "../final_data/final_reg_data/annual_nonresidential_tariffs_10yr_above/village/median/annual_median.csv"
consumption_non_res_df = extract_consumption_tariffs(nonresidential_annual_consumption_path, "consumption")
tarrif_non_res_df = extract_consumption_tariffs(noresidential_annual_tariff_path, "tariff")

In [123]:
consumption_non_res_df

Unnamed: 0,village_id,date,consumption
0,11010107,2012,123.2
1,11010107,2013,408.65
2,11010107,2014,462.25
3,11010107,2015,368.4
4,11010107,2016,368.9
...,...,...,...
3844,57100512,2016,229.6
3845,57100512,2017,217.7
3846,57100512,2018,193.65
3847,57100512,2019,309.5


In [None]:
combined_data = pd.read_pickle("/gypsum/eguide/projects/ce8760/combined_data.pkl")

In [125]:
combined_data = combined_data[combined_data["connection_type"] != "Residential"]
combined_data.connection_type.unique()

array(['other', 'Non Residential'], dtype=object)

In [126]:
combined_data['transaction_date'] = pd.to_datetime(combined_data['transaction_date'], format='mixed')
combined_data['installation_date'] = pd.to_datetime(combined_data['installation_date'], format='mixed')

In [127]:
# Calculate the earliest installation date per administrative region
earliest_dates = combined_data.groupby("village_id")['installation_date'].min()
combined_data = combined_data.merge(earliest_dates.rename('earliest_installation'), on="village_id")
combined_data = combined_data[combined_data['installation_date'] > combined_data['earliest_installation'] + pd.DateOffset(years=10)]
# combined_data = combined_data[(combined_data['installation_date'] > combined_data['earliest_installation'] + pd.DateOffset(years=5)) &
#         (combined_data['installation_date'] <= combined_data['earliest_installation'] + pd.DateOffset(years=10))]

In [128]:
test_combined = combined_data.groupby([combined_data["meter_serial_number"], combined_data["connection_type"], 
                                       combined_data['transaction_date'].dt.year]).agg({
    "geometry": "first",
    "sector_id": "first",
    "cell_id": "first",
    "village_id": "first"
}).reset_index()

In [129]:
meter_location = test_combined.groupby("meter_serial_number")[["geometry"]].first().reset_index()
meters_location = gpd.GeoDataFrame(meter_location, geometry="geometry", crs="EPSG:4326")

In [130]:
# Assuming 'rwa_roads' is your GeoDataFrame containing the road geometries
road_points = []
for road in rwa_roads.geometry:
    road_points.extend(line_to_points(road, num_points=1000))  # Adjust num_points as needed

# Convert the list of Points into an array of [x, y] for cKDTree
road_points_array = np.array([[point.x, point.y] for point in road_points])

# Build the cKDTree for efficient nearest-neighbor queries
tree = cKDTree(road_points_array)

# Assuming 'meter_location' is your GeoDataFrame containing the meter point geometries
# Convert meter locations into an array of [x, y] for querying the cKDTree
meter_points_array = np.array([[point.x, point.y] for point in meter_location.geometry])

# Query the cKDTree for the nearest road point to each meter location
distances, indices = tree.query(meter_points_array, k=1)

distances_in_km = []

for distance in distances:
    distances_in_km.append(geodesic((0, 0), (0, distance)).kilometers)
    

# Add the distances as a new column to the meter_location GeoDataFrame
meter_location['distance_to_nearest_road'] = np.array(distances_in_km)

In [131]:
test_combined = pd.merge(test_combined, meter_location[["meter_serial_number", "distance_to_nearest_road"]], 
                   how="left", on="meter_serial_number")

In [132]:
village_distance_to_nearest_road = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_to_nearest_road": "median"
}).reset_index()

In [133]:
# Create cKDTree using the shop geometries
tree = cKDTree(marketplace['geometry'].apply(lambda x: x.centroid.coords[0] if x.geom_type == 'Polygon' else x.coords[0]).tolist())

# Function to find the distance to the nearest shop
def find_nearest_shop_distance(row, tree):
    point = row['geometry'].coords[0]
    distance, _ = tree.query(point)

    # Convert distance from degrees to kilometers using Haversine formula
    distance_in_km = geodesic((0, 0), (0, distance)).kilometers
    
    return distance_in_km


# Apply the function to create a new 'distance' column in df1
test_combined['distance_market'] = test_combined.apply(lambda row: find_nearest_shop_distance(row, tree), axis=1)


In [134]:
village_distance_to_market = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_market": "median"
}).reset_index()

In [135]:
test_combined["distance_busstation"] = test_combined.apply(lambda row: find_nearest_shop_distance(row, bus_tree), axis=1)
# test_combined

In [136]:
village_distance_to_busstation = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_busstation": "median"
}).reset_index()

In [137]:
test_combined["distance_schools"] = test_combined.apply(lambda row: find_nearest_shop_distance(row, school_tree), axis=1)

village_distance_to_school = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_schools": "median"
}).reset_index()

In [138]:
test_combined["distance_banks"] = test_combined.apply(lambda row: find_nearest_shop_distance(row, bank_tree), axis=1)

village_distance_to_banks = test_combined.groupby(["village_id", "transaction_date"]).agg({
    "distance_banks": "median"
}).reset_index()

In [139]:
consumption_non_res_df = pd.merge(consumption_non_res_df, tarrif_non_res_df, right_on=["village_id", "date"], left_on=["village_id", "date"])

In [140]:
consumption_non_res_df['date'] = consumption_non_res_df['date'].astype('object')
consumption_non_res_df['village_id'] = consumption_non_res_df['village_id'].astype('str')
tarrif_non_res_df['date'] = tarrif_non_res_df['date'].astype('object')
tarrif_non_res_df['village_id'] = tarrif_non_res_df['village_id'].astype('str')
village_distance_to_market['transaction_date'] = village_distance_to_market['transaction_date'].astype('str')
village_distance_to_busstation['transaction_date'] = village_distance_to_busstation['transaction_date'].astype('str')
village_distance_to_nearest_road['transaction_date'] = village_distance_to_nearest_road['transaction_date'].astype('str')
village_distance_to_school['transaction_date'] = village_distance_to_school['transaction_date'].astype('str')
village_distance_to_banks['transaction_date'] = village_distance_to_banks['transaction_date'].astype('str')

In [141]:
final_non_res_df = pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(consumption_non_res_df, 
                    village_distance_to_market, left_on=["village_id", "date"],
                    right_on=["village_id", "transaction_date"]).drop(columns=["transaction_date"]),
                    village_distance_to_busstation, left_on=["village_id", "date"],
                     right_on=["village_id", "transaction_date"]).drop(columns=["transaction_date"]),
    village_distance_to_nearest_road, left_on=["village_id", "date"],
                     right_on=["village_id", "transaction_date"]).drop(columns=["transaction_date"]),
    village_distance_to_school, left_on=["village_id", "date"],
                     right_on=["village_id", "transaction_date"]).drop(columns=["transaction_date"]),
    village_distance_to_banks, left_on=["village_id", "date"],
                     right_on=["village_id", "transaction_date"]).drop(columns=["transaction_date"])
                     

final_non_res_df = final_non_res_df.rename(columns={"date":"year"})
final_non_res_df["year"] = final_non_res_df["year"].astype("str")

In [142]:
final_non_res_df

Unnamed: 0,village_id,year,consumption,tariff,distance_market,distance_busstation,distance_to_nearest_road,distance_schools,distance_banks
0,11010107,2012,123.2,16525.0,1.072830,0.518970,0.026218,0.488517,0.201774
1,11010107,2013,408.65,54871.5,1.072906,0.518970,0.026214,0.488534,0.201751
2,11010107,2014,462.25,62070.0,1.072906,0.518970,0.026214,0.488534,0.201751
3,11010107,2015,368.4,55941.5,1.072829,0.518942,0.026202,0.488617,0.201498
4,11010107,2016,368.9,67198.0,1.072828,0.518970,0.026211,0.488551,0.201729
...,...,...,...,...,...,...,...,...,...
3460,57100512,2016,229.6,42088.5,1.021832,0.461342,10.601017,1.002348,0.799732
3461,57100512,2017,217.7,41186.0,1.053419,0.621131,10.685755,0.961046,0.831267
3462,57100512,2018,193.65,37294.5,1.024816,0.621131,10.685755,1.000035,0.831267
3463,57100512,2019,309.5,58722.141,1.010919,0.461342,10.601017,1.006714,0.799732


In [143]:
final_non_res_df = pd.merge(final_non_res_df, village_asset_wealth, how="left", on = ["village_id", "year"])
final_non_res_df = pd.merge(final_non_res_df, village_spending, how="left", on = ["village_id", "year"])
final_non_res_df = pd.merge(final_non_res_df, village_population, how="outer", on = ["village_id", "year"])
final_non_res_df = pd.merge(final_non_res_df, urbanization, how="outer", on = ["village_id", "year"])
final_non_res_df = pd.merge(final_non_res_df, building_volume, how="outer", on = ["village_id", "year"])
final_non_res_df = pd.merge(final_non_res_df, village_building_height_2018, how="left", on = ["village_id"])
final_non_res_df = pd.merge(final_non_res_df, landcover, how="outer", on = ["village_id", "year"])

In [147]:
final_non_res_df

Unnamed: 0,village_id,year,consumption,tariff,distance_market,distance_busstation,distance_to_nearest_road,distance_schools,distance_banks,asset_wealth,spending,population,urbanization,building_volume,building_height,cropland_proportion,builtarea_proportion,rangeland_proportion
69,11010107,2020,105.6,22812.917,1.072828,0.518970,0.026218,0.488551,0.201729,1.139318,7.462366,246.994827,30.0,66196.5,6.615108,0.000000,0.426079,0.000000
81,11010108,2020,141.2,29677.945,0.273505,0.097605,0.135207,0.392392,0.118645,1.139318,7.462366,171.802246,30.0,38745.0,3.558883,0.000000,0.311547,0.000000
237,11010303,2020,184.8,39711.382,1.447540,2.234067,0.086397,0.439372,1.187417,1.106294,6.596320,122.583088,30.0,53289.0,5.325431,0.000000,0.665314,0.000000
357,11010406,2020,101.5,19179.756,1.993712,1.452818,0.194558,0.369804,0.714538,1.158801,8.740249,214.029709,30.0,71772.0,7.170068,0.000000,0.391919,0.000000
501,11010601,2020,69.0,7181.847,1.404816,0.977067,0.272435,0.452622,0.326396,1.139318,7.462366,245.345451,30.0,68099.0,6.802923,0.000000,0.553360,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175425,57100506,2020,117.5,25770.146,0.475539,0.144387,10.075182,0.701506,0.272577,0.389036,3.109006,8.533003,12.0,3290.0,0.330109,0.107317,0.308292,0.070578
175449,57100508,2020,382.3,87027.055,1.567836,0.986482,11.167682,0.659612,1.364129,0.368788,3.015221,14.622195,21.0,4800.0,0.429584,0.153018,0.274153,0.001428
175461,57100509,2020,109.1,19010.762,0.254814,0.270724,9.810463,0.539249,0.175322,0.481580,3.810705,16.880141,21.0,11247.0,1.153944,0.017234,0.330891,0.057964
175485,57100511,2020,139.3,30417.2285,0.649777,0.135927,10.044725,0.969950,0.424030,0.424198,3.374781,12.283180,21.0,8744.0,0.924893,0.081766,0.211052,0.000000


In [145]:
final_non_res_df.to_pickle("nonresidential_customers_panel_10yr_above.pkl")

In [191]:
print("finish!")

finish!
