# Analyzing factors that may affect electrification

In [None]:
def clip_tif_file(tif_file: str) -> gpd.GeoDataFrame:
    """
    Clip a raster to the study-area boundary (i.e. Rwanda) and vectorise it.

    The boundary shapefile is reprojected to the raster's CRS, restricted to the
    raster's extent, and used to mask the raster. The masked raster is then
    turned into polygons carrying the pixel value they were built from.

    Input:
        - tif_file: String denoting path to tif file
    Returns:
        - geopandas.GeoDataFrame: polygons of the clipped raster with their value
            in a "pixel_value" column, reprojected to EPSG:4326
    """

    with rasterio.open(tif_file) as src:
        raster_crs = src.crs

        # Boundary shapefile, brought into the raster's CRS so that the overlay
        # and the mask below operate in one coordinate system
        boundary_gdf = gpd.read_file(get_admin_boundary("boundary")[0])
        boundary_gdf = boundary_gdf.to_crs(raster_crs)

        # Bounding box covering the extent of the raster
        # (src.bounds unpacks as left, bottom, right, top)
        bbox_gdf = gpd.GeoDataFrame(geometry=[box(*src.bounds)], crs=raster_crs)

        # Keep only the part of the boundary that lies inside the raster extent
        intersection = gpd.overlay(boundary_gdf, bbox_gdf, how='intersection')

        # Clip the raster to the intersection geometry. The band data is read
        # here by `mask`, so no separate src.read() is needed beforehand.
        # NOTE(review): `mask` is presumably rasterio.mask.mask; pixels outside
        # the shapes are then filled rather than dropped and get vectorised
        # below as well -- confirm downstream code tolerates those values.
        clipped, clipped_transform = mask(src, shapes=intersection.geometry, crop=True)

        # Build GeoJSON-like features straight from the clipped raster
        features = [
            {'geometry': geometry, 'properties': {'pixel_value': value}}
            for geometry, value in rasterio.features.shapes(
                clipped, transform=clipped_transform
            )
        ]
        gdf_clipped = gpd.GeoDataFrame.from_features(features, crs=raster_crs)

    return gdf_clipped.to_crs("EPSG:4326")


In [None]:
def compute_administrative_metric(gdf: gpd.GeoDataFrame, admin_level: str) -> gpd.GeoDataFrame:
    """
    This function takes in a geopandas dataframe of an index spread across pixels,
    and localizes it to the region under study (i.e. sector, cell, or village).

    The index polygons are intersected with the admin boundaries, the median
    "pixel_value" of the intersected pieces is taken per admin region, and that
    median is attached to the region's full geometry.

    Inputs:
        - gdf: Geopandas GeoDataFrame containing the index being measured in a
            "pixel_value" column (presumably in EPSG:4326, the CRS the admin
            boundaries are converted to before the overlay -- confirm with caller)
        - admin_level: String passed to get_admin_boundary to select the admin
            level (sector, cell, or village)

    Returns:
        - gpd.GeoDataFrame holding, for each admin identifier present in the
            intersection, the identifier, the median "pixel_value" and the
            admin region's geometry (EPSG:4326)
    """

    # Get the file path and identifier column for the specified admin level
    admin_path, admin_id = get_admin_boundary(admin_level=admin_level)

    # Read the admin shapefile
    admin_shp = gpd.read_file(admin_path).to_crs("EPSG:4326")

    # Perform intersection between admin shapefile and the given GeoDataFrame
    overlay = gpd.overlay(admin_shp, gdf, how="intersection")

    # Median index value per admin region. Only the value is aggregated: the
    # intersected pieces are fragments of a region, so their geometry is not
    # carried forward.
    medians = overlay.groupby([admin_id])["pixel_value"].median().reset_index()

    # Join back to the admin shapefile so each row carries the whole region
    index_summary = medians.merge(admin_shp[[admin_id, "geometry"]], on=admin_id)

    return gpd.GeoDataFrame(index_summary, geometry="geometry", crs=admin_shp.crs)

In [None]:
# Pair each cell's asset-wealth value with the matching row of the 2020
# consumption table; "Cell_ID" on the left corresponds to "administra" on the right.
df_merge_cell = pd.merge(
    cell_asset_wealth_2020[["Cell_ID", "pixel_value"]],
    cell_median_consumption_2020,
    left_on="Cell_ID",
    right_on="administra",
)

# The right-hand key duplicates "Cell_ID" after the join, so it is dropped
df_merge_cell = df_merge_cell.drop(columns=["administra"])

# Positional rename: expects exactly three columns to remain at this point
df_merge_cell.columns = ["cell_id", "asset_wealth_index", "consumption_2020"]

In [None]:
# Create scatterplot of asset wealth against 2020 consumption (one point per cell)
# One theme call holding the settings that are actually in effect: white style,
# seaborn's default palette, Verdana at 0.7 font scale.
sns.set_theme(style="white", font="Verdana", font_scale=0.7)
sns.scatterplot(
    x="asset_wealth_index",
    y="consumption_2020",
    data=df_merge_cell,
    color="steelblue",
    edgecolor="black",
)
sns.despine()
plt.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd
# Walmart store openings; the columns used below are YEAR, LON, LAT and storenum
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/1962_2006_walmart_store_openings.csv')

# Traces and layout are accumulated as plain dicts and handed to go.Figure later
data = []
layout = dict(
    title=(
        'New Walmart Stores per year 1962-2006<br>'
        'Source: <a href="http://www.econ.umn.edu/~holmes/data/WalMart/index.html">'
        'University of Minnesota</a>'
    ),
    autosize=False,
    width=1000,
    height=900,
    hovermode=False,
    legend=dict(
        x=0.7,
        y=-0.1,
        bgcolor="rgba(255, 255, 255, 0)",
        font=dict(size=11),
    ),
)
years = df['YEAR'].unique()

# One small map per year: the first uses the key 'geo', later ones 'geo2', 'geo3', ...
for i, year in enumerate(years):
    geo_key = f'geo{i + 1}' if i != 0 else 'geo'

    # Select this year's rows once and reuse them for both coordinates
    stores = df[df['YEAR'] == year]

    # Walmart store data
    data.append(
        dict(
            type='scattergeo',
            showlegend=False,
            lon=list(stores['LON']),
            lat=list(stores['LAT']),
            geo=geo_key,
            name=int(year),
            marker=dict(
                color="rgb(0, 0, 255)",
                opacity=0.5,
            ),
        )
    )
    # Year markers: a text-only trace at a fixed lon/lat on the same map
    data.append(
        dict(
            type='scattergeo',
            showlegend=False,
            lon=[-78],
            lat=[47],
            geo=geo_key,
            text=[year],
            mode='text',
        )
    )
    # The domain lists start empty and are filled in once the grid is laid out
    layout[geo_key] = dict(
        scope='usa',
        showland=True,
        landcolor='rgb(229, 229, 229)',
        showcountries=False,
        domain=dict(x=[], y=[]),
        subunitcolor="rgb(255, 255, 255)",
    )


def draw_sparkline( domain, lataxis, lonaxis ):
    """Build the layout settings of a geo subplot that is used as a sparkline.

    Land, frame, countries and coastlines are all switched off and the
    background colour has zero alpha, so only the traces drawn on the subplot
    remain visible.

    Inputs:
        - domain: placed under the 'domain' key unchanged
        - lataxis: placed under the 'lataxis' key unchanged
        - lonaxis: placed under the 'lonaxis' key unchanged

    Returns:
        - dict with the settings above
    """
    sparkline_layout = {
        'showland': False,
        'showframe': False,
        'showcountries': False,
        'showcoastlines': False,
        'domain': domain,
        'lataxis': lataxis,
        'lonaxis': lonaxis,
        'bgcolor': 'rgba(255,200,200,0.0)',
    }
    return sparkline_layout

# The two sparklines get the geo subplots numbered right after the per-year maps
# ('geo', 'geo2', ..., one per entry of `years`).
n_years = len(years)
stores_geo_key = f'geo{n_years + 1}'
cumulative_geo_key = f'geo{n_years + 2}'

# Number of stores opened in each year, computed once and shared by both sparklines
stores_per_year = df.groupby(by=['YEAR']).count()['storenum']
sparkline_lon = list(range(len(stores_per_year)))

# Stores per year sparkline (counts scaled by 1/10 to fit the latitude range)
layout[stores_geo_key] = draw_sparkline({'x':[0.6,0.8], 'y':[0,0.15]},
                                        {'range':[-5.0, 30.0]}, {'range':[0.0, 40.0]})
data.append(
    dict(
        type = 'scattergeo',
        mode = 'lines',
        lat = list(stores_per_year/1e1),
        lon = sparkline_lon,
        line = dict( color = "rgb(0, 0, 255)" ),
        name = "New stores per year<br>Peak of 178 stores per year in 1990",
        geo = stores_geo_key,
    )
)

# Cumulative sum sparkline (running total scaled by 1/100)
layout[cumulative_geo_key] = draw_sparkline({'x':[0.8,1], 'y':[0,0.15]},
                                            {'range':[-5.0, 50.0]}, {'range':[0.0, 50.0]})
data.append(
    dict(
        type = 'scattergeo',
        mode = 'lines',
        lat = list(stores_per_year.cumsum()/1e2),
        lon = sparkline_lon,
        line = dict( color = "rgb(214, 39, 40)" ),
        name ="Cumulative sum<br>3176 stores total in 2006",
        geo = cumulative_geo_key,
    )
)

# Lay the per-year maps out on a COLS x ROWS grid, filling each row left to
# right and starting from the top row. The grid holds at most COLS * ROWS maps.
COLS = 5
ROWS = 9
for z in range(n_years):
    row_from_top, x = divmod(z, COLS)
    y = ROWS - 1 - row_from_top
    geo_key = f'geo{z + 1}' if z != 0 else 'geo'
    layout[geo_key]['domain']['x'] = [x / COLS, (x + 1) / COLS]
    layout[geo_key]['domain']['y'] = [y / ROWS, (y + 1) / ROWS]

fig = go.Figure(data=data, layout=layout)
# Overrides the width given in `layout`
fig.update_layout(width=800)
fig.show()