In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression

In [2]:
# Read the grid layer from the GeoPackage, then reproject it to the
# ESRI:102022 coordinate reference system used for the rest of the analysis.
gdf = gpd.read_file("africa_grid_10km_stats.gpkg")
gdf = gdf.to_crs("ESRI:102022")

# Simple linear regression

In [3]:
# Bivariate fit: count_total explained by dist_border_km only.
# The predictor is selected with a list so it stays two-dimensional
# (one column), which is the shape scikit-learn estimators expect.
X = gdf[["dist_border_km"]]
y = gdf["count_total"]

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Single predictor -> the first (only) coefficient is the slope.
slope, intercept = model.coef_[0], model.intercept_

print(
    f"Slope (coefficient): {slope}",
    f"Intercept: {intercept}",
    sep="\n",
)

Slope (coefficient): -0.0003106870511819334
Intercept: 0.7190770073046708


# Multiple OLS regression (statsmodels, HC1 robust standard errors)

In [4]:
# Specification with log-transformed controls.
#
# Build the modelling frame in one chain:
#   1. keep only the columns the model needs,
#   2. turn +/-inf into NaN and drop every incomplete row,
#   3. add the logged covariates (distance itself stays in km).
df = (
    gdf[["count_total", "dist_border_km", "pop_est", "road_km"]]
    .copy()
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .assign(
        # Values below 1 are raised to 1 before the log, so log_pop >= 0.
        log_pop=lambda d: np.log(d["pop_est"].clip(lower=1)),
        # Floor at 1e-6 so cells with road_km == 0 still have a finite log.
        log_road=lambda d: np.log(d["road_km"].clip(lower=1e-6)),
    )
)

# OLS with HC1 (heteroskedasticity-robust) standard errors.
ols = smf.ols(
    formula="count_total ~ dist_border_km + log_pop + log_road",
    data=df,
).fit(cov_type="HC1")

print(ols.summary())

# Point estimate and confidence bounds for the border-distance coefficient
# (conf_int() is called with its default level, reported below as 95%).
beta1 = ols.params["dist_border_km"]
ci_low, ci_high = ols.conf_int().loc["dist_border_km", :]
print(f"\nInterpretation: +1 km from border -> {beta1:.4f} change in expected count_total (OLS).")
print(f"95% CI: [{ci_low:.4f}, {ci_high:.4f}]")

                            OLS Regression Results                            
Dep. Variable:            count_total   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     420.7
Date:                Sun, 30 Nov 2025   Prob (F-statistic):          9.12e-273
Time:                        15:50:09   Log-Likelihood:            -1.0999e+06
No. Observations:              294949   AIC:                         2.200e+06
Df Residuals:                  294945   BIC:                         2.200e+06
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.6870      0.072     -9.

In [5]:
# Comparison specification: the same regression on the untransformed
# covariates (pop_est and road_km in levels). It reuses `df`, the cleaned
# frame built for the log model, and the same HC1 robust covariance.
ols_raw = smf.ols(
    formula="count_total ~ dist_border_km + pop_est + road_km",
    data=df,
).fit(cov_type="HC1")

print(ols_raw.summary())

                            OLS Regression Results                            
Dep. Variable:            count_total   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     149.8
Date:                Sun, 30 Nov 2025   Prob (F-statistic):           4.89e-97
Time:                        15:50:09   Log-Likelihood:            -1.0981e+06
No. Observations:              294949   AIC:                         2.196e+06
Df Residuals:                  294945   BIC:                         2.196e+06
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.2360      0.086     -2.