# 02_lag_analysis

In [None]:
## Lagged Climate–Lassa Analysis

This notebook explores lagged relationships between climate variables
(rainfall and temperature) and weekly Lassa fever cases in Nigeria.

We test whether climate signals precede increases in cases; if they do, that
would support their use in early warning systems.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
# Read the weekly state-level panel (cases plus current and lagged rainfall/temperature).
LAGGED_PANEL_CSV = "../data/processed/model/lassa_era5_weekly_lagged_2018_2021.csv"
df = pd.read_csv(LAGGED_PANEL_CSV)

# Report (rows, columns), then preview the first records as the cell output.
print(df.shape)
df.head()


(7733, 22)


Unnamed: 0,state,year,week,cases,rain_mm,temp_c,rain_mm_lag1,temp_c_lag1,rain_mm_lag2,temp_c_lag2,...,rain_mm_lag4,temp_c_lag4,rain_mm_lag5,temp_c_lag5,rain_mm_lag6,temp_c_lag6,rain_mm_lag7,temp_c_lag7,rain_mm_lag8,temp_c_lag8
0,Abia,2018,1,0,0.029167,27.110085,,,,,...,,,,,,,,,,
1,Abia,2018,2,0,2.389669,28.605963,0.029167,27.110085,,,...,,,,,,,,,,
2,Abia,2018,3,0,2.002001,29.030485,2.389669,28.605963,0.029167,27.110085,...,,,,,,,,,,
3,Abia,2018,4,0,0.611703,28.548828,2.002001,29.030485,2.389669,28.605963,...,,,,,,,,,,
4,Abia,2018,5,0,0.249704,30.093192,0.611703,28.548828,2.002001,29.030485,...,0.029167,27.110085,,,,,,,,


In [9]:
# Aggregate national cases: one row per (year, week), summing cases over all state rows
nat = (
    df.groupby(["year", "week"], as_index=False)["cases"]
      .sum()
      .rename(columns={"cases": "nat_cases"})
)

# Merge back onto the state-week rows.
# Any nat_cases column left by an earlier run is dropped first: merging onto a
# frame that already has it produces nat_cases_x / nat_cases_y, after which the
# df["nat_cases"] lookups below raise KeyError and the cell cannot be re-run.
df = (
    df.drop(columns=["nat_cases"], errors="ignore")
      .merge(nat, on=["year", "week"], how="left")
)

# Define outbreak weeks (upper quartile of national weekly cases).
# The quantile is taken over state-week rows, so each week's total counts once
# per state row present for that week. The label is national: every state row
# of a flagged week gets outbreak_week = 1.
threshold = df["nat_cases"].quantile(0.75)
df["outbreak_week"] = (df["nat_cases"] >= threshold).astype(int)

df["outbreak_week"].value_counts()


outbreak_week
0    5735
1    1998
Name: count, dtype: int64

In [4]:
# NOTE(review): this cell repeats the national aggregation / outbreak labelling
# already performed earlier in the notebook (its execution count is out of
# order). It is kept so that `nat`, `threshold` and `outbreak_week` stay defined
# for any later cell, but it is written to be safe to run after that earlier
# cell. Consider deleting one of the two copies.

# Aggregate national cases: one row per (year, week), summing cases over all state rows
nat = (
    df.groupby(["year", "week"], as_index=False)["cases"]
      .sum()
      .rename(columns={"cases": "nat_cases"})
)

# Merge back onto the state-week rows.
# df may already carry nat_cases from a previous merge; without dropping it the
# merge yields nat_cases_x / nat_cases_y and df["nat_cases"] raises KeyError
# (which is what happens on Restart & Run All with two copies of this step).
df = (
    df.drop(columns=["nat_cases"], errors="ignore")
      .merge(nat, on=["year", "week"], how="left")
)

# Define outbreak weeks (top 25% of national burden)
threshold = df["nat_cases"].quantile(0.75)
df["outbreak_week"] = (df["nat_cases"] >= threshold).astype(int)

df["outbreak_week"].value_counts()


outbreak_week
0    5735
1    1998
Name: count, dtype: int64

In [10]:
# Features to use: rainfall lagged 4 and 6 weeks, temperature lagged 4 weeks
features = ["rain_mm_lag4", "rain_mm_lag6", "temp_c_lag4"]

# Keep only rows that have all required lag values
# (the first weeks of each state's series have NaN lags)
df2 = df.dropna(subset=features).copy()

# Full feature matrix / target over all years (not used for fitting below)
X = df2[features]
y = df2["outbreak_week"]

# Train on 2018–2020, test on 2021 (time split, so the model is evaluated
# only on weeks later than anything it was fitted on)
train = df2[df2["year"] <= 2020]
test  = df2[df2["year"] == 2021]

X_train = train[features]
y_train = train["outbreak_week"]

X_test  = test[features]
y_test  = test["outbreak_week"]

# LogisticRegression, confusion_matrix and classification_report are imported
# once in the notebook's import cell; they are not re-imported here.
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Rows of the confusion matrix are true classes, columns are predicted classes.
# NOTE(review): the saved 2021 output shows recall 0.03 for the outbreak class,
# i.e. the model almost always predicts "no outbreak". Consider
# class_weight="balanced" or a tuned decision threshold before interpreting it.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[1503   51]
 [ 359   11]]
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      1554
           1       0.18      0.03      0.05       370

    accuracy                           0.79      1924
   macro avg       0.49      0.50      0.47      1924
weighted avg       0.69      0.79      0.72      1924

