In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample

In [2]:
floods_df = pd.read_csv('/home/student/kenya_counties_rainfall_dataset.csv')

In [3]:
categorical_columns = ['province', 'district', 'division', 'month']
numerical_columns = ['rainfall', 'elevation', 'slope', 'clay', 'humidity']

In [4]:
floods_df['flood_risk'] = ((floods_df['rainfall'] > 120) | (floods_df['humidity'] > 80)).astype(int)

In [6]:
print(floods_df.isna().sum())

divid                0
province             0
district             0
division             0
Year                 0
month                0
rainfall             0
elevation            0
slope                0
clay                 0
humidity             0
Case_Outbreak_RVF    0
dtype: int64


In [6]:
# Features and target variable
X = floods_df[numerical_columns + categorical_columns]  # Features
y = floods_df['flood_risk']

In [25]:
# One-hot encode categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

In [26]:
# Create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, C=0.01))  # Higher C means more regularization
])

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [28]:
# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

# Perform cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Average Cross-Validation Accuracy: {cv_scores.mean()}')

Accuracy: 0.9948231552868526
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     49762
           1       1.00      0.94      0.97      4325

    accuracy                           0.99     54087
   macro avg       1.00      0.97      0.98     54087
weighted avg       0.99      0.99      0.99     54087

Cross-Validation Accuracy Scores: [0.98721504 0.9922902  0.99586777 0.99503564 0.99869651]
Average Cross-Validation Accuracy: 0.9938210312948378
