In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE  # Handling class imbalance
import seaborn as sns
import matplotlib.pyplot as plt

  from pandas.core import (


In [2]:
# Define file paths
age_file = 'cleaned_flu_ml_age.csv' 
race_file = 'cleaned_flu_ml_race.csv'

In [9]:
# Define output CSV file
output_dir = 'out' 
output_csv = os.path.join(output_dir, "flu_ml_results.csv")
output_csv

'out/flu_ml_results.csv'

In [4]:
df_age = pd.read_csv(age_file)
df_race = pd.read_csv(race_file)

In [5]:
def prepare_data(df, feature_col, additional_features=None):
    """
    pre-process the data for logistic regression
    """
    df = df.dropna(subset=[feature_col, "high_incidence"])  # Drop missing values

    # Encode categorical feature
    le = LabelEncoder()
    df.loc[:, feature_col] = le.fit_transform(df[feature_col])  # Fix SettingWithCopyWarning

    # Select features
    feature_cols = [feature_col]  
    if additional_features:
        feature_cols.extend(additional_features)  # Add more predictors
    
    X = df[feature_cols]  # Predictor variables
    y = df["high_incidence"]  # Binary target

    return X, y, le  # Return label encoder for reference


In [None]:
# Prepare data for age-based prediction (including YEAR and WEEK)
X_age, y_age, age_encoder = prepare_data(df_age, "clean_age_category", ["WEEK"])

# Prepare data for race-based prediction (including YEAR and WEEK)
X_race, y_race, race_encoder = prepare_data(df_race, "RACE.CATEGORY", [ "WEEK"])

In [39]:
[(k, v) for k, v in enumerate(age_encoder.classes_)]

[(0, '0-17'), (1, '18-49'), (2, '50-64'), (3, '65-74'), (4, '75-84')]

In [31]:
[(k, v) for k, v in enumerate(race_encoder.classes_)]

[(0, 'American Indian/Alaska Native'),
 (1, 'Asian/Pacific Islander'),
 (2, 'Black'),
 (3, 'Hispanic/Latino'),
 (4, 'White')]

In [None]:
# Function to train, evaluate, and save results to CSV
def train_and_evaluate(X, y, dataset_name, output_csv):
    """
    BAM!! Let's start training and make a CSV with our output!
    """
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Train logistic regression model
    model = LogisticRegression(max_iter=500)
    model.fit(X_train_resampled, y_train_resampled)

    # Predictions
    y_pred = model.predict(X_test)
    print(y_pred)

    # Compute accuracy & classification report
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)  # Convert to dict

    # Convert classification report to DataFrame
    report_df = pd.DataFrame(report_dict).transpose()
    report_df["dataset"] = dataset_name  # Add dataset label
    report_df["accuracy"] = accuracy  # Add accuracy as a separate column

    # Save results to a CSV file (append mode)
    report_df.to_csv(output_csv, mode="a", header=not os.path.exists(output_csv), index=True)

    # Print confirmation
    print(f"Results saved to: {output_csv}")

    # Generate confusion matrix
    plot_confusion_matrix(y_test, y_pred, dataset_name)
    return model

In [40]:
# Train and evaluate models for age-based and race-based datasets
age_model = train_and_evaluate(X_age, y_age, "Age-Based Model", output_csv)
race_model = train_and_evaluate(X_race, y_race, "Race-Based Model", output_csv)

[1 0 0 ... 1 0 1]
Results saved to: out/flu_ml_results.csv
[1 1 1 ... 1 0 1]
Results saved to: out/flu_ml_results.csv


In [49]:
age_model.coef_

array([[0.24149078]])

In [50]:
race_model.coef_

array([[0.0889156]])

In [43]:
age_model.intercept_

array([-0.28622803])

In [44]:
race_model.intercept_

array([-0.17704719])

In [52]:
x = np.array([0, 1, 2, 3, 4])
x

array([0, 1, 2, 3, 4])

In [62]:
import math

In [67]:
1/(1+math.exp(-0.0889156 * 4+0.17704719))

0.5445354627113334

In [54]:
race_model.predict(x.reshape(-1, 1))



array([0, 0, 1, 1, 1])

In [73]:
1/(1+math.exp(-0.24149078 * 3+0.28622803))

0.6078406055421536

In [68]:
age_model.predict(x.reshape(-1, 1))



array([0, 0, 1, 1, 1])

Unnamed: 0,clean_age_category
0,0
1,0
2,0
3,0
4,0
...,...
124327,0
124328,0
124329,0
124330,0
