In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform
from sklearn.metrics import f1_score

ModuleNotFoundError: No module named 'pandas'

In [2]:
import sys
print(sys.executable)

c:\Users\ryanh\AppData\Local\Programs\Python\Python313\python.exe


In [None]:
df = pd.read_csv("../data/processed/test_dataset.csv")
df.head()

Unnamed: 0,WEEK_SIN,WEEK_COS,INFLUENZA,COCCIDIOIDOMYCOSIS,CAMPYLOBACTERIOSIS,SALMONELLOSIS,PREV1_CASES,PREV2_CASES,4_WEEK_AVG,TAVG,TMAX,TMIN,PRCP,FUTURE_OUTBREAK
0,0.120537,0.992709,0.0,0.0,1.0,0.0,0.0,0.0,47.0,8.714286,16.7,3.9,0.61,0.0
1,0.239316,0.970942,0.0,0.0,1.0,0.0,47.0,0.0,38.5,13.642857,18.9,8.9,0.0,0.0
2,0.354605,0.935016,0.0,0.0,1.0,0.0,30.0,47.0,25.666667,11.857143,18.9,7.2,0.0,0.0
3,0.464723,0.885456,0.0,0.0,1.0,0.0,0.0,30.0,19.75,11.642857,17.2,7.2,0.0,0.0
4,0.568065,0.822984,0.0,0.0,1.0,0.0,2.0,0.0,14.75,10.257143,18.9,3.9,0.0,0.0


In [None]:
df["FUTURE_OUTBREAK"].value_counts()

FUTURE_OUTBREAK
0.0    823
1.0    272
Name: count, dtype: int64

fairly unbalanced dataset, plus catching outbreaks more important, so will use class_weight=balanced when training classifier

in addition, will try optimizing for f1 score (optimizing precision and recall) instead of optimizing for accuracy

In [None]:
y = df["FUTURE_OUTBREAK"]
X = df.drop("FUTURE_OUTBREAK", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=67
)

In [None]:
dummy = DummyClassifier()
dummy_df = pd.DataFrame(cross_validate(dummy, X_train, y_train, return_train_score=True))
dummy_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.000568,0.000533,0.751515,0.751524
1,0.0005,0.000557,0.756098,0.750381
2,0.00045,0.000452,0.75,0.751903
3,0.000372,0.000447,0.75,0.751903
4,0.000332,0.001921,0.75,0.751903


dummy classifier always predicts 0, so gets 0.75 accuracy, but 0 precision and 0 recall

In [None]:
dummy.fit(X_train, y_train)
f1_score(y_test, dummy.predict(X_test))

0.0

In [None]:
numeric_transformer = StandardScaler()
numeric_cols = X_train.columns.tolist()[6:]
numeric_cols

['PREV1_CASES', 'PREV2_CASES', '4_WEEK_AVG', 'TAVG', 'TMAX', 'TMIN', 'PRCP']

In [None]:
preprocessor = make_column_transformer(
    (
      numeric_transformer,
      numeric_cols   
    ) 
)

In [None]:
lr = LogisticRegression(max_iter=1000, random_state=67, class_weight="balanced")
pipe = make_pipeline(preprocessor, lr)
param_choices = {"logisticregression__C": loguniform(1e-3, 1e3)}

random_search = RandomizedSearchCV(pipe, param_choices, random_state=67, scoring="f1", n_iter=50, n_jobs=-1)
random_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=67))])
,param_distributions,{'logisticregression__C': <scipy.stats....001EA08969AE0>}
,n_iter,50
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,67

0,1,2
,transformers,"[('standardscaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(4.088357297845245)
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,67
,solver,'lbfgs'
,max_iter,1000


In [None]:
pd.DataFrame(random_search.cv_results_)[["mean_test_score", "param_logisticregression__C", "mean_fit_time", "rank_test_score"]].set_index("rank_test_score").sort_index().head()

Unnamed: 0_level_0,mean_test_score,param_logisticregression__C,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.605236,4.088357,0.007122
2,0.602968,2.050534,0.009654
3,0.601634,3.558091,0.006976
4,0.599773,0.097623,0.006909
5,0.599249,1.88412,0.007286


btw, random search automatically retrains best estimator on entire training set

In [None]:
final_model = random_search.best_estimator_

In [None]:
final_model.named_steps["logisticregression"].coef_[0]

array([ 7.18944606, -2.0365033 ,  3.27357276, -0.42890457,  0.24466164,
        0.46580555, -0.07474196])

In [None]:
feature_names = X_train.columns.tolist()
feature_names

['WEEK_SIN',
 'WEEK_COS',
 'INFLUENZA',
 'COCCIDIOIDOMYCOSIS',
 'CAMPYLOBACTERIOSIS',
 'SALMONELLOSIS',
 'PREV1_CASES',
 'PREV2_CASES',
 '4_WEEK_AVG',
 'TAVG',
 'TMAX',
 'TMIN',
 'PRCP']

In [None]:
importance_df = (
                    pd.DataFrame({"Feature": feature_names, "Coefficient": coef})
                    .sort_values("Coefficient", key=abs, ascending=False)
                    .head(10)
                )


NameError: name 'coef' is not defined