# Logistic Regression
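- To demonstrate logistic regression, we will create a binary label: did it snow, or not? The label is 1 when either `WTEQ_BisonLake` or `WTEQ_McClurePass` is greater than zero, and 0 otherwise.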

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [6]:
# Load the averaged streamflow dataset and index the rows by the 'date' column
df = pd.read_csv('data/streamflow_prediction_dataset_averaged_cols.csv').set_index('date')

# Build the binary target: 1 when at least one WTEQ column is positive, otherwise 0
snow_columns = ['WTEQ_BisonLake', 'WTEQ_McClurePass']
df['Snow'] = np.where(df[snow_columns].gt(0).any(axis=1), 1, 0)
series_pred = df['Snow']

# Remove the WTEQ columns that define the label, together with the label itself,
# so that only predictor columns are left to be scaled
df = df.drop(columns=snow_columns + ['Snow'])

# Z-score every predictor column: subtract the column mean, divide by the column std.
# NOTE(review): these statistics are computed over all rows, including the rows
# that a later cell holds out as the test set.
df = df.sub(df.mean()).div(df.std())

# Re-attach the unscaled 0/1 label as the last column and show the frame
df = df.assign(Snow=series_pred)
display(df)

Unnamed: 0_level_0,PREC_Avg,TAVG_Avg,soilmoisture_Avg_2ft,soilmoisture_Avg_4ft,soilmoisture_Avg_8ft,soilmoisture_Avg_20ft,Snow
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-03-12,0.415169,-0.780541,-0.320587,-1.005767,-0.061212,-0.087585,1
2008-03-15,0.460106,-1.221430,-0.320587,-0.995909,-0.034313,-0.087551,1
2008-03-17,0.472362,-1.111968,-0.311604,-0.995909,-0.003572,-0.087577,1
2008-03-18,0.472362,-1.203187,-0.302621,-0.966333,0.000271,-0.087568,1
2008-03-19,0.472362,-0.737972,-0.311604,-0.966333,0.000271,-0.087577,1
...,...,...,...,...,...,...,...
2021-07-23,0.268103,1.208021,0.375604,-0.650859,-0.664505,-0.090355,0
2021-07-24,0.284443,1.107681,0.200433,-0.690293,-0.706774,-0.090407,0
2021-07-25,0.304869,1.065113,0.474418,-0.719869,-0.733672,-0.090518,0
2021-07-26,0.304869,1.305321,0.631623,-0.739586,-0.729830,-0.090664,0


In [7]:
# Split predictors (every column except 'Snow') and the 'Snow' label into
# train and test parts, holding out 20% of the rows for testing.
# random_state is fixed so that re-running the cell reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Snow']),
    df['Snow'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit a logistic-regression classifier (library default settings) on the training split
lr = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out test split
print(f"Model Accuracy: {lr.score(X_test, y_test)}\n")

# One fitted coefficient per predictor column; coef_ has a single row for a binary label
print("Coefficients:")
for position, col in enumerate(X_train.columns):
    print(f'[{col}] * {lr.coef_[0][position]}')

# Copy of the full frame with the model's prediction for every row added as a new column
df_pred = df.assign(Snow_Prediction=lr.predict(df.drop(columns='Snow')))

# Confusion matrix of actual vs. predicted labels (rows: actual, columns: predicted).
# NOTE(review): df_pred covers every row of df, so this matrix includes the rows
# the model was trained on, whereas the accuracy printed above uses only the test split.
confusion_matrix(y_true=df_pred['Snow'], y_pred=df_pred['Snow_Prediction'])

Model Accuracy: 0.95

Coefficients:
[PREC_Avg] * -0.09444768993431132
[TAVG_Avg] * -3.58325011218353
[soilmoisture_Avg_2ft] * -0.08351356312932487
[soilmoisture_Avg_4ft] * -1.7312938783592327
[soilmoisture_Avg_8ft] * 3.288066630261371
[soilmoisture_Avg_20ft] * 0.29001895506419173


array([[ 927,   84],
       [  73, 1912]])