In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
training_df = pd.read_csv('./data/weather/all_weather_data.csv')
training_df.drop(columns=['Unnamed: 0', 'index', 'DATE'], inplace=True)
training_df['VALUE_temp'] = training_df['VALUE_temp'].replace('---', np.nan)
training_df.dropna(how='any', inplace=True)
training_df.describe()

Unnamed: 0,VALUE_precip,PRIOR_precip,DPT (F),DBT (F),Wind Speed (m/s),Wind Dir,Fire
count,316709.0,316709.0,316709.0,316709.0,316709.0,316709.0,316709.0
mean,0.073864,6922.632984,45.783545,61.272468,2.741261,213.151682,0.339722
std,0.380217,1616.715451,7.3642,12.650176,1.568361,57.310391,0.473616
min,-0.3,3360.89,32.0,32.0,0.0,10.0,0.0
25%,0.0,5303.2,39.5825,51.8225,1.7,171.25,0.0
50%,0.0,7372.51,45.3875,59.9375,2.429167,211.25,0.0
75%,0.0,7968.97,51.1625,69.5525,3.416667,256.25,1.0
max,42.2,8619.71,105.2825,110.915,19.7125,849.208333,1.0


Logistic Regression

In [3]:
training_df = training_df.sample(frac=1).reset_index(drop=True)
training_data = training_df.iloc[:int(len(training_df)*.7)]
validation_data = training_df.iloc[int(len(training_df)*.7):]

In [4]:
#Split data into X and y for model
feature_columns = training_data.columns.drop('Fire') 
X_train = training_data[training_data.columns.drop('Fire')]
y_train = training_data['Fire']

X_val = validation_data[validation_data.columns.drop('Fire')]
y_val = validation_data['Fire'] 

#Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#More iter?
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val_scaled)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))


Validation Accuracy: 0.6980413206613831
Validation Confusion Matrix:
 [[52646  9984]
 [18706 13677]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.74      0.84      0.79     62630
         1.0       0.58      0.42      0.49     32383

    accuracy                           0.70     95013
   macro avg       0.66      0.63      0.64     95013
weighted avg       0.68      0.70      0.68     95013



75% of our data is no fire, let's resample to get a better understanding of how weather affects fires in particular

In [4]:
null_case = training_df[training_df['Fire']== 0]
null_case = null_case.sample(frac=1)
idxs = null_case.iloc[:int(len(null_case)/2)].index.to_list()
resampled = training_df.drop(idxs)
resampled.describe()

Unnamed: 0,VALUE_precip,PRIOR_precip,DPT (F),DBT (F),Wind Speed (m/s),Wind Dir,Fire
count,212151.0,212151.0,212151.0,212151.0,212151.0,212151.0,212151.0
mean,0.067439,7132.457471,46.022307,61.788064,2.672169,212.344105,0.507153
std,0.330081,1527.581532,7.573168,12.87119,1.536407,57.743709,0.49995
min,-0.2,3360.89,32.075,32.0,0.0,10.0,0.0
25%,0.0,6653.16,39.6275,52.025,1.65,170.416667,0.0
50%,0.0,7372.51,45.605,60.5375,2.383333,210.833333,1.0
75%,0.0,8619.71,51.5975,70.4075,3.3375,255.416667,1.0
max,42.2,8619.71,105.2825,110.915,19.7125,849.208333,1.0


In [5]:
resampled = resampled.sample(frac=1).reset_index(drop=True)
resampled_train = resampled.iloc[:int(len(resampled)*.7)]
resampled_val = resampled.iloc[int(len(resampled)*.7):]

In [9]:
#Split data into X and y for model
feature_columns = resampled_train.columns.drop('Fire') 
X_train = resampled_train[resampled_train.columns.drop('Fire')]
y_train = resampled_train['Fire']

X_val = resampled_val[resampled_val.columns.drop('Fire')]
y_val = resampled_val['Fire'] 

#Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#More iter?
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val_scaled)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.6973415454231217
Validation Confusion Matrix:
 [[18617 12674]
 [ 6589 25766]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.74      0.59      0.66     31291
         1.0       0.67      0.80      0.73     32355

    accuracy                           0.70     63646
   macro avg       0.70      0.70      0.69     63646
weighted avg       0.70      0.70      0.69     63646



Try RF Classification

In [10]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [28]:
model = RandomForestClassifier(random_state=42) #Set seed for reproducability



model.fit(X_train_scaled, y_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val_scaled)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.886355780410395
Validation Confusion Matrix:
 [[27710  3679]
 [ 3554 28703]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.88      0.88     31389
         1.0       0.89      0.89      0.89     32257

    accuracy                           0.89     63646
   macro avg       0.89      0.89      0.89     63646
weighted avg       0.89      0.89      0.89     63646



Try Gradient Boosting

In [20]:
model = XGBClassifier(random_state=42,  eval_metric='error')

model.fit(X_train_scaled, y_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val_scaled)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8278132168557333
Validation Confusion Matrix:
 [[23914  7483]
 [ 3476 28773]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.76      0.81     31397
         1.0       0.79      0.89      0.84     32249

    accuracy                           0.83     63646
   macro avg       0.83      0.83      0.83     63646
weighted avg       0.83      0.83      0.83     63646



k-means

In [11]:
from sklearn.cluster import KMeans

In [19]:
#Split data into X and y for model
feature_columns = resampled_train.columns.drop('Fire') 
X_train = resampled_train[resampled_train.columns.drop('Fire')]
y_train = resampled_train['Fire']

X_val = resampled_val[resampled_val.columns.drop('Fire')]
y_val = resampled_val['Fire'] 

#Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = KMeans(n_clusters=2, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val_scaled)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.5460201740879238
Validation Confusion Matrix:
 [[18243 13146]
 [15748 16509]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.54      0.58      0.56     31389
         1.0       0.56      0.51      0.53     32257

    accuracy                           0.55     63646
   macro avg       0.55      0.55      0.55     63646
weighted avg       0.55      0.55      0.55     63646



GMM

In [20]:
from sklearn.mixture import GaussianMixture

In [24]:

#Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = GaussianMixture(n_components=2, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val_scaled)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.5324607987933255
Validation Confusion Matrix:
 [[ 3787 27602]
 [ 2155 30102]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.64      0.12      0.20     31389
         1.0       0.52      0.93      0.67     32257

    accuracy                           0.53     63646
   macro avg       0.58      0.53      0.44     63646
weighted avg       0.58      0.53      0.44     63646



In [45]:

#Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = GaussianMixture(n_components=2, random_state=42)
model.fit(X_train)

# Predict and assess the fit on validation data
y_val_pred = model.predict(X_val)

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.5465543789083367
Validation Confusion Matrix:
 [[ 6946 24443]
 [ 4417 27840]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.61      0.22      0.32     31389
         1.0       0.53      0.86      0.66     32257

    accuracy                           0.55     63646
   macro avg       0.57      0.54      0.49     63646
weighted avg       0.57      0.55      0.49     63646



GMM + RF

In [26]:
from sklearn.mixture import GaussianMixture

In [None]:
gmm = GaussianMixture(n_components=2, random_state=42)  
gmm.fit(X_train_scaled)


gmm_train_clusters = gmm.predict(X_train_scaled)       
gmm_train_probs = gmm.predict_proba(X_train_scaled) 


gmm_val_clusters = gmm.predict(X_val_scaled)
gmm_val_probs = gmm.predict_proba(X_val_scaled)

In [44]:
# Add GMM features to training set
X_train_augmented = np.concatenate(
    [X_train, gmm_train_clusters[:, np.newaxis], gmm_train_probs], axis=1
)

# Add GMM features to validation set
X_val_augmented = np.concatenate(
    [X_val_scaled, gmm_val_clusters[:, np.newaxis], gmm_val_probs], axis=1
)

# Shapes after augmentation
print("X_train_augmented shape:", X_train_augmented.shape)
print("X_val_augmented shape:", X_val_augmented.shape)


X_train_augmented shape: (148505, 10)
X_val_augmented shape: (63646, 10)


In [46]:

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_augmented, y_train)

# Predict on validation set
rf_preds = rf.predict(X_val_augmented)
rf_probs = rf.predict_proba(X_val_augmented)[:, 1]  


In [None]:
# Weight the predictions from GMM and RF
gmm_weights = 0.9
rf_weights = 0.1


ensemble_probs = (gmm_weights * gmm_val_probs[:, 1]) + (rf_weights * rf_probs)

y_val_pred = (ensemble_probs > 0.5).astype(int) 

# Print evaluation metrics
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))


Validation Accuracy: 0.5465072431888885
Validation Confusion Matrix:
 [[ 6943 24446]
 [ 4417 27840]]
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       0.61      0.22      0.32     31389
         1.0       0.53      0.86      0.66     32257

    accuracy                           0.55     63646
   macro avg       0.57      0.54      0.49     63646
weighted avg       0.57      0.55      0.49     63646



Other models?

In [10]:
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier


In [7]:
#Split data into X and y for model
feature_columns = resampled_train.columns.drop('Fire') 
X_train = resampled_train[resampled_train.columns.drop('Fire')]
y_train = resampled_train['Fire']

X_val = resampled_val[resampled_val.columns.drop('Fire')]
y_val = resampled_val['Fire'] 

#Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [9]:
np.sqrt(148505)

385.36346479654765

In [12]:

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    #"CatBoost": CatBoostRegressor(random_state=42, verbose=0),
    #"SVR": SVR(),
    "KNN Reg": KNeighborsRegressor(),
    "KNN Class": KNeighborsClassifier(n_neighbors=300)
}


In [14]:
results = []


for name, model in models.items():
    print(f"Model: {name}")
   
    model.fit(X_train_scaled, y_train)
    
    y_val_pred = model.predict(X_val_scaled)
    y_val_pred = (y_val_pred> 0.5).astype(int) 
    
    
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_val, y_val_pred),

    })

results_df = pd.DataFrame(results)
results_df.sort_values(by="Accuracy", ascending=True, inplace=True)

results_df

Model: Linear Regression
Model: Ridge
Model: Lasso
Model: ElasticNet
Model: Decision Tree
Model: Random Forest
Model: Gradient Boosting
Model: XGBoost
Model: LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005702 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 148505, number of used features: 7
[LightGBM] [Info] Start training from score 0.507256
Model: KNN Reg
Model: KNN Class


Unnamed: 0,Model,Accuracy
2,Lasso,0.506913
3,ElasticNet,0.506913
0,Linear Regression,0.702259
1,Ridge,0.702259
10,KNN Class,0.740738
6,Gradient Boosting,0.794928
8,LightGBM,0.812101
7,XGBoost,0.825362
9,KNN Reg,0.829023
4,Decision Tree,0.857886
