In [2]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score

In [3]:
df = pd.read_csv('data/fires.csv')

In [4]:
month_to_idx = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12
}

day_to_idx = {
    'mon': 1,
    'tue': 2,
    'wed': 3,
    'thu': 4,
    'fri': 5,
    'sat': 6,
    'sun': 7
}

# categorical value to numerical
df['month'] = df['month'].map(month_to_idx)
df['day'] = df['day'].map(day_to_idx)

In [5]:
df.describe()
# df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,7.475822,4.259188,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,2.27599,2.072929,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,1.0,1.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,7.0,2.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,8.0,5.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,9.0,6.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,12.0,7.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


In [6]:
X = df.drop(['area'], axis=1)
y = df['area']

In [7]:
def transform(value):
    return np.log(value+1)

y = y.apply(transform)
y

0      0.000000
1      0.000000
2      0.000000
3      0.000000
4      0.000000
         ...   
512    2.006871
513    4.012592
514    2.498152
515    0.000000
516    0.000000
Name: area, Length: 517, dtype: float64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# vanilla linear regression model as baseline

vanilla_reg = LinearRegression()

vanilla_reg.fit(X_train, y_train)
y_pred = vanilla_reg.predict(X_test)
vanilla_reg.score(X_test, y_test)

0.02045057580570142

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test, y_pred)}")
print(f"R2 score: {r2_score(y_test, y_pred)}")

Mean squared error: 2.1529140377163096
Mean absolute error: 1.1624955004864979
R2 score: 0.02045057580570142


In [19]:
reg = SGDRegressor()

parameters = {
    "alpha": [0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001],
    "epsilon": [0.01, 0.05, 0.1, 0.5],
}
    
search = GridSearchCV(reg, parameters)
search.fit(X_train, y_train)
search.best_score_

-0.05931886335432117