<a href="https://colab.research.google.com/github/datascience-uniandes/linear-regression-tutorial/blob/master/forestfires/linear-regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression

MINE-4101: Applied Data Science  
Universidad de los Andes  

**Task:** Predict the burned area of forest fires, in the northeast region of Portugal, by using meteorological and other data.

**Dataset:** Forest Fires, source: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/forest+fires).

**Data dictionary:**  
X - x-axis spatial coordinate within the Montesinho park map: 1 to 9  
Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9  
month - month of the year: 'jan' to 'dec'  
day - day of the week: 'mon' to 'sun'  
FFMC - FFMC index from the FWI system: 18.7 to 96.20  
DMC - DMC index from the FWI system: 1.1 to 291.3  
DC - DC index from the FWI system: 7.9 to 860.6  
ISI - ISI index from the FWI system: 0.0 to 56.10  
temp - temperature in Celsius degrees: 2.2 to 33.30  
RH - relative humidity in %: 15.0 to 100  
wind - wind speed in km/h: 0.40 to 9.40  
rain - outside rain in mm/m2 : 0.0 to 6.4  
area - the burned area of the forest (in ha): 0.00 to 1090.84 (this output variable is very skewed towards 0.0, thus **it may make sense to model with the logarithm transform**).

Last update: September, 2023

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from ydata_profiling import ProfileReport

### Reading the dataset

In [None]:
# Load the raw Forest Fires table (comma is already the default delimiter).
forest_df = pd.read_csv("./data/forestfires.csv")

In [None]:
# (rows, columns) of the raw dataset.
forest_df.shape

In [None]:
# Column dtypes as inferred by pandas when reading the CSV.
forest_df.dtypes

In [None]:
# Peek at five random rows. The seed keeps the displayed rows identical
# across "Restart & Run All" (same seed as the train/test split below).
forest_df.sample(5, random_state=1)

### Profiling the data

In [None]:
# Build an automated profiling report over the whole raw dataframe.
profile = ProfileReport(forest_df)

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

### Analyzing the target

In [None]:
# Summary statistics of the target, including the upper-tail percentiles.
area_percentiles = [0.25, 0.5, 0.75, 0.95, 0.99]
forest_df["area"].describe(percentiles=area_percentiles)

In [None]:
# Horizontal boxplot of the raw target; the mean is drawn as a marker.
fig, ax = plt.subplots(figsize=(25, 3))
ax.boxplot(forest_df["area"], showmeans=True, vert=False)
plt.show()

In [None]:
# Keep only the rows whose burned area is at or below the 90th percentile.
forest_no_outliers_df = forest_df.loc[
    forest_df["area"].le(forest_df["area"].quantile(0.9))
]

In [None]:
# (rows, columns) remaining after the 90th-percentile filter on "area".
forest_no_outliers_df.shape

In [None]:
# Same boxplot as before, now on the filtered target.
fig, ax = plt.subplots(figsize=(25, 3))
ax.boxplot(forest_no_outliers_df["area"], showmeans=True, vert=False)
plt.show()

### Selecting features to train the model

In [None]:
# Spatial coordinates plus every column from position 4 up to (excluding)
# the last one. NOTE(review): this assumes the CSV column order matches the
# data dictionary (month/day at positions 2-3, "area" last) - confirm.
features = ["X", "Y", *forest_no_outliers_df.columns[4:-1]]

In [None]:
# Show the selected feature names.
features

In [None]:
# Pairwise correlation matrix of the selected features and the target.
corr = forest_no_outliers_df[[*features, "area"]].corr()

In [None]:
# Heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap="Blues", ax=ax)
plt.show()

### Splitting train and test datasets

In [None]:
# Feature matrix: all rows, selected feature columns.
X = forest_no_outliers_df.loc[:, features]

In [None]:
# Target vector: the burned area column.
Y = forest_no_outliers_df.loc[:, "area"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# (rows, features) in the training split.
X_train.shape

In [None]:
# (rows, features) in the test split.
X_test.shape

In [None]:
# Compare the target distribution of both splits on a shared x-axis.
# Each panel is titled so it is clear which split it shows.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(25, 6))
ax1.boxplot(Y_train, showmeans=True, vert=False)
ax1.set_title("Train")
ax2.boxplot(Y_test, showmeans=True, vert=False)
ax2.set_title("Test")
ax2.set_xlabel("area")
plt.show()

### Scaling features

In [None]:
# Standardizer for the features; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Training the model

In [None]:
# Linear regression estimator with default settings.
model = LinearRegression()

In [None]:
# Fit on the standardized training features; the fitted estimator is displayed.
model.fit(X=X_train_scaled, y=Y_train)

In [None]:
# One row per feature, pairing its name with the fitted coefficient.
coefficients_df = pd.DataFrame({"feature": features, "value": model.coef_})
coefficients_df

In [None]:
# Horizontal bar chart of the coefficient of each feature.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=coefficients_df, x="value", y="feature", orient="h", ax=ax)
plt.show()

In [None]:
# Intercept of the fitted model, i.e. its prediction when every scaled feature is 0.
model.intercept_

### Evaluating the model

In [None]:
# Predictions for both splits, from the scaled features.
preds_train, preds_test = (
    model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

In [None]:
# Mean absolute error on each split.
mae_train = mean_absolute_error(Y_train, preds_train)
mae_test = mean_absolute_error(Y_test, preds_test)
print("MAE train:", mae_train)
print("MAE test:", mae_test)

In [None]:
# Root mean squared error on each split (square root of the MSE).
mse_train = mean_squared_error(Y_train, preds_train)
mse_test = mean_squared_error(Y_test, preds_test)
print("RMSE train:", np.sqrt(mse_train))
print("RMSE test:", np.sqrt(mse_test))

In [None]:
# R2 on both splits, mirroring the MAE/RMSE cells: the training score alone
# says nothing about how well the model generalizes to unseen data.
print("R2 train:", r2_score(Y_train, preds_train))
print("R2 test:", r2_score(Y_test, preds_test))

### Analyzing the error in more detail

In [None]:
# Distribution of the test residuals (actual minus predicted).
test_residuals = Y_test - preds_test
test_residuals.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99])

In [None]:
# Boxplot of the test residuals (actual minus predicted), with a grid for reading values.
fig, ax = plt.subplots(figsize=(25, 3))
ax.boxplot(Y_test - preds_test, showmeans=True, vert=False)
ax.grid()
plt.show()

In [None]:
# Boolean mask (plain NumPy array, positional) flagging test rows whose
# absolute error exceeds 7, in the units of the target.
test_abs_error = (Y_test - preds_test).abs()
worst_condition = (test_abs_error > 7).to_numpy()

In [None]:
# Features and true target of the flagged test rows, re-indexed from 0.
worst_cases = (
    pd.concat([X_test, Y_test], axis=1)
    .loc[worst_condition]
    .reset_index(drop=True)
)

In [None]:
# Attach the matching predictions. The mask is positional, so it selects
# the same rows from the prediction array as it did from the test split.
worst_cases["predictions"] = preds_test[worst_condition]

In [None]:
# Number of flagged test rows and columns.
worst_cases.shape

In [None]:
# Inspect the flagged rows: features, true area and predicted area.
worst_cases