In [1]:
import numpy as np
import pandas as pd
import altair as alt

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
    cross_val_predict
)

# Configure Altair for this session: select the "data_server" data transformer
# and the "mimetype" renderer. The renderer call is deliberately last, so its
# return value is what the cell displays.
# NOTE(review): "data_server" presumably requires the altair_data_server
# package to be installed — confirm against the project's environment file.
alt.data_transformers.enable("data_server")
alt.renderers.enable("mimetype")

RendererRegistry.enable('mimetype')

### Read data

In [2]:
# Load the raw forest-fires table (path is relative to the notebook's folder)
# and preview its first rows.
raw_data_path = "../data/raw/forestfires.csv"
forest_fires = pd.read_csv(raw_data_path)
forest_fires.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


### Split data

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
test_fraction = 0.2
split_seed = 123
train_df, test_df = train_test_split(
    forest_fires,
    test_size=test_fraction,
    random_state=split_seed,
)

### Exploratory data analysis on first six features

In [4]:
# Summary statistics for every training column; include="all" adds the
# unique/top/freq rows for the non-numeric columns.
train_summary_stats = train_df.describe(include="all")
train_summary_stats

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,413.0,413.0,413,413,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
unique,,,12,7,,,,,,,,,
top,,,aug,sun,,,,,,,,,
freq,,,149,73,,,,,,,,,
mean,4.62954,4.237288,,,90.771429,109.854237,546.031235,8.971671,18.819613,44.353511,4.085714,0.026634,13.868329
std,2.278178,1.164551,,,4.655424,63.576254,251.835608,4.581362,5.789594,16.476107,1.813679,0.330882,69.84273
min,1.0,2.0,,,50.4,2.4,7.9,0.4,4.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,,,90.2,61.1,433.3,6.4,15.4,33.0,2.7,0.0,0.0
50%,4.0,4.0,,,91.7,108.0,664.5,8.4,19.3,42.0,4.0,0.0,0.52
75%,6.0,5.0,,,92.9,141.3,713.9,10.7,22.9,53.0,5.4,0.0,6.58


None of the columns contain null values: every column has a count of 413.

### Distribution of numerical features

In [5]:
# Keep only the continuous measurements. X and Y are numeric codes and
# month/day are labels; all four are treated as categorical and plotted in
# their own section.
categorical_like_columns = ["X", "Y", "month", "day"]
train_df_numeric = train_df.drop(columns=categorical_like_columns)

# One histogram per remaining column, wrapped three panels to a row.
binned_value_axis = alt.X(
    alt.repeat(),
    type="quantitative",
    bin=alt.Bin(maxbins=50),
)
numeric_histogram = (
    alt.Chart(train_df_numeric)
    .mark_bar()
    .encode(x=binned_value_axis, y="count()")
)
numeric_histogram.repeat(train_df_numeric.columns.tolist(), columns=3)

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Feature description

### Distribution of categorical features

In [6]:

# Bar chart of row counts per level for each column treated as categorical.
categorical_columns = ["X", "Y", "month", "day"]
level_axis = alt.X(alt.repeat(), type="nominal")
categorical_bars = (
    alt.Chart(train_df)
    .mark_bar()
    .encode(x=level_axis, y="count()")
)
categorical_bars.repeat(categorical_columns)

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Correlation between numeric columns

In [7]:
columns = train_df_numeric.columns.tolist()

# Scatter-plot matrix of every pair of numeric features.
# x is bound to the grid column and y to the grid row, so all panels in one
# column share an x variable and all panels in one row share a y variable.
# No `columns=` wrap argument is passed: it only applies to a single-list
# (wrapped) repeat and is ignored when `row=`/`column=` are given.
(alt.Chart(train_df_numeric)
 .mark_circle(size=10)
 .encode(x=alt.X(alt.repeat("column"), type="quantitative"),
         y=alt.Y(alt.repeat("row"), type="quantitative"))
 .properties(width=120, height=120)
 .repeat(row=columns, column=columns))

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [8]:
# Long-form Spearman correlation table: one row per (level_0, level_1) pair of
# numeric columns, with the coefficient in "corr".
corr_df = train_df_numeric.corr("spearman").stack().reset_index(name="corr")

# Zero out the diagonal (a column paired with itself) so the trivial perfect
# correlation does not dominate the size scale. The mask is built from the
# pair labels rather than from `corr == 1`: a float equality test can miss
# diagonal entries that are not exactly 1.0 and would also wipe out a genuine
# perfect correlation between two different columns.
is_self_pair = corr_df["level_0"] == corr_df["level_1"]
corr_df.loc[is_self_pair, "corr"] = 0

# Magnitude drives the marker size; the signed value drives the colour.
corr_df["abs"] = corr_df["corr"].abs()

(
    alt.Chart(corr_df)
    .mark_circle()
    .encode(x="level_0",
            y="level_1",
            size="abs",
            color=alt.Color('corr',
                            scale=alt.Scale(scheme='blueorange',
                                            domain=(-1, 1))))
)

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### When and where did fires happen?

In [9]:
# Time and location columns only.
train_df_categories = train_df[["month", "X", "Y"]].copy()

# Keep the rows whose `area` is non-zero (the rows this section counts as
# fires). The mask comes from train_df and lines up with the subset because
# both share the same index.
has_nonzero_area = train_df["area"] != 0
fire_rows = train_df_categories[has_nonzero_area]

# One count-per-level bar chart for each of the selected columns.
fire_bars = (
    alt.Chart(fire_rows)
    .mark_bar()
    .encode(
        x=alt.X(alt.repeat(), type="nominal"),
        y="count()",
        color=alt.value("#e25822"),
    )
)
fire_bars.repeat(train_df_categories.columns.tolist())

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [11]:
# Mean of the `temp` column for each month of the training data. The variable
# name and the conversion below treat these values as degrees Celsius.
temp_in_celsius = train_df["temp"].groupby(train_df["month"]).mean()

# Celsius -> Fahrenheit: F = C * 9/5 + 32.
temp_in_fahrenheit = temp_in_celsius.mul(9 / 5).add(32)
temp_in_fahrenheit

month
apr    55.085000
aug    71.121611
dec    40.580000
feb    49.078824
jan    41.540000
jul    70.929091
jun    68.443077
mar    55.240000
may    52.340000
nov    53.240000
oct    61.250000
sep    67.193913
Name: temp, dtype: float64