## Dataset:
### Prever a área queimada de incêndios florestais, na região nordeste de Portugal, usando dados meteorológicos e outros dados
https://archive.ics.uci.edu/ml/datasets/forest+fires

## Importação de bibliotecas

In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import math

## Leitura e visualização dos dados

In [2]:
# Load the forest-fires dataset from a CSV expected in the working directory.
csv_path = "forestfires.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


## Pré-processamento

In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

In [6]:
# (rows, columns) of the frame before the categorical columns are encoded.
data.shape

(517, 13)

In [7]:
# One-hot encode the categorical columns (month, day), dropping the first level
# of each to avoid perfectly collinear indicator columns.
# dtype=int pins the indicators to 0/1 integers: the default indicator dtype
# differs between pandas versions (bool in pandas >= 2.0), which would change
# the displayed frame and make the later `.values` matrix an object array.
data = pd.get_dummies(data, drop_first=True, dtype=int)

In [8]:
# (rows, columns) after encoding: month/day are expanded into indicator columns.
data.shape

(517, 28)

In [9]:
# Inspect the encoded frame (month_* and day_* indicator columns).
data.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_may,month_nov,month_oct,month_sep,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,0,0,0,0,0,0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,1,0,0,0,0,0,1,0
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,1,0,0,1,0,0,0,0
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,0,0,0,0,0,0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,0,0,1,0,0,0


## Separação em dados de treino e teste

In [10]:
# Split the frame into the target vector (burned area) and the feature matrix
# made of every remaining column, both as NumPy arrays.
target_column = 'area'
y = data[target_column].values
X = data.drop(columns=[target_column]).values

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.3, "random_state": 1}
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, **split_options
)

## Avaliação do LinearRegression

In [12]:
# Describe the distribution of the held-out target values so the error
# metrics reported afterwards can be read against its scale.
teste = pd.DataFrame({'areay': y_test})
teste.describe()

Unnamed: 0,areay
count,156.0
mean,12.959295
std,63.149382
min,0.0
25%,0.0
50%,0.475
75%,6.1825
max,746.28


In [13]:
# Fit an ordinary least-squares baseline on the training split and predict
# the burned area for the test split.
ln = linear_model.LinearRegression().fit(X_train, y_train)
predicao_ln = ln.predict(X_test)

In [14]:
# Report test-set error of the linear model, one value per line: MSE, RMSE, MAE.
mse_ln = metrics.mean_squared_error(y_test, predicao_ln)
mae_ln = metrics.mean_absolute_error(y_test, predicao_ln)
for score in (mse_ln, math.sqrt(mse_ln), mae_ln):
    print(score)

4048.91574602298
63.631091032788206
21.6013993209473


## Avaliação do XGBRegressor (gradient boosting via XGBoost)

In [15]:
# Fit an XGBoost gradient-boosted regressor with 100 trees on the training
# split and predict the burned area for the test split.
gbr = xgb.XGBRegressor(n_estimators=100).fit(X_train, y_train)
predicao_gbr = gbr.predict(X_test)

In [16]:
# Report test-set error of the boosted model, one value per line: MSE, RMSE, MAE.
mse_gbr = metrics.mean_squared_error(y_test, predicao_gbr)
mae_gbr = metrics.mean_absolute_error(y_test, predicao_gbr)
for score in (mse_gbr, math.sqrt(mse_gbr), mae_gbr):
    print(score)

3931.88635222857
62.7047554195738
19.432671625140383
