# RETO


Os animamos a que realicéis este ejercicio, inspirado en el proyecto indicado por AEMET. En nuestro caso emplearemos como fuente de datos un repositorio abierto: el UCI Machine Learning Repository, de la Universidad de California en Irvine:
Repositorio: https://archive.ics.uci.edu/ml/datasets/Forest+Fires


Deberemos analizar y transformar los datos buscando un buen predictor para la variable de área afectada. Dado que se trata de una serie con gran cantidad de ceros (por suerte), nos recomiendan emplear una transformación logarítmica de los datos. Como log(0) no está definido, lo habitual es usar ln(área + 1) (en NumPy, `np.log1p`).
Transformación del logaritmo: https://www.cuemath.com/algebra/log-to-exponential-form/



Estudio de AEMET

[AEMET Estadística del índice meteorológico de riesgo de incendio](AEMET.pdf)	

## DATA

Cortez, Paulo and Morais, Aníbal. (2008). Forest Fires. UCI Machine Learning Repository. https://doi.org/10.24432/C5D88D.



In [None]:
!pip install ucimlrepo
!pip install matplotlib


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Download dataset 162 from the UCI Machine Learning Repository.
forest_fires = fetch_ucirepo(id=162)

# Unpack the feature table and the target table (pandas DataFrames).
X, y = forest_fires.data.features, forest_fires.data.targets


In [None]:
# Show the dataset-level metadata that came with the download.
print(forest_fires.metadata)


In [None]:
# Show the per-variable information that came with the download.
print(forest_fires.variables)


## Exploratory Data Analysis

In [None]:
# Preview the first 10 rows of the feature table.
forest_fires.data.features.head(10)

In [None]:
# Column names, non-null counts and dtypes of the feature table.
forest_fires.data.features.info()

In [None]:
# Summary statistics of the feature table.
forest_fires.data.features.describe()

In [None]:
# Preview the first 10 rows of the target table.
forest_fires.data.targets.head(10)

In [None]:
# Column names, non-null counts and dtypes of the target table.
forest_fires.data.targets.info()

In [None]:
# Summary statistics of the target table.
forest_fires.data.targets.describe()

In [None]:
# Build one working table: an independent copy of the features plus the target
# as a plain column. Nothing in this notebook shows the fetched object carrying
# a `feature_names` field, so the column names are taken from X itself, and the
# explicit copy keeps later edits to `df` from leaking back into X.
df = X.copy()
# Assumes `y` holds a single target column; take it as a Series rather than
# assigning a whole DataFrame to one column.
df['target'] = y.iloc[:, 0]

In [None]:
# List the dtype of every column in the working table.
print("Data Types:")
df.dtypes

In [None]:
# Keep only the numeric columns; the correlation analysis below needs numbers.
numeric_df = df.select_dtypes(include='number')


In [None]:
# Pearson correlation between the numeric columns. The label is printed on its
# own line so the matrix header stays aligned with its columns.
print("Correlation:", numeric_df.corr(method='pearson'), sep="\n")

In [None]:
# Histograms of the table's columns, drawn on a single 8x8 figure.
df.hist(figsize=(8, 8))
plt.tight_layout()  # adjust subplot spacing so titles and tick labels fit
plt.show()

In [None]:
# Pearson correlation between the numeric columns. The label is printed on its
# own line so the matrix header stays aligned with its columns.
print("Correlation:", numeric_df.corr(method='pearson'), sep="\n")

In [None]:
# Annotated heatmap of the Pearson correlation between the numeric columns.
corr_matrix = numeric_df.corr(method='pearson')

fig, ax = plt.subplots(figsize=(8, 8))

# Pin the colour scale to the full [-1, 1] correlation range.
cax = ax.matshow(corr_matrix, vmin=-1, vmax=1)
fig.colorbar(cax)

# One tick per column, labelled with the column name on both axes.
ticks = np.arange(len(corr_matrix.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(corr_matrix.columns, rotation=45, ha='left')
ax.set_yticklabels(corr_matrix.columns)

# Write each coefficient inside its cell: matrix row -> y, matrix column -> x.
for (row, col), coeff in np.ndenumerate(corr_matrix.to_numpy()):
    ax.text(col, row, f'{coeff:.2f}', ha='center', va='center', color='black')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between the columns of the working table.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.pairplot(df);


In [None]:
# Distribution of the target: histogram with a kernel-density estimate on top.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.displot(df['target'], kde=True);

In [None]:
# Month abbreviations in calendar order.
month_order = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]

# Store the month as an ordered categorical so sorting follows the calendar
# rather than the alphabet.
df['month'] = pd.Categorical(
    df['month'],
    categories=month_order,
    ordered=True,
)

# Rows per month, listed in calendar order.
month_counts = df['month'].value_counts().sort_index()
month_counts

In [None]:
# Bar chart of the number of fires recorded in each month.
plt.figure(figsize=(10, 6))
sns.barplot(x=month_counts.index, y=month_counts.values, order=month_order)
plt.xlabel('Month')
plt.ylabel('Number of Fires')  # fixes the "Number or fires" typo
plt.title('Number of Fires by Month')

# Write each count just above its bar. The loop position matches the bar
# position because month_counts is already sorted in month_order.
for position, count in enumerate(month_counts.values):
    plt.text(position, count + 0.2, str(count), ha='center', va='bottom', fontsize=10)

plt.show()

In [None]:
# Total target value per month. `observed` is passed explicitly so the result
# does not depend on the pandas version's default for categorical keys; months
# with no rows are then added back as 0 by the reindex.
target_sum = (
    df.groupby('month', observed=True)['target']
    .sum()
    .reindex(month_order, fill_value=0)
)

plt.figure(figsize=(10, 6))
sns.barplot(x=target_sum.index, y=target_sum.values, order=month_order)
plt.xlabel('Month')
plt.ylabel('Sum of Target')
plt.title('Sum of Target by Month')

# Label each bar with its total, rounded to the nearest integer rather than
# truncated.
for position, total in enumerate(target_sum.values):
    plt.text(position, total + 0.2, f'{total:.0f}', ha='center', va='bottom', fontsize=10)

plt.show()

In [None]:
# Mean target value per month. With observed=True a month that has no rows is
# dropped by the groupby and re-added as 0 by the reindex, instead of appearing
# as NaN (which the reindex would not fill and the label code could not format).
target_mean = (
    df.groupby('month', observed=True)['target']
    .mean()
    .reindex(month_order, fill_value=0)
)

plt.figure(figsize=(10, 6))
sns.barplot(x=target_mean.index, y=target_mean.values, order=month_order)
plt.xlabel('Month')
plt.ylabel('Mean of Target')
plt.title('Mean of Target by Month')

# Label each bar with its mean to one decimal place; truncating to an integer
# hid the fractional part of every mean.
for position, mean_value in enumerate(target_mean.values):
    plt.text(position, mean_value + 0.2, f'{mean_value:.1f}', ha='center', va='bottom', fontsize=10)

plt.show()

In [None]:
# Weekday abbreviations, Monday first.
day_order = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# Rows per weekday in that order; a weekday with no rows is reported as 0.
day_counts = df['day'].value_counts().reindex(day_order, fill_value=0)

plt.figure(figsize=(10, 6))
sns.barplot(x=day_counts.index, y=day_counts.values, order=day_order)
plt.gca().set(
    xlabel='Day of the Week',
    ylabel='Number of Fires',
    title='Number of Fires by Day of the Week',
)

# Add the numeric value above each bar.
for position, count in enumerate(day_counts.values):
    plt.text(position, count + 0.2, str(count), ha='center', va='bottom', fontsize=10)

plt.show()

## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


In [None]:
# One-hot encode the two categorical columns so the model receives numeric
# inputs; `day` and `month` are replaced by one indicator column per category.
df = pd.get_dummies(df, columns=['day', 'month'])
df.info()


In [None]:
# Features: every column except the target.
X = df.drop(columns='target')

# LogisticRegression is a classifier and rejects a continuous target, and the
# confusion matrix further down needs discrete classes as well. The target is
# therefore turned into a binary label: 1 when it is greater than zero,
# 0 otherwise.
y = (df['target'] > 0).astype(int)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Print the shape of each split: train/test features, then train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scal = scaler.transform(X_train)
X_test_scal = scaler.transform(X_test)

In [None]:
# Fit a logistic-regression classifier with default settings on the scaled
# training data, then predict labels for the scaled test data.
log_r = LogisticRegression().fit(X_train_scal, y_train)
y_pred = log_r.predict(X_test_scal)


In [None]:
# Score the fitted classifier on the held-out test split.
log_r.score(X_test_scal, y_test)

In [None]:

# Confusion matrix normalized over the true labels, so each row sums to 1,
# drawn as an annotated heatmap.
normalized_cm = confusion_matrix(y_test, y_pred, normalize='true')
sns.heatmap(normalized_cm, annot=True);