# Modelos de Predicción

## Importación de librerías

In [1]:
import os.path
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Constantes

Rutas usuales que se ocuparán en el notebook

In [2]:
# Flag: save charts
save_graf = False

# Image size
my_figsize = (10, 5)

# PATHS
# Parent directory, used as the base for every other path
MAIN_PATH = ".."

IMG_PATH = os.path.join(MAIN_PATH, "imagenes")
DATA_PATH = os.path.join(MAIN_PATH, "data")

# Wildfire dataset: folder, data file and column-list file
WF_FOLDER_PATH = os.path.join(DATA_PATH, "wildfires_us")
WF_DATA_PATH = os.path.join(WF_FOLDER_PATH, "WILDFIRES_USA.csv")
WF_DATA_COLUMNS_PATH = os.path.join(WF_FOLDER_PATH, "WILDFIRES_USA_COLUMNS.csv")

## Funciones auxiliares

Función que se ocupará para imprimir la información (número de filas y columnas) de un DataFrame.

In [3]:
def print_cantidad(dataframe):
    """Print the number of rows and columns of a DataFrame.

    Reads ``dataframe.shape`` and prints a one-line summary (in Spanish)
    with the first two dimensions. Returns ``None``.
    """
    n_filas = dataframe.shape[0]
    n_columnas = dataframe.shape[1]
    plantilla = "El dataset tiene una cantidad de {} datos y {} variables."
    print(plantilla.format(n_filas, n_columnas))

## Carga de datos

### Columnas a ocupar

Se escogen las columnas a ocupar dependiendo de la importancia que tengan. Se omiten algunas columnas, tales como las que son para el ID, el nombre que tuvo el incendio o la columna que indica de dónde se obtuvo el incendio, pues no deberían afectar a la predicción.

In [4]:
# All columns: the column-list CSV is read and the name of its first
# column is split on commas to obtain the individual column names.
primer_encabezado = pd.read_csv(WF_DATA_COLUMNS_PATH).columns[0]
columnas = str(primer_encabezado).split(",")

# Columns used in the analysis (currently a full copy of `columnas`)
columnas_ocupadas = list(columnas)

# Columns left out of the analysis; empty while `columnas_ocupadas`
# contains every entry of `columnas`.
columnas_sin_ocupar = [
    nombre for nombre in columnas if nombre not in columnas_ocupadas
]

### Carga de Datos

In [5]:
df = pd.read_csv(WF_DATA_PATH)

# Convert the two date-time columns to the datetime dtype
for columna_fecha in ("DISC_DATE_TIME", "CONT_DATE_TIME"):
    df[columna_fecha] = pd.to_datetime(df[columna_fecha])

# NOTE(review): the recorded output shows a truncated warning right after
# this cell; it may come from read_csv type inference -- confirm on re-run.
print_cantidad(df)

df.head()

  interactivity=interactivity, compiler=compiler, result=result)


El dataset tiene una cantidad de 581159 datos y 17 variables.


Unnamed: 0,FIRE_YEAR,STAT_CAUSE_DESCR,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_NAME,DISC_DATE_TIME,DISC_MONTH,DISC_DAY_OF_WEEK,DISC_TIME,CONT_DATE_TIME,CONT_MONTH,CONT_DAY_OF_WEEK,CONT_TIME
0,2005,Miscellaneous,0.1,A,40.036944,-121.005833,CA,63,Plumas,2005-02-02 13:00:00,February,Wednesday,13,2005-02-02 17:30:00,February,Wednesday,17
1,2004,Lightning,0.25,A,38.933056,-120.404444,CA,61,Placer,2004-05-12 08:45:00,May,Wednesday,8,2004-05-12 15:30:00,May,Wednesday,15
2,2004,Debris Burning,0.1,A,38.984167,-120.735556,CA,17,El Dorado,2004-05-31 19:21:00,May,Monday,19,2004-05-31 20:24:00,May,Monday,20
3,2004,Lightning,0.1,A,38.559167,-119.913333,CA,3,Alpine,2004-06-28 16:00:00,June,Monday,16,2004-07-03 14:00:00,July,Saturday,14
4,2004,Lightning,0.1,A,38.559167,-119.933056,CA,3,Alpine,2004-06-28 16:00:00,June,Monday,16,2004-07-03 12:00:00,July,Saturday,12


## Preparación de la data

### Convertir variables categóricas a numéricas

In [None]:
# Work on a copy so the raw frame `df` stays untouched, and drop the two
# datetime columns plus COUNTY in a single call.
df_copy = df.copy().drop(
    columns=["DISC_DATE_TIME", "CONT_DATE_TIME", "COUNTY"]
)

# Text columns that are replaced by integer codes
columnas = [
    "STATE",
    "DISC_MONTH",
    "DISC_DAY_OF_WEEK",
    "CONT_MONTH",
    "CONT_DAY_OF_WEEK",
    "FIRE_SIZE_CLASS",
    "FIPS_NAME",
    "STAT_CAUSE_DESCR",
]

# Fit one LabelEncoder per column and keep each of them in `encoders`.
# Refitting a single shared encoder would discard every mapping except the
# last one, making it impossible to decode a column or to encode new data
# consistently. `le` still ends up bound to the encoder of the last column
# ("STAT_CAUSE_DESCR"), as before.
encoders = {}
for col in columnas:
    le = preprocessing.LabelEncoder()
    df_copy[col] = le.fit_transform(df_copy[col])
    encoders[col] = le

### Separación de la data en entrenamiento y prueba

In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column of the encoded frame except the target
X = df_copy.drop(columns="STAT_CAUSE_DESCR").values
# Target: the cause labels taken from the un-encoded frame
y = df["STAT_CAUSE_DESCR"].values

# Hold out 30% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

## Generación de modelos

Empezamos a probar la clasificación con distintos métodos

### Naïve Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB
# Train
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict
y_pred_gnb = gnb.predict(X_test)

# Report. Some classes are never predicted, which makes their precision
# undefined; zero_division=0 reports them as 0.0 without emitting the
# undefined-metric warning.
print(classification_report(y_test, y_pred_gnb, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

            Arson       0.37      0.18      0.24     27621
         Campfire       0.16      0.29      0.21      8562
         Children       0.02      0.13      0.03      3706
   Debris Burning       0.40      0.82      0.53     42022
    Equipment Use       0.19      0.01      0.03      9861
        Fireworks       0.07      0.06      0.06       788
        Lightning       0.62      0.37      0.47     34400
    Miscellaneous       0.43      0.14      0.21     30883
Missing/Undefined       0.20      0.04      0.06      8175
        Powerline       0.00      0.00      0.00      1662
         Railroad       0.00      0.00      0.00      1733
          Smoking       0.09      0.01      0.01      4502
        Structure       0.00      0.00      0.00       433

         accuracy                           0.35    174348
        macro avg       0.20      0.16      0.14    174348
     weighted avg       0.38      0.35      0.31    17

### Clasificador de k-vecinos próximos

In [9]:
from sklearn.neighbors import KNeighborsClassifier
# Train a k-nearest-neighbours classifier with k=3 (fit returns the estimator)
knc = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict on the held-out rows
y_pred_knc = knc.predict(X_test)

# Show the per-class metrics
informe_knc = classification_report(y_test, y_pred_knc)
print(informe_knc)

                   precision    recall  f1-score   support

            Arson       0.40      0.56      0.47     27621
         Campfire       0.27      0.36      0.31      8562
         Children       0.08      0.09      0.08      3706
   Debris Burning       0.51      0.56      0.53     42022
    Equipment Use       0.26      0.19      0.22      9861
        Fireworks       0.18      0.09      0.12       788
        Lightning       0.73      0.71      0.72     34400
    Miscellaneous       0.54      0.42      0.47     30883
Missing/Undefined       0.70      0.57      0.63      8175
        Powerline       0.17      0.04      0.06      1662
         Railroad       0.52      0.24      0.33      1733
          Smoking       0.15      0.04      0.06      4502
        Structure       0.04      0.00      0.00       433

         accuracy                           0.50    174348
        macro avg       0.35      0.30      0.31    174348
     weighted avg       0.50      0.50      0.49    17

### Análisis de Discriminante Lineal

In [10]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Train
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Predict
y_pred_lda = lda.predict(X_test)

# Report. Some classes are never predicted, which makes their precision
# undefined; zero_division=0 reports them as 0.0 without emitting the
# undefined-metric warning.
print(classification_report(y_test, y_pred_lda, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

            Arson       0.40      0.25      0.31     27621
         Campfire       0.08      0.00      0.00      8562
         Children       0.00      0.00      0.00      3706
   Debris Burning       0.44      0.71      0.54     42022
    Equipment Use       0.00      0.00      0.00      9861
        Fireworks       0.00      0.00      0.00       788
        Lightning       0.51      0.82      0.63     34400
    Miscellaneous       0.47      0.49      0.48     30883
Missing/Undefined       0.06      0.00      0.01      8175
        Powerline       0.00      0.00      0.00      1662
         Railroad       0.00      0.00      0.00      1733
          Smoking       0.00      0.00      0.00      4502
        Structure       0.00      0.00      0.00       433

         accuracy                           0.46    174348
        macro avg       0.15      0.18      0.15    174348
     weighted avg       0.36      0.46      0.39    17

### Análisis de Discriminante Cuadrático

In [11]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Train
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# Predict
y_pred_qda = qda.predict(X_test)

# Report. Some classes are never predicted, which makes their precision
# undefined; zero_division=0 reports them as 0.0 without emitting the
# undefined-metric warning.
print(classification_report(y_test, y_pred_qda, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

            Arson       0.33      0.08      0.13     27621
         Campfire       0.17      0.23      0.20      8562
         Children       0.02      0.14      0.04      3706
   Debris Burning       0.37      0.89      0.52     42022
    Equipment Use       0.15      0.05      0.08      9861
        Fireworks       0.02      0.07      0.03       788
        Lightning       0.66      0.32      0.43     34400
    Miscellaneous       0.47      0.07      0.12     30883
Missing/Undefined       0.20      0.05      0.08      8175
        Powerline       0.03      0.00      0.00      1662
         Railroad       0.01      0.00      0.00      1733
          Smoking       0.04      0.00      0.00      4502
        Structure       0.00      0.00      0.00       433

         accuracy                           0.32    174348
        macro avg       0.19      0.15      0.13    174348
     weighted avg       0.38      0.32      0.27    17

### Árbol de Decisión

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Train. The seed is fixed (same value as the train/test split) so that
# re-running the notebook produces the same tree and the same report.
dtc = DecisionTreeClassifier(random_state=0)
dtc = dtc.fit(X_train, y_train)

# Predict
y_pred_dtc = dtc.predict(X_test)

# Report
print(classification_report(y_test, y_pred_dtc))

                   precision    recall  f1-score   support

            Arson       0.46      0.47      0.46     27621
         Campfire       0.31      0.32      0.31      8562
         Children       0.08      0.09      0.08      3706
   Debris Burning       0.52      0.51      0.52     42022
    Equipment Use       0.22      0.24      0.23      9861
        Fireworks       0.15      0.17      0.16       788
        Lightning       0.73      0.72      0.72     34400
    Miscellaneous       0.45      0.44      0.45     30883
Missing/Undefined       0.76      0.75      0.75      8175
        Powerline       0.11      0.11      0.11      1662
         Railroad       0.29      0.32      0.31      1733
          Smoking       0.10      0.11      0.11      4502
        Structure       0.04      0.04      0.04       433

         accuracy                           0.49    174348
        macro avg       0.33      0.33      0.33    174348
     weighted avg       0.50      0.49      0.49    17

### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Train a 50-tree forest. The seed is fixed (same value as the train/test
# split) so that re-running the notebook produces the same forest and the
# same report.
rfc = RandomForestClassifier(n_estimators=50, random_state=0)
rfc = rfc.fit(X_train, y_train)

# Predict
y_pred_rfc = rfc.predict(X_test)

# Report
print(classification_report(y_test, y_pred_rfc))

                   precision    recall  f1-score   support

            Arson       0.56      0.55      0.56     27621
         Campfire       0.51      0.35      0.42      8562
         Children       0.22      0.03      0.06      3706
   Debris Burning       0.56      0.73      0.63     42022
    Equipment Use       0.39      0.22      0.28      9861
        Fireworks       0.51      0.13      0.20       788
        Lightning       0.76      0.86      0.80     34400
    Miscellaneous       0.55      0.57      0.56     30883
Missing/Undefined       0.81      0.76      0.79      8175
        Powerline       0.36      0.07      0.12      1662
         Railroad       0.81      0.26      0.39      1733
          Smoking       0.29      0.05      0.08      4502
        Structure       0.13      0.01      0.01       433

         accuracy                           0.60    174348
        macro avg       0.50      0.35      0.38    174348
     weighted avg       0.58      0.60      0.58    17