In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import matplotlib.pyplot as plt

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from pytorch_tabnet.tab_model import TabNetClassifier

# from pytorch_tabnet.augmentations import ClassificationSMOTE

plt.style.use("./auri.mplstyle")


In [2]:
clf = TabNetClassifier(device_name="cpu")
clf.load_model("output/model.zip")




In [3]:
df_orig = pd.read_parquet("output/data.parquet")
df_1 = df_orig[df_orig.fire == 1]
np.random.seed(1106)
df_0 = df_orig[df_orig.fire == 0].sample(n=len(df_1))
np.random.seed(1106)
df = pd.concat([df_1, df_0]).sample(frac=1).reset_index(drop=True)
df = df[
    [
        "A8",
        "A10",
        "A14",
        "A18",
        "A20",
        "gd_zone",
        "gg_zone",
        "jd_zone",
        "pr_zone",
        "si_zone",
        "crime_zone",
        "fire",
        "fire_1km_cnt",
        "fire_1km2_cnt",
        "fire_500m_cnt",
        "year",
    ]
]
np.random.seed(1106)
df["set"] = np.random.choice(
    ["train", "valid", "test"], p=[0.8, 0.1, 0.1], size=len(df)
)

train_idx = df[df.set == "train"].index
valid_idx = df[df.set == "valid"].index
test_idx = df[df.set == "test"].index

categorical_columns = ["A8", "A10", "A20", "fire"]
categorical_classes = {}
categorical_dims = {}

for col in categorical_columns:
    print(col, df[col].nunique())
    l_enc = LabelEncoder()
    l_enc.fit(df_orig[col].fillna("N_A").values)
    df[col] = df[col].fillna("N_A")
    df[col] = l_enc.transform(df[col].values)
    categorical_classes[col] = l_enc.classes_
    categorical_dims[col] = len(l_enc.classes_)

print(categorical_classes)

numerical_columns = [
    "A14",
    "A18",
    "fire_1km_cnt",
    "fire_1km2_cnt",
    "fire_500m_cnt",
    "crime_zone",
    "gd_zone",
    "gg_zone",
    "jd_zone",
    "pr_zone",
    "si_zone",
    "year",
]

df[numerical_columns] = df[numerical_columns].astype(float)
for col in numerical_columns:
    m_train = df.loc[train_idx, col].mean()
    m_valid = df.loc[valid_idx, col].mean()
    m_test = df.loc[test_idx, col].mean()
    df.loc[train_idx, col] = df.loc[train_idx, col].fillna(m_train)
    df.loc[valid_idx, col] = df.loc[valid_idx, col].fillna(m_valid)
    df.loc[test_idx, col] = df.loc[test_idx, col].fillna(m_test)

features = [col for col in df.columns if col not in ["set", "fire"]]
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [
    categorical_dims[f] for _, f in enumerate(features) if f in categorical_columns
]

X_train = df[features].values[train_idx]
X_valid = df[features].values[valid_idx]
X_test = df[features].values[test_idx]

y_train = df["fire"].values[train_idx]
y_valid = df["fire"].values[valid_idx]
y_test = df["fire"].values[test_idx]


A8 32
A10 21
A20 2
fire 2
{'A8': array(['01000', '02000', '03000', '04000', '05000', '06000', '07000',
       '08000', '09000', '10000', '11000', '12000', '13000', '14000',
       '15000', '16000', '17000', '18000', '19000', '20000', '21000',
       '22000', '23000', '24000', '25000', '26000', '27000', '29000',
       '31000', 'N_A', 'Z3000', 'Z5000', 'Z6000', 'Z8000', 'Z9000'],
      dtype=object), 'A10': array(['10', '11', '12', '13', '19', '21', '22', '26', '29', '30', '31',
       '32', '33', '37', '39', '40', '41', '42', '43', '49', '50', '51',
       '52', '53', '61', '63', '74', '99', 'N_A'], dtype=object), 'A20': array(['N', 'N_A', 'Y'], dtype=object), 'fire': array([0, 1])}


In [4]:
preds_test = clf.predict_proba(X_test)
test_auc = roc_auc_score(y_score=preds_test[:, 1], y_true=y_test)

preds_valid = clf.predict_proba(X_valid)
valid_auc = roc_auc_score(y_score=preds_valid[:, 1], y_true=y_valid)

print(f"BEST VALID SCORE : {valid_auc}")
print(f"FINAL TEST SCORE : {test_auc}")


BEST VALID SCORE : 0.6750642553390347
FINAL TEST SCORE : 0.6576550560261235


In [5]:
df_orig = pd.read_parquet("output/data.parquet")
df2 = df_orig.loc[
    :,
    [
        "X",
        "Y",
        "A8",
        "A10",
        "A14",
        "A18",
        "A20",
        "gd_zone",
        "gg_zone",
        "jd_zone",
        "pr_zone",
        "si_zone",
        "crime_zone",
        "fire",
        "fire_1km_cnt",
        "fire_1km2_cnt",
        "fire_500m_cnt",
        "year",
    ],
]

categorical_columns = ["A8", "A10", "A20", "fire"]
categorical_classes = {}
categorical_dims = {}

for col in categorical_columns:
    print(col, df2[col].nunique())
    l_enc = LabelEncoder()
    l_enc.fit(df_orig[col].fillna("N_A").values)
    df2[col] = df2[col].fillna("N_A")
    df2[col] = l_enc.transform(df2[col].values)
    categorical_classes[col] = l_enc.classes_
    categorical_dims[col] = len(l_enc.classes_)

print(categorical_classes)

numerical_columns = [
    "A14",
    "A18",
    "fire_1km_cnt",
    "fire_1km2_cnt",
    "fire_500m_cnt",
    "crime_zone",
    "gd_zone",
    "gg_zone",
    "jd_zone",
    "pr_zone",
    "si_zone",
    "year",
]

df2[numerical_columns] = df2[numerical_columns].astype(float)
for col in numerical_columns:
    m = df2.loc[:, col].mean()
    df2.loc[:, col] = df2.loc[:, col].fillna(m)

features = [col for col in df2.columns if col not in ["X", "Y", "set", "fire"]]
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [
    categorical_dims[f] for _, f in enumerate(features) if f in categorical_columns
]

X_all = df2[features].values
y_all = df2["fire"].values


A8 34
A10 28
A20 2
fire 2
{'A8': array(['01000', '02000', '03000', '04000', '05000', '06000', '07000',
       '08000', '09000', '10000', '11000', '12000', '13000', '14000',
       '15000', '16000', '17000', '18000', '19000', '20000', '21000',
       '22000', '23000', '24000', '25000', '26000', '27000', '29000',
       '31000', 'N_A', 'Z3000', 'Z5000', 'Z6000', 'Z8000', 'Z9000'],
      dtype=object), 'A10': array(['10', '11', '12', '13', '19', '21', '22', '26', '29', '30', '31',
       '32', '33', '37', '39', '40', '41', '42', '43', '49', '50', '51',
       '52', '53', '61', '63', '74', '99', 'N_A'], dtype=object), 'A20': array(['N', 'N_A', 'Y'], dtype=object), 'fire': array([0, 1])}


In [6]:
df[df.fire == 1]


Unnamed: 0,A8,A10,A14,A18,A20,gd_zone,gg_zone,jd_zone,pr_zone,si_zone,crime_zone,fire,fire_1km_cnt,fire_1km2_cnt,fire_500m_cnt,year,set
4,2,5,283.06,0.00000,0,0.000000,0.000088,0.000458,0.000871,0.00000,0.001417,1,115.0,40.0,36.0,1991.0,train
6,3,5,393.36,152.21000,0,0.000000,0.000056,0.000846,0.000506,0.00000,0.001408,1,208.0,105.0,83.0,2000.0,train
8,2,5,1083.03,205.78000,0,0.000000,0.000059,0.000755,0.000808,0.00000,0.001623,1,139.0,59.0,46.0,1984.0,valid
10,13,5,3522.00,299.00000,0,0.000001,0.001114,0.003730,0.003949,0.00000,0.008794,1,85.0,34.0,30.0,1991.0,train
14,1,5,3177.46,0.00000,2,0.000000,0.000162,0.000626,0.000162,0.00000,0.000950,1,114.0,38.0,34.0,1971.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40948,0,1,148.56,0.00000,0,0.000000,0.000081,0.000730,0.000645,0.00000,0.001457,1,165.0,51.0,42.0,1990.0,train
40950,2,5,1049.00,0.00000,0,0.000018,0.000716,0.006409,0.007184,0.00000,0.014327,1,273.0,79.0,67.0,1989.0,train
40952,0,1,196.43,99.40000,0,0.000000,0.000000,0.000851,0.001133,0.00000,0.001984,1,187.0,54.0,46.0,1990.0,train
40956,3,5,426.90,182.89733,0,0.000000,0.000138,0.002093,0.001374,0.00000,0.003605,1,186.0,98.0,86.0,1995.0,train


In [7]:
X_all[y_all == 1][0]


array([2.100000e+01, 5.000000e+00, 1.171852e+04, 7.100000e-01,
       0.000000e+00, 0.000000e+00, 4.839540e-06, 8.271350e-05,
       1.022320e-04, 0.000000e+00, 1.897850e-04, 4.000000e+01,
       1.900000e+01, 1.600000e+01, 2.001000e+03])

In [8]:
X_all[y_all == 1][-1]


array([1.00000e+00, 5.00000e+00, 1.15968e+03, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 1.36776e-05, 8.21051e-05, 2.45499e-04, 0.00000e+00,
       3.41282e-04, 1.60000e+01, 5.00000e+00, 4.00000e+00, 1.98200e+03])

In [9]:
preds = clf.predict(X_all)


In [10]:
preds


array([1, 0, 1, ..., 0, 0, 0])

In [11]:
df_out = df_orig.copy()
df_out["pred"] = preds


In [12]:
df_out


Unnamed: 0,X,Y,A8,A10,A13,A14,A18,A19,A20,NEAR_DIST,...,jd_zone,pr_zone,si_zone,crime_zone,fire,fire_1km_cnt,fire_1km2_cnt,fire_500m_cnt,year,pred
37,959850.802938,1.965981e+06,19000,31,2013-11-11,478.256,0.00,100196744,N,175.894733,...,0.000083,0.000102,0.000000e+00,0.000190,0,36,14,12,2013,1
39,959820.766646,1.965963e+06,19000,31,2013-11-11,135.880,0.00,100196745,N,147.860441,...,0.000083,0.000102,0.000000e+00,0.000190,0,36,15,12,2013,0
42,958669.866323,1.965951e+06,01000,51,1953-03-10,149.420,0.00,16993,N,818.529556,...,0.000001,0.000002,2.000000e-11,0.000004,0,6,0,0,1953,1
48,959861.057922,1.965921e+06,03000,21,2016-10-21,231.520,72.58,100207771,N,131.033976,...,0.000083,0.000102,0.000000e+00,0.000190,0,38,16,14,2016,0
57,959931.949212,1.965876e+06,22000,32,1995-05-15,434.060,0.00,14253,N,166.217673,...,0.000257,0.000225,0.000000e+00,0.000562,0,43,17,16,1995,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669095,961974.807334,1.937380e+06,01000,11,1979-11-10,119.650,30.49,18966,N,739.689106,...,0.000011,0.000013,0.000000e+00,0.000025,0,2,0,0,1979,0
669096,961915.967116,1.937386e+06,01000,21,2011-06-13,280.780,76.72,100205072,N,683.353997,...,0.000011,0.000013,0.000000e+00,0.000025,0,2,0,0,2011,0
669100,961971.627423,1.937371e+06,01000,32,2005-01-01,34.750,9.63,18967,N,740.339123,...,0.000011,0.000013,0.000000e+00,0.000025,0,2,0,0,2005,0
669101,961887.615993,1.937368e+06,01000,11,1982-04-21,147.300,0.00,18975,N,665.168152,...,0.000011,0.000013,0.000000e+00,0.000025,0,2,0,0,1982,0


In [13]:
df_out[["fire", "pred"]].value_counts()


fire  pred
0     0       328791
      1       190840
1     1        12285
      0         8195
dtype: int64

In [25]:
df_plot = pd.DataFrame()
df_plot["count"] = df_out.groupby("pred").fire.count()
df_plot["share"] = df_plot["count"] / len(df_out)
df_plot["risk"] = df_out.groupby("pred").fire.mean()
df_plot["rr"] = df_plot["risk"] / df_out.fire.mean()


In [26]:
df_plot


Unnamed: 0_level_0,count,share,risk,rr
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,336986,0.62392,0.024319,0.641343
1,203125,0.37608,0.06048,1.595015


TabNet이 화재 발생을 예측한 고위험군은 모집단 대비 1.6배의 상대위험도를 보이는 37.6%의 건축물임.

In [28]:
df_out.to_csv('output/predict_full.csv')
df_out[["X", "Y", "fire", "pred"]].to_csv('output/predict_lean.csv')
