In [69]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

fires = pd.read_csv("fires.csv")
supplies = pd.read_csv("supplies.csv")
temperature = pd.read_csv("temperature.csv")

weather_files = [
    "weather_data_2015.csv",
    "weather_data_2016.csv",
    "weather_data_2017.csv",
    "weather_data_2018.csv",
    "weather_data_2019.csv",
    "weather_data_2020.csv",
    "weather_data_2021.csv",
]
weather = pd.concat([pd.read_csv(f) for f in weather_files], ignore_index=True)

In [70]:
pip install catboost



In [71]:
fires["Дата начала"] = pd.to_datetime(fires["Дата начала"])
fires["Дата оконч."] = pd.to_datetime(fires["Дата оконч."])
fires["Нач.форм.штабеля"] = pd.to_datetime(fires["Нач.форм.штабеля"])

fires["form_start"] = fires["Нач.форм.штабеля"].dt.normalize()

temperature["Дата акта"] = pd.to_datetime(temperature["Дата акта"])
temperature["date"] = temperature["Дата акта"].dt.normalize()

supplies["ВыгрузкаНаСклад"] = pd.to_datetime(supplies["ВыгрузкаНаСклад"])
supplies["ПогрузкаНаСудно"] = pd.to_datetime(supplies["ПогрузкаНаСудно"])
supplies["date_in"] = supplies["ВыгрузкаНаСклад"].dt.normalize()
supplies["date_out"] = supplies["ПогрузкаНаСудно"].dt.normalize()

weather["date"] = pd.to_datetime(weather["date"])
weather["date"] = weather["date"].dt.normalize()

In [72]:
weather["date"].max()

Timestamp('2021-12-31 00:00:00')

In [73]:
j = fires.head(20)

In [74]:
j.to_csv("fires_head_20.csv", index = False)

In [75]:
MERGE_GAP = pd.Timedelta(hours=12)

fires = fires.sort_values(["Склад", "Штабель", "Дата начала"]).reset_index(drop=True)

fires["prev_end"] = fires.groupby(["Склад", "Штабель"])["Дата оконч."].shift(1)
fires["gap"] = fires["Дата начала"] - fires["prev_end"]
fires["new_fire"] = fires["gap"].isna() | (fires["gap"] > MERGE_GAP)
fires["fire_id"] = fires.groupby(["Склад", "Штабель"])["new_fire"].cumsum()

fires = (fires
         .groupby(["Склад", "Штабель", "fire_id"], as_index=False)
         .agg({
             "Дата начала": "min",
             "Дата оконч.": "max",
             "Вес по акту, тн": "sum",
             "Груз": "first",
             "Дата составления": "first",
             "Нач.форм.штабеля": "first",
             "form_start": "first"
         }))


In [76]:
supplies = supplies.sort_values(["Склад", "Штабель", "date_in"]).reset_index(drop=True)
supplies["pile_id"] = supplies.groupby(["Склад", "Штабель"]).cumcount() + 1
def expand_one_pile(row):
    days = pd.date_range(row["date_in"], row["date_out"], freq="D")
    return pd.DataFrame({
        "date": days,
        "Склад": row["Склад"],
        "Штабель": row["Штабель"],
        "coal_type" : row["Наим. ЕТСНГ"],
        "pile_id": row["pile_id"],
        "date_in": row["date_in"],
        "date_out": row["date_out"],
        "mass" : row["На склад, тн"]
    })
base = pd.concat([expand_one_pile(r) for _, r in supplies.iterrows()],
                 ignore_index=True)

In [77]:
supplies.shape

(6323, 10)

In [78]:
supplies.head(3)

Unnamed: 0,ВыгрузкаНаСклад,Наим. ЕТСНГ,Штабель,ПогрузкаНаСудно,"На склад, тн","На судно, тн",Склад,date_in,date_out,pile_id
0,2019-01-02,E5,1,2019-02-08,11984.1925,11984.1925,3,2019-01-02,2019-02-08,1
1,2019-01-06,E5,1,2019-02-08,11427.706,11427.706,3,2019-01-06,2019-02-08,2
2,2019-01-07,E5,1,2019-02-08,11984.1925,11984.1925,3,2019-01-07,2019-02-08,3


In [79]:
base.shape

(144044, 8)

In [80]:
base.head(5)

Unnamed: 0,date,Склад,Штабель,coal_type,pile_id,date_in,date_out,mass
0,2019-01-02,3,1,E5,1,2019-01-02,2019-02-08,11984.1925
1,2019-01-03,3,1,E5,1,2019-01-02,2019-02-08,11984.1925
2,2019-01-04,3,1,E5,1,2019-01-02,2019-02-08,11984.1925
3,2019-01-05,3,1,E5,1,2019-01-02,2019-02-08,11984.1925
4,2019-01-06,3,1,E5,1,2019-01-02,2019-02-08,11984.1925


In [81]:
temperature = temperature.drop(columns=["Пикет"], errors="ignore")
temp_daily = (temperature
              .groupby(["Склад", "Штабель", "date"], as_index=False)
              .agg({
                  "Максимальная температура": "max"
              })
             )
base = base.merge(temp_daily,
                  on=["Склад", "Штабель", "date"],
                  how="left")

weather = weather.drop(columns=["visibility", "p", "wind_dir"], errors="ignore")

weather_daily = (weather
                 .groupby("date", as_index=False)
                 .mean(numeric_only=True)
                )
base = base.merge(weather_daily, on="date", how="left")



In [82]:
len(base)

144044

In [83]:
len(base) - base['Максимальная температура'].isna().sum()

np.int64(34241)

In [84]:
base = base.dropna(subset=["Максимальная температура"]).reset_index(drop=True)

In [85]:
len(base)

34241

In [86]:
base = base.reset_index(drop=True)
base["row_id"] = base.index

fires_start = fires[["Склад", "Штабель", "Дата начала"]].dropna().copy()
fires_start["Дата начала"] = pd.to_datetime(fires_start["Дата начала"]).dt.normalize()

base["date"] = pd.to_datetime(base["date"]).dt.normalize()
base["date_in"] = pd.to_datetime(base["date_in"]).dt.normalize()
base["date_out"] = pd.to_datetime(base["date_out"]).dt.normalize()

tmp = base.merge(fires_start, on=["Склад", "Штабель"], how="left")

tmp["fire_before"] = (
    (tmp["Дата начала"] >= tmp["date_in"]) &
    (tmp["Дата начала"] <= tmp["date_out"]) &
    (tmp["Дата начала"] < tmp["date"])
)

had_before = (tmp.groupby("row_id")["fire_before"]
                .any()
                .astype(int))

base["had_fire_before"] = base["row_id"].map(had_before).fillna(0).astype(int)
base = base.drop(columns=["row_id"])

In [87]:
fires_start = (fires[["Склад", "Штабель", "Дата начала"]]
               .dropna()
               .sort_values(["Склад", "Штабель", "Дата начала"])
              )

fires_start["Дата начала"] = pd.to_datetime(fires_start["Дата начала"]).dt.normalize()
base["date"] = pd.to_datetime(base["date"]).dt.normalize()
base["date_out"] = pd.to_datetime(base["date_out"]).dt.normalize()

def add_target_for_group(g):
    fdates = fires_start.loc[
        (fires_start["Склад"] == g.name[0]) &
        (fires_start["Штабель"] == g.name[1]),
        "Дата начала"
    ].values

    if len(fdates) == 0:
        g["nearest_fire_date"] = pd.NaT
        g["days_to_fire"] = np.nan
        return g

    idx = np.searchsorted(fdates, g["date"].values, side="left")
    nearest = np.take(fdates, np.minimum(idx, len(fdates)-1))
    nearest[idx == len(fdates)] = np.datetime64("NaT")
    g["nearest_fire_date"] = pd.to_datetime(nearest)
    g["days_to_fire"] = (g["nearest_fire_date"] - g["date"]).dt.days

    return g

base = (base
        .sort_values(["Склад", "Штабель", "date"])
        .groupby(["Склад", "Штабель"], group_keys=False)
        .apply(add_target_for_group)
       )
base_all = base.copy()
base = base[
    base["nearest_fire_date"].notna() &
    (base["nearest_fire_date"] <= base["date_out"])
].reset_index(drop=True)

base.head()


  .apply(add_target_for_group)


Unnamed: 0,date,Склад,Штабель,coal_type,pile_id,date_in,date_out,mass,Максимальная температура,t,humidity,precipitation,v_avg,v_max,cloudcover,weather_code,had_fire_before,nearest_fire_date,days_to_fire
0,2020-08-14,3,33,A1,6,2020-08-14,2020-09-18,186.116,38.1,22.1375,55.291667,0.0,21.508333,31.075,24.333333,0.583333,0,2020-09-09,26.0
1,2020-08-15,3,33,A1,6,2020-08-14,2020-09-18,186.116,40.0,21.5625,53.666667,0.0,21.5125,31.25,14.916667,0.458333,0,2020-09-09,25.0
2,2020-08-15,3,33,A1,10,2020-08-15,2020-09-18,6097.305,40.0,21.5625,53.666667,0.0,21.5125,31.25,14.916667,0.458333,0,2020-09-09,25.0
3,2020-08-16,3,33,A1,6,2020-08-14,2020-09-18,186.116,37.3,21.7,51.875,0.0,22.225,32.258333,7.791667,0.166667,0,2020-09-09,24.0
4,2020-08-16,3,33,A1,10,2020-08-15,2020-09-18,6097.305,37.3,21.7,51.875,0.0,22.225,32.258333,7.791667,0.166667,0,2020-09-09,24.0


In [88]:
len(base_all)

34241

In [89]:
base.shape

(8316, 19)

In [90]:
len(base)

8316

In [91]:
base.head(20).to_csv("base1_head_20.csv")

In [92]:
base.duplicated(["Склад","Штабель","pile_id","date"]).sum()

np.int64(0)

In [93]:
base = base.sort_values(["Склад","Штабель","pile_id","date"]).reset_index(drop=True)

grp = base.groupby(["Склад","Штабель","pile_id"])

base["temp_mean_3d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(3, min_periods=1).mean()
)
base["temp_max_3d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(3, min_periods=1).max()
)

base["temp_mean_7d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(7, min_periods=1).mean()
)
base["temp_max_7d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(7, min_periods=1).max()
)

In [94]:
base.head(5)

Unnamed: 0,date,Склад,Штабель,coal_type,pile_id,date_in,date_out,mass,Максимальная температура,t,...,v_max,cloudcover,weather_code,had_fire_before,nearest_fire_date,days_to_fire,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d
0,2020-08-14,3,33,A1,6,2020-08-14,2020-09-18,186.116,38.1,22.1375,...,31.075,24.333333,0.583333,0,2020-09-09,26.0,38.1,38.1,38.1,38.1
1,2020-08-15,3,33,A1,6,2020-08-14,2020-09-18,186.116,40.0,21.5625,...,31.25,14.916667,0.458333,0,2020-09-09,25.0,39.05,40.0,39.05,40.0
2,2020-08-16,3,33,A1,6,2020-08-14,2020-09-18,186.116,37.3,21.7,...,32.258333,7.791667,0.166667,0,2020-09-09,24.0,38.466667,40.0,38.466667,40.0
3,2020-08-17,3,33,A1,6,2020-08-14,2020-09-18,186.116,36.7,23.2,...,24.354167,0.125,0.0,0,2020-09-09,23.0,38.0,40.0,38.025,40.0
4,2020-08-18,3,33,A1,6,2020-08-14,2020-09-18,186.116,36.8,24.154167,...,15.829167,5.0,0.083333,0,2020-09-09,22.0,36.933333,37.3,37.78,40.0


# Обучение и тестирование моделей

In [95]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

base["pile_len"] = base.groupby(["Склад","Штабель","pile_id"])["date"].transform("count")
base["w"] = 1 / (1 + base["days_to_fire"])
base["y_log"] = np.log1p(base["days_to_fire"])

base_sorted = base.sort_values("date").reset_index(drop=True)


#base_sorted = base_sorted[base_sorted["days_to_fire"] <= CAP]

q1 = base_sorted["date"].quantile(0.6)
q2 = base_sorted["date"].quantile(0.8)
q3 = base_sorted["date"].quantile(0.9)
q4 = base_sorted["date"].max()
train_main = base_sorted[(base_sorted["date"] >= q1) & (base_sorted["date"] < q2)]
valid_df   = base_sorted[(base_sorted["date"] >= q2) & (base_sorted["date"] < q3)]
test_df    = base_sorted[(base_sorted["date"] >= q3) & (base_sorted["date"] <= q4)]

for df in [train_main, valid_df, test_df]:
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month

# таргеты
# days_to_fire
y_train = train_main["days_to_fire"]
y_valid = valid_df["days_to_fire"]
y_test  = test_df["days_to_fire"]

drop_cols = ["days_to_fire","nearest_fire_date","date_in","date_out","pile_id","date", "w", "pile_len", "days_to_fire_cap", "y_log" ]
X_train = train_main.drop(columns=drop_cols, errors="ignore")
X_valid = valid_df.drop(columns=drop_cols, errors="ignore")
X_test  = test_df.drop(columns=drop_cols, errors="ignore")

cat_features = [c for c in ["Склад","Штабель","coal_type"] if c in X_train.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["date"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 

In [96]:
print(len(X_train), len(X_valid), len(X_test))

1692 776 909


In [97]:
X_train.head(5)

Unnamed: 0,Склад,Штабель,coal_type,mass,Максимальная температура,t,humidity,precipitation,v_avg,v_max,cloudcover,weather_code,had_fire_before,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d,year,month
4939,4,39,A1,372.1045,286.0,23.166667,56.083333,0.004167,16.104167,24.0375,17.25,2.458333,1,144.3,286.0,164.014286,286.0,2020,8
4940,4,39,A1,186.388,286.0,23.166667,56.083333,0.004167,16.104167,24.0375,17.25,2.458333,1,144.3,286.0,164.014286,286.0,2020,8
4941,6,1,A1,2781.0895,55.3,23.166667,56.083333,0.004167,16.104167,24.0375,17.25,2.458333,0,112.766667,214.0,85.58,214.0,2020,8
4942,4,39,A1,741.5315,286.0,23.166667,56.083333,0.004167,16.104167,24.0375,17.25,2.458333,1,144.3,286.0,164.014286,286.0,2020,8
4943,4,12,A1,13205.311,37.6,23.166667,56.083333,0.004167,16.104167,24.0375,17.25,2.458333,0,38.566667,42.0,37.385714,42.0,2020,8


In [98]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=500,
    depth=5,
    learning_rate=0.03,
    loss_function="MAE",
    verbose=200
)

model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid),
    use_best_model=True
)

pred_days = model.predict(X_test)

mae_test = mean_absolute_error(test_df["days_to_fire"], pred_days)
baseline_test = mean_absolute_error(
    test_df["days_to_fire"],
    np.full_like(test_df["days_to_fire"], np.median(train_main["days_to_fire"]), dtype=float)
)

print("TEST MAE:", mae_test)

0:	learn: 3.8527176	test: 2.6405143	best: 2.6405143 (0)	total: 21.4ms	remaining: 10.7s
200:	learn: 0.6032799	test: 5.2195809	best: 2.5571295 (3)	total: 2.88s	remaining: 4.29s
400:	learn: 0.3993007	test: 5.3042525	best: 2.5571295 (3)	total: 5.91s	remaining: 1.46s
499:	learn: 0.3592356	test: 5.3343422	best: 2.5571295 (3)	total: 6.89s	remaining: 0us

bestTest = 2.557129524
bestIteration = 3

Shrink model to first 4 iterations.
TEST MAE: 1.5233830043671008


In [99]:
base_sorted = base.sort_values("date")
split_date = base_sorted["date"].quantile(0.8)

train_df = base_sorted[base_sorted["date"] <= split_date]
test_df  = base_sorted[base_sorted["date"] > split_date]

train_df["days_to_fire"].describe(), test_df["days_to_fire"].describe()

(count    6696.000000
 mean        6.497461
 std         7.400825
 min         0.000000
 25%         1.000000
 50%         4.000000
 75%        10.000000
 max        37.000000
 Name: days_to_fire, dtype: float64,
 count    1620.000000
 mean        2.717284
 std         2.770010
 min         0.000000
 25%         1.000000
 50%         2.000000
 75%         4.000000
 max        17.000000
 Name: days_to_fire, dtype: float64)

In [100]:
H = 7

base_cls = base_all.copy()

base_cls["fire_within_7d"] = (
    base_cls["days_to_fire"].notna() &
    (base_cls["days_to_fire"] >= 0) &
    (base_cls["days_to_fire"] <= H) &
    (base_cls["nearest_fire_date"] <= base_cls["date_out"])  # пожар внутри жизни кучи
).astype(int)

base_cls["fire_within_7d"].value_counts()

Unnamed: 0_level_0,count
fire_within_7d,Unnamed: 1_level_1
0,28146
1,6095


In [101]:
len(base_all)

34241

In [102]:
grp = base_cls.groupby(["Склад","Штабель","pile_id"])

#скользящие средние/максимумы температуры
base_cls["temp_mean_3d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(3, min_periods=1).mean()
)
base_cls["temp_max_3d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(3, min_periods=1).max()
)
base_cls["temp_mean_7d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(7, min_periods=1).mean()
)
base_cls["temp_max_7d"] = grp["Максимальная температура"].transform(
    lambda s: s.rolling(7, min_periods=1).max()
)

In [103]:
base_cls['fire_within_7d'].value_counts()

Unnamed: 0_level_0,count
fire_within_7d,Unnamed: 1_level_1
0,28146
1,6095


In [104]:
base.head(3)

Unnamed: 0,date,Склад,Штабель,coal_type,pile_id,date_in,date_out,mass,Максимальная температура,t,...,had_fire_before,nearest_fire_date,days_to_fire,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d,pile_len,w,y_log
0,2020-08-14,3,33,A1,6,2020-08-14,2020-09-18,186.116,38.1,22.1375,...,0,2020-09-09,26.0,38.1,38.1,38.1,38.1,36,0.037037,3.295837
1,2020-08-15,3,33,A1,6,2020-08-14,2020-09-18,186.116,40.0,21.5625,...,0,2020-09-09,25.0,39.05,40.0,39.05,40.0,36,0.038462,3.258097
2,2020-08-16,3,33,A1,6,2020-08-14,2020-09-18,186.116,37.3,21.7,...,0,2020-09-09,24.0,38.466667,40.0,38.466667,40.0,36,0.04,3.218876


In [105]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import numpy as np

base_sorted = base_cls.sort_values("date").reset_index(drop=True)
q60 = base_sorted["date"].quantile(0.6)
q80 = base_sorted["date"].quantile(0.8)
q90 = base_sorted["date"].quantile(0.9)

train_df = base_sorted[(base_sorted["date"] > q60) & (base_sorted["date"] <= q80)]
valid_df = base_sorted[(base_sorted["date"] > q80) & (base_sorted["date"] <= q90)]
test_df  = base_sorted[base_sorted["date"] > q90]

for df in [train_df, valid_df, test_df]:
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["dayofyear"] = df["date"].dt.dayofyear

y_train = train_df["fire_within_7d"]
y_valid = valid_df["fire_within_7d"]
y_test  = test_df["fire_within_7d"]

drop_cols = [
    "fire_within_7d",
    "days_to_fire",
    "nearest_fire_date",
    "date_in","date_out","pile_id","date",
    "w","pile_len","days_to_fire_cap","y_log"
]
X_train = train_df.drop(columns=drop_cols, errors="ignore")
X_valid = valid_df.drop(columns=drop_cols, errors="ignore")
X_test  = test_df.drop(columns=drop_cols, errors="ignore")

cat_features = [c for c in ["Склад","Штабель","coal_type"] if c in X_train.columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"] = df["date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["month"] = df["date"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["dayofyear"] = df["date"].dt.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

In [106]:
train_df.columns

Index(['date', 'Склад', 'Штабель', 'coal_type', 'pile_id', 'date_in',
       'date_out', 'mass', 'Максимальная температура', 't', 'humidity',
       'precipitation', 'v_avg', 'v_max', 'cloudcover', 'weather_code',
       'had_fire_before', 'nearest_fire_date', 'days_to_fire',
       'fire_within_7d', 'temp_mean_3d', 'temp_max_3d', 'temp_mean_7d',
       'temp_max_7d', 'year', 'month', 'dayofyear'],
      dtype='object')

In [107]:
X_train.shape

(6758, 20)

In [108]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import numpy as np

n0 = (y_train == 0).sum()
n1 = (y_train == 1).sum()
w0 = 1.0
w1 = n0 / n1  # чтобы единицы весили больше

print(w0, w1)

modelCl = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.03,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
    class_weights=[w0, w1]
)


modelCl.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid),
    use_best_model=True
)

proba_test = modelCl.predict_proba(X_test)[:, 1]
pred_test = (proba_test > 0.5).astype(int)

print("AUC test:", roc_auc_score(y_test, proba_test))
print("F1  test:", f1_score(y_test, pred_test))
print("Acc test:", accuracy_score(y_test, pred_test))
print("best it:", model.get_best_iteration())


1.0 1.9653356735410268
0:	test: 0.6767190	best: 0.6767190 (0)	total: 101ms	remaining: 50.6s
200:	test: 0.7260527	best: 0.7550208 (17)	total: 7.49s	remaining: 11.1s
400:	test: 0.7238416	best: 0.7550208 (17)	total: 13.6s	remaining: 3.35s
499:	test: 0.7188130	best: 0.7550208 (17)	total: 17.5s	remaining: 0us

bestTest = 0.7550207653
bestIteration = 17

Shrink model to first 18 iterations.
AUC test: 0.8254610572623297
F1  test: 0.39503252513305737
Acc test: 0.7006145741878841
best it: 3


In [109]:
CURRENT_DATE = pd.to_datetime("2020-08-18").normalize()

In [110]:
base['date'] = pd.to_datetime(base['date'])
base['date_in'] = pd.to_datetime(base['date_in'])
base['days_in'] = (base['date'] - base['date_in']).dt.days
base_cls['date'] = pd.to_datetime(base_cls['date'])
base_cls['date_in'] = pd.to_datetime(base_cls['date_in'])
base_cls['days_in'] = (base_cls['date'] - base_cls['date_in']).dt.days

In [111]:
base_app = base_cls.copy()

In [112]:
len(base_app)

34241

In [113]:
active_df = base_cls[base_app["date"] == CURRENT_DATE]
active_df = active_df.drop(columns=["nearest_fire_date", "days_to_fire", "date_out", "fire_within_7d"], errors="ignore")
active_df.head(2)

Unnamed: 0,date,Склад,Штабель,coal_type,pile_id,date_in,mass,Максимальная температура,t,humidity,...,v_avg,v_max,cloudcover,weather_code,had_fire_before,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d,days_in
3946,2020-08-18,3,26,A1,19,2020-08-06,4040.1265,37.3,24.154167,69.916667,...,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,12
3963,2020-08-18,3,26,A1,20,2020-08-06,185.5805,37.3,24.154167,69.916667,...,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,12


In [114]:
inactive_df_reg = base[base["date_out"] < CURRENT_DATE]
len(inactive_df_reg)

5216

In [115]:
inactive_df_cl = base_app[base_app["date_out"] < CURRENT_DATE]
len(inactive_df_cl)

26197

In [116]:
inactive_df_reg.head(2)

Unnamed: 0,date,Склад,Штабель,coal_type,pile_id,date_in,date_out,mass,Максимальная температура,t,...,nearest_fire_date,days_to_fire,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d,pile_len,w,y_log,days_in
1411,2020-06-29,3,38,A1,13,2020-06-27,2020-07-30,11176.4375,36.3,23.383333,...,2020-06-29,0.0,36.3,36.3,36.3,36.3,1,1.0,0.0,2
1412,2020-06-29,3,38,A1,14,2020-06-28,2020-07-30,36152.8545,36.3,23.383333,...,2020-06-29,0.0,36.3,36.3,36.3,36.3,1,1.0,0.0,1


In [117]:
inactive_df_reg.columns

Index(['date', 'Склад', 'Штабель', 'coal_type', 'pile_id', 'date_in',
       'date_out', 'mass', 'Максимальная температура', 't', 'humidity',
       'precipitation', 'v_avg', 'v_max', 'cloudcover', 'weather_code',
       'had_fire_before', 'nearest_fire_date', 'days_to_fire', 'temp_mean_3d',
       'temp_max_3d', 'temp_mean_7d', 'temp_max_7d', 'pile_len', 'w', 'y_log',
       'days_in'],
      dtype='object')

In [118]:
active_df.columns

Index(['date', 'Склад', 'Штабель', 'coal_type', 'pile_id', 'date_in', 'mass',
       'Максимальная температура', 't', 'humidity', 'precipitation', 'v_avg',
       'v_max', 'cloudcover', 'weather_code', 'had_fire_before',
       'temp_mean_3d', 'temp_max_3d', 'temp_mean_7d', 'temp_max_7d',
       'days_in'],
      dtype='object')

In [119]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

inactive_df_reg_sorted = inactive_df_reg.sort_values("date").reset_index(drop=True)

q50 = inactive_df_reg_sorted["date"].quantile(0.2)
q80 = inactive_df_reg_sorted["date"].quantile(0.8)

train_main = inactive_df_reg_sorted[(inactive_df_reg_sorted['date'] > q50) & (inactive_df_reg_sorted['date'] <= q80)]
val_main = inactive_df_reg_sorted[(inactive_df_reg_sorted['date'] > q80)]



# таргеты
# days_to_fire
y_train = train_main["days_to_fire"]
y_val = val_main['days_to_fire']

drop_cols = ["days_to_fire","nearest_fire_date","date_in","date_out","pile_id","date", "w", "pile_len", "days_to_fire_cap", "y_log", "fire_within_7d" ]
X_train = train_main.drop(columns=drop_cols, errors="ignore")
X_val = val_main.drop(columns=drop_cols, errors="ignore")
active_df = active_df.drop(columns=drop_cols, errors="ignore")

cat_features = [c for c in ["Склад","Штабель","coal_type"] if c in X_train.columns]

In [120]:
X_train.columns

Index(['Склад', 'Штабель', 'coal_type', 'mass', 'Максимальная температура',
       't', 'humidity', 'precipitation', 'v_avg', 'v_max', 'cloudcover',
       'weather_code', 'had_fire_before', 'temp_mean_3d', 'temp_max_3d',
       'temp_mean_7d', 'temp_max_7d', 'days_in'],
      dtype='object')

In [121]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=500,
    depth=5,
    learning_rate=0.03,
    loss_function="MAE",
    verbose=200
)

model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set = (X_val, y_val),
    use_best_model=True
)

pred_days = np.round(model.predict(active_df))


0:	learn: 3.5104312	test: 1.8999282	best: 1.8999282 (0)	total: 8.25ms	remaining: 4.12s
200:	learn: 0.9179531	test: 3.4129665	best: 1.7863150 (10)	total: 974ms	remaining: 1.45s
400:	learn: 0.5932597	test: 3.5623821	best: 1.7863150 (10)	total: 1.99s	remaining: 492ms
499:	learn: 0.5158180	test: 3.5559175	best: 1.7863150 (10)	total: 2.46s	remaining: 0us

bestTest = 1.786314994
bestIteration = 10

Shrink model to first 11 iterations.


In [122]:
active_df["days_to_fire"] = pred_days

In [123]:
active_df

Unnamed: 0,Склад,Штабель,coal_type,mass,Максимальная температура,t,humidity,precipitation,v_avg,v_max,cloudcover,weather_code,had_fire_before,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d,days_in,days_to_fire
3946,3,26,A1,4040.1265,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,12,4.0
3963,3,26,A1,185.5805,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,12,4.0
3982,3,26,A1,3028.0910,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,11,4.0
4006,3,26,A1,200.2685,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,11,4.0
4028,3,26,A1,2526.3445,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,8,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33527,6,10,A1,2890.3060,369.0,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,146.200000,369.0,83.800000,369.0,6,4.0
33541,6,10,C3,1693.7525,369.0,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,146.200000,369.0,83.800000,369.0,6,4.0
33555,6,10,B2,1203.6255,369.0,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,146.200000,369.0,83.800000,369.0,6,4.0
33803,6,45,C3,11708.6225,41.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,42.433333,48.1,40.600000,48.1,12,3.0


In [124]:
fi = model.get_feature_importance(type="PredictionValuesChange")
feat_names = X_train.columns

fi_df = pd.DataFrame({"feature": feat_names, "importance": fi})
fi_df = fi_df.sort_values("importance", ascending=False)

print(fi_df.head(20))

                     feature  importance
1                    Штабель   27.623925
14               temp_max_3d   25.091896
5                          t   17.860666
6                   humidity    8.611827
8                      v_avg    4.431169
10                cloudcover    3.985246
13              temp_mean_3d    3.143379
9                      v_max    2.895389
11              weather_code    2.453432
4   Максимальная температура    1.454396
12           had_fire_before    1.272783
15              temp_mean_7d    0.644534
16               temp_max_7d    0.531357
0                      Склад    0.000000
7              precipitation    0.000000
2                  coal_type    0.000000
3                       mass    0.000000
17                   days_in    0.000000


In [125]:
active_df.shape

(159, 19)

In [126]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import numpy as np

inactive_df_cl_sorted = inactive_df_cl.sort_values("date").reset_index(drop=True)

q50 = inactive_df_cl_sorted["date"].quantile(0.5)
q80 = inactive_df_cl_sorted["date"].quantile(0.8)

train_main = inactive_df_cl_sorted[(inactive_df_cl_sorted['date'] > q50) & (inactive_df_cl_sorted['date'] <= q80)]
val_main = inactive_df_cl_sorted[(inactive_df_cl_sorted['date'] > q80)]

y_train = train_main["fire_within_7d"]
y_val = val_main["fire_within_7d"]
drop_cols = [
    "fire_within_7d",
    "days_to_fire",
    "nearest_fire_date",
    "date_in","date_out","pile_id","date",
    "w","pile_len","days_to_fire_cap","y_log"
]

drop_cols = ["days_to_fire","nearest_fire_date","date_in","date_out","pile_id","date", "w", "pile_len", "days_to_fire_cap", "y_log", "fire_within_7d" ]
X_train = train_main.drop(columns=drop_cols, errors="ignore")
X_val = val_main.drop(columns=drop_cols, errors = "ignore")
active_df = active_df.drop(columns=drop_cols, errors="ignore")

cat_features = [c for c in ["Склад","Штабель","coal_type"] if c in X_train.columns]

In [127]:
y_train.value_counts()

Unnamed: 0_level_0,count
fire_within_7d,Unnamed: 1_level_1
0,6120
1,1709


In [128]:
X_train.columns

Index(['Склад', 'Штабель', 'coal_type', 'mass', 'Максимальная температура',
       't', 'humidity', 'precipitation', 'v_avg', 'v_max', 'cloudcover',
       'weather_code', 'had_fire_before', 'temp_mean_3d', 'temp_max_3d',
       'temp_mean_7d', 'temp_max_7d', 'days_in'],
      dtype='object')

In [129]:
y_train

Unnamed: 0,fire_within_7d
13150,1
13151,1
13152,1
13153,0
13154,1
...,...
20974,0
20975,0
20976,0
20977,0


In [130]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import numpy as np

n0 = (y_train == 0).sum()
n1 = (y_train == 1).sum()
w0 = 1.0
w1 = n0 / n1

print(w0, w1)

modelCl = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.03,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
    class_weights=[w0, w1]
)


modelCl.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    use_best_model=True
)

1.0 3.5810415447630195
0:	test: 0.8307604	best: 0.8307604 (0)	total: 16.3ms	remaining: 8.12s
200:	test: 0.8215952	best: 0.8424770 (5)	total: 3.18s	remaining: 4.73s
400:	test: 0.8239411	best: 0.8424770 (5)	total: 6.82s	remaining: 1.68s
499:	test: 0.8255262	best: 0.8424770 (5)	total: 8.91s	remaining: 0us

bestTest = 0.8424770406
bestIteration = 5

Shrink model to first 6 iterations.


<catboost.core.CatBoostClassifier at 0x78ac7c7c2ea0>

In [131]:
drop_cols = ["fire_within_7d", "days_to_fire","nearest_fire_date",
             "date_in","date_out","pile_id","date",
             "w","pile_len","days_to_fire_cap","y_log"]

X_active = active_df.drop(columns=drop_cols, errors="ignore")

proba_val = modelCl.predict_proba(X_val)[:, 1]

# сетка порогов
thresholds = np.linspace(0.01, 0.99, 99)

best_thr = None
best_score = -1

scores = []
for thr in thresholds:
    pred_val = (proba_val >= thr).astype(int)
    score = f1_score(y_val, pred_val)
    scores.append(score)
    if score > best_score:
        best_score = score
        best_thr = thr

proba_active = modelCl.predict_proba(X_active)[:, 1]
pred_active  = (proba_active > best_thr).astype(int)

print("Best threshold:", best_thr)
print("Best F1:", best_score)

active_df["proba_fire_within_7d"] = proba_active
active_df["fire_within_7d"] = pred_active

pred_active[:80]

Best threshold: 0.6
Best F1: 0.7764227642276422


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [132]:
fi = modelCl.get_feature_importance(type="PredictionValuesChange")
feat_names = X_train.columns

fi_df = pd.DataFrame({"feature": feat_names, "importance": fi})
fi_df = fi_df.sort_values("importance", ascending=False)

print(fi_df.head(20))

                     feature  importance
0                      Склад   72.359887
15              temp_mean_7d   13.119702
16               temp_max_7d    4.427445
14               temp_max_3d    2.911579
6                   humidity    2.624701
11              weather_code    2.170279
3                       mass    0.929482
5                          t    0.662422
7              precipitation    0.324152
9                      v_max    0.319982
17                   days_in    0.150371
1                    Штабель    0.000000
2                  coal_type    0.000000
4   Максимальная температура    0.000000
8                      v_avg    0.000000
13              temp_mean_3d    0.000000
10                cloudcover    0.000000
12           had_fire_before    0.000000


In [133]:
active_df.shape

(159, 20)

In [134]:
active_df['fire_within_7d'].value_counts()

Unnamed: 0_level_0,count
fire_within_7d,Unnamed: 1_level_1
0,101
1,58


In [135]:
active_df

Unnamed: 0,Склад,Штабель,coal_type,mass,Максимальная температура,t,humidity,precipitation,v_avg,v_max,cloudcover,weather_code,had_fire_before,temp_mean_3d,temp_max_3d,temp_mean_7d,temp_max_7d,days_in,proba_fire_within_7d,fire_within_7d
3946,3,26,A1,4040.1265,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,12,0.283828,0
3963,3,26,A1,185.5805,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,12,0.268081,0
3982,3,26,A1,3028.0910,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,11,0.268081,0
4006,3,26,A1,200.2685,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,11,0.268081,0
4028,3,26,A1,2526.3445,37.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,36.566667,37.3,36.342857,38.3,8,0.268081,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33527,6,10,A1,2890.3060,369.0,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,146.200000,369.0,83.800000,369.0,6,0.619008,1
33541,6,10,C3,1693.7525,369.0,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,146.200000,369.0,83.800000,369.0,6,0.619008,1
33555,6,10,B2,1203.6255,369.0,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,146.200000,369.0,83.800000,369.0,6,0.619008,1
33803,6,45,C3,11708.6225,41.3,24.154167,69.916667,0.0,11.1375,15.829167,5.0,0.083333,0,42.433333,48.1,40.600000,48.1,12,0.564310,0
