# KMeans and MLP

In [8]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from tqdm import tqdm

RANDOM_STATE = 560
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and,
    when present, all CUDA devices), so that weight initialisation and dropout
    are repeatable between runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Documented as a silent no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # Exported for child processes; the running interpreter's hash
    # randomisation was already fixed at start-up and is not affected.
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(RANDOM_STATE)

## Загрузка данных

In [2]:
def left_top_cats(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    column: str,
    top_k: int,
    inplace: bool = False,
):
    """Keep the ``top_k`` most frequent categories of ``column``, bucket the rest.

    The frequent categories are determined on ``train_df`` only and then
    applied to both frames. Every value that is neither one of those
    categories nor missing is replaced by the string ``"other"``; missing
    values are left untouched.

    Args:
        train_df: Frame used to rank the categories by frequency.
        test_df: Frame that receives the same category mapping.
        column: Name of the categorical column to reduce.
        top_k: Number of most frequent categories to keep.
        inplace: When ``False`` (default) both frames are copied first and
            the inputs stay unchanged; when ``True`` the passed frames are
            modified.

    Returns:
        Tuple ``(train_df, test_df)`` with the reduced column.
    """
    if not inplace:
        train_df = train_df.copy()
        test_df = test_df.copy()

    top = train_df[column].value_counts().nlargest(top_k).index

    for frame in (train_df, test_df):
        values = frame[column]
        # `|` is the logical OR for boolean masks; the arithmetic `+` only
        # happens to behave the same way on boolean data.
        keep = values.isin(top) | values.isna()
        frame[column] = values.where(keep, "other")

    return train_df, test_df

In [3]:
# Name of the label column used for the split and the classifier below.
target = "end_cluster"

# Training and test tables, read from the local parquet files.
train_df, test_df = (
    pd.read_parquet(path)
    for path in ("data/train_data.pqt", "data/test_data_filled.pqt")
)

In [4]:
# The first call works on copies, so train_df / test_df stay untouched;
# the remaining columns are then reduced directly on those copies.
train_for_ae, test_for_ae = left_top_cats(train_df, test_df, "index_city_code", 40)
for column, top_k in (("channel_code", 15), ("okved", 30), ("city", 40)):
    left_top_cats(train_for_ae, test_for_ae, column, top_k, inplace=True)

# Datasets for the autoencoder; final_embeds is obtained by running the
# autoencoder over these datasets.
train_for_ae.to_parquet("data/train_ae_v.pqt")
test_for_ae.to_parquet("data/test_ae_v.pqt")

# The reduced frames are only needed on disk, so free the memory.
del train_for_ae, test_for_ae

In [12]:
# Embeddings loaded from disk and cast to float32.
feats = np.load("final_embeds.npy").astype(np.float32)

# The cells below slice `feats` positionally: the leading rows are indexed
# with masks built from train_df, the remaining rows are attached to test_df.
# Fail early if the file does not line up with the two frames.
assert feats.ndim == 2, f"expected a 2-D embedding matrix, got shape {feats.shape}"
assert feats.shape[0] == len(train_df) + len(test_df), (
    f"{feats.shape[0]} embedding rows vs "
    f"{len(train_df)} train + {len(test_df)} test rows"
)
feats.shape

(890120, 1024)

## Clusters

In [None]:
# The leading len(train_df) rows of `feats` belong to the training frame,
# the rest to the test frame.
n_train_rows = len(train_df)
embed_dim = feats.shape[1]

# Average every block of three consecutive training rows into one vector.
# NOTE(review): assumes each training id owns exactly three consecutive
# rows - confirm against the data.
# A plain NumPy reduction is enough here; no device transfer is needed.
train_user_embeds = feats[:n_train_rows].reshape(-1, 3, embed_dim).mean(axis=1)

# Build the test frame in one go (adding 1024 columns to an existing frame
# fragments it) and average the rows per id. The result is indexed by id,
# in sorted id order.
test_df_w_embeds = pd.DataFrame(
    feats[n_train_rows:],
    index=pd.Index(test_df["id"].to_numpy(), name="id"),
    columns=[f"e_{i}" for i in range(embed_dim)],
)
test_df_w_embeds = test_df_w_embeds.groupby(level="id").mean()
test_user_embeds = test_df_w_embeds.values

In [14]:
# Number of independent KMeans models; each one sees its own contiguous
# slice of the embedding dimensions.
nkmeans = 16
step = feats.shape[1] // nkmeans

train_preds, test_preds = [], []

for block in tqdm(range(nkmeans)):
    dims = slice(block * step, (block + 1) * step)
    kmeans = KMeans(n_clusters=6, random_state=RANDOM_STATE, n_init="auto")

    # Fit on the training users, then label the test users with the same model.
    train_preds.append(kmeans.fit_predict(train_user_embeds[:, dims]))
    test_preds.append(kmeans.predict(test_user_embeds[:, dims]))

100%|██████████| 16/16 [00:05<00:00,  2.75it/s]


In [15]:
# Column layout shared by both cluster tables: the id followed by one label
# column per KMeans run.
km_columns = ["id"] + [f"cluster_{i}" for i in range(nkmeans)]

# Put the per-run label vectors side by side -> (n_users, nkmeans).
train_km = np.column_stack(train_preds)
test_km = np.column_stack(test_preds)

# Training ids are taken from every third row, matching the groups of three
# rows that were averaged into the user embeddings; test ids come from the
# index of the grouped test frame.
train_km_with_id = np.column_stack([train_df["id"].values[::3], train_km])
test_km_with_id = np.column_stack([test_df_w_embeds.index.values, test_km])

train_df_km = pd.DataFrame(train_km_with_id, columns=km_columns)
test_df_km = pd.DataFrame(test_km_with_id, columns=km_columns)

# Attach the user-level cluster labels to every row of that user.
train_merged = train_df.merge(train_df_km, on="id", how="left")
test_merged = test_df.merge(test_df_km, on="id", how="left")

In [16]:
# Persist both frames together with their KMeans cluster columns.
for merged_frame, out_path in (
    (train_merged, "data/train_cluster_kmeans.pqt"),
    (test_merged, "data/test_cluster_kmeans.pqt"),
):
    merged_frame.to_parquet(out_path)

## Training

### Train test split

In [18]:
# Share of ids held out for validation.
TEST_SIZE = 0.15

train = train_df

# One (id, label) pair per id, taken from the "month_3" rows, so the split is
# done on ids and all rows of an id land on the same side.
ids_and_clusters = train.loc[train["date"] == "month_3", ["id", target]].drop_duplicates()
train_ids, test_ids = train_test_split(
    ids_and_clusters["id"],
    stratify=ids_and_clusters[target],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
)

# Row masks are computed once and reused for both the frame and the
# embedding matrix, which are aligned row by row.
train_mask = train["id"].isin(train_ids).to_numpy()
val_mask = train["id"].isin(test_ids).to_numpy()

dataset_train = train[train_mask]
dataset_val = train[val_mask]

y_train = dataset_train[target]
y_val = dataset_val[target]

# The leading len(train) rows of `feats` are the training rows.
f_train = feats[: len(train)][train_mask]
f_val = feats[: len(train)][val_mask]

# The encoder is fitted on the training labels only; the stratified split
# is what keeps every class present on the training side.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

input_size = f_train.shape[1]
output_size = len(le.classes_)

### Model

In [None]:
class MLPClassifierPyTorch(nn.Module):
    """Feed-forward classifier with two hidden layers of 256 units.

    The network returns log-probabilities (``LogSoftmax`` over dim 1), so it
    is meant to be trained with ``nn.NLLLoss``.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of classes.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden = 256
        layers = [
            nn.Linear(input_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(hidden, output_size),
        ]
        self.fc = nn.Sequential(*layers)
        # Despite the attribute name this is a log-softmax.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a batch ``x`` of shape (batch, input_size)."""
        return self.softmax(self.fc(x))

### Params

In [38]:
# Number of passes over the training tensor and size of each mini-batch.
max_iter, batch_size = 1500, 8192
# AdamW learning rate and weight decay.
lr, weight_decay = 2e-4, 1e-4

### Train Loop

In [30]:
model = MLPClassifierPyTorch(input_size, output_size).to(DEVICE)

# The model emits log-probabilities, hence the negative log-likelihood loss.
criterion = nn.NLLLoss()
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# The learning rate is multiplied by 0.998 after every epoch.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.998)

# Both splits are moved to the device once and sliced into batches there.
f_train_tensor = torch.tensor(f_train, dtype=torch.float32).to(DEVICE)
y_train_tensor = torch.tensor(y_train_enc, dtype=torch.long).to(DEVICE)

f_val_tensor = torch.tensor(f_val, dtype=torch.float32).to(DEVICE)
y_val_tensor = torch.tensor(y_val_enc, dtype=torch.long).to(DEVICE)

train_losses, val_losses = [], []
n_train, n_val = len(f_train_tensor), len(f_val_tensor)

for epoch in tqdm(range(max_iter)):
    model.train()
    running_loss = 0.0

    # Batches are taken in a fixed order (no shuffling between epochs).
    for start in range(0, n_train, batch_size):
        stop = start + batch_size
        x_batch = f_train_tensor[start:stop]
        y_batch = y_train_tensor[start:stop]

        optimizer.zero_grad()
        loss = criterion(model(x_batch), y_batch)
        loss.backward()
        optimizer.step()

        # Weight by the batch size so the epoch value is a per-sample mean.
        running_loss += loss.item() * x_batch.size(0)

    epoch_loss = running_loss / n_train
    train_losses.append(epoch_loss)

    # Validate and report every 50th epoch.
    if (epoch + 1) % 50 == 0:
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for start in range(0, n_val, batch_size):
                stop = start + batch_size
                x_val_batch = f_val_tensor[start:stop]
                y_val_batch = y_val_tensor[start:stop]

                batch_loss = criterion(model(x_val_batch), y_val_batch)
                valid_loss += batch_loss.item() * x_val_batch.size(0)

        epoch_val_loss = valid_loss / n_val
        val_losses.append(epoch_val_loss)

        print(f"Epoch [{epoch+1}/{max_iter}]")
        print(f"Training Loss: {epoch_loss:.4f}, Validation Loss: {epoch_val_loss:.4f}")

    scheduler.step()

  3%|▎         | 51/1500 [00:08<03:54,  6.18it/s]

Epoch [50/1500]
Training Loss: 0.9946, Validation Loss: 0.9914


  7%|▋         | 101/1500 [00:16<03:40,  6.35it/s]

Epoch [100/1500]
Training Loss: 0.9596, Validation Loss: 0.9577


 10%|█         | 151/1500 [00:23<03:32,  6.36it/s]

Epoch [150/1500]
Training Loss: 0.9370, Validation Loss: 0.9372


 13%|█▎        | 201/1500 [00:31<03:20,  6.48it/s]

Epoch [200/1500]
Training Loss: 0.9215, Validation Loss: 0.9221


 17%|█▋        | 251/1500 [00:39<03:21,  6.19it/s]

Epoch [250/1500]
Training Loss: 0.9115, Validation Loss: 0.9141


 20%|██        | 301/1500 [00:47<03:13,  6.19it/s]

Epoch [300/1500]
Training Loss: 0.9041, Validation Loss: 0.9076


 23%|██▎       | 351/1500 [00:55<02:56,  6.49it/s]

Epoch [350/1500]
Training Loss: 0.8980, Validation Loss: 0.9012


 27%|██▋       | 401/1500 [01:02<02:56,  6.22it/s]

Epoch [400/1500]
Training Loss: 0.8931, Validation Loss: 0.8974


 30%|███       | 451/1500 [01:10<02:49,  6.21it/s]

Epoch [450/1500]
Training Loss: 0.8893, Validation Loss: 0.8944


 33%|███▎      | 501/1500 [01:18<02:41,  6.18it/s]

Epoch [500/1500]
Training Loss: 0.8882, Validation Loss: 0.8920


 37%|███▋      | 551/1500 [01:26<02:25,  6.51it/s]

Epoch [550/1500]
Training Loss: 0.8835, Validation Loss: 0.8908


 40%|████      | 601/1500 [01:34<02:18,  6.49it/s]

Epoch [600/1500]
Training Loss: 0.8821, Validation Loss: 0.8910


 43%|████▎     | 651/1500 [01:41<02:10,  6.51it/s]

Epoch [650/1500]
Training Loss: 0.8800, Validation Loss: 0.8918


 47%|████▋     | 701/1500 [01:49<02:01,  6.58it/s]

Epoch [700/1500]
Training Loss: 0.8789, Validation Loss: 0.8898


 50%|█████     | 751/1500 [01:57<02:01,  6.18it/s]

Epoch [750/1500]
Training Loss: 0.8764, Validation Loss: 0.8840


 53%|█████▎    | 801/1500 [02:05<01:52,  6.21it/s]

Epoch [800/1500]
Training Loss: 0.8751, Validation Loss: 0.8823


 57%|█████▋    | 851/1500 [02:13<01:44,  6.20it/s]

Epoch [850/1500]
Training Loss: 0.8742, Validation Loss: 0.8817


 60%|██████    | 901/1500 [02:20<01:39,  6.03it/s]

Epoch [900/1500]
Training Loss: 0.8729, Validation Loss: 0.8829


 63%|██████▎   | 951/1500 [02:28<01:25,  6.43it/s]

Epoch [950/1500]
Training Loss: 0.8724, Validation Loss: 0.8828


 67%|██████▋   | 1001/1500 [02:36<01:20,  6.23it/s]

Epoch [1000/1500]
Training Loss: 0.8709, Validation Loss: 0.8813


 70%|███████   | 1051/1500 [02:44<01:12,  6.23it/s]

Epoch [1050/1500]
Training Loss: 0.8700, Validation Loss: 0.8819


 73%|███████▎  | 1101/1500 [02:52<01:05,  6.10it/s]

Epoch [1100/1500]
Training Loss: 0.8694, Validation Loss: 0.8804


 77%|███████▋  | 1151/1500 [03:00<00:55,  6.24it/s]

Epoch [1150/1500]
Training Loss: 0.8685, Validation Loss: 0.8797


 80%|████████  | 1201/1500 [03:08<00:47,  6.24it/s]

Epoch [1200/1500]
Training Loss: 0.8678, Validation Loss: 0.8797


 83%|████████▎ | 1251/1500 [03:15<00:39,  6.24it/s]

Epoch [1250/1500]
Training Loss: 0.8669, Validation Loss: 0.8785


 87%|████████▋ | 1301/1500 [03:23<00:31,  6.22it/s]

Epoch [1300/1500]
Training Loss: 0.8664, Validation Loss: 0.8783


 90%|█████████ | 1351/1500 [03:31<00:24,  6.19it/s]

Epoch [1350/1500]
Training Loss: 0.8662, Validation Loss: 0.8781


 93%|█████████▎| 1401/1500 [03:39<00:15,  6.26it/s]

Epoch [1400/1500]
Training Loss: 0.8657, Validation Loss: 0.8780


 97%|█████████▋| 1451/1500 [03:47<00:07,  6.43it/s]

Epoch [1450/1500]
Training Loss: 0.8648, Validation Loss: 0.8775


100%|██████████| 1500/1500 [03:55<00:00,  6.37it/s]

Epoch [1500/1500]
Training Loss: 0.8649, Validation Loss: 0.8771





### Validation

In [31]:
# Inference only: switch dropout off explicitly instead of relying on the
# mode the training loop happened to leave the model in, and skip autograd
# bookkeeping so the outputs do not keep a computation graph alive.
model.eval()

y_pred_proba_tensors = []
with torch.no_grad():
    for i in range(0, len(f_val_tensor), batch_size):
        x_val_batch = f_val_tensor[i : i + batch_size]
        # The model returns log-probabilities for the batch.
        y_pred_proba_tensors.append(model(x_val_batch))

In [32]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted mean of the one-vs-rest ROC AUC of every class.

    Args:
        y_true: True class labels.
        y_pred: Predicted class probabilities, one column per entry of
            ``labels`` (passed to ``roc_auc_score`` as the scores).
        labels: Class labels, in the column order of ``y_pred``.
        weights_dict: Mapping from label to its unnormalised weight; the
            weights are normalised to sum to one before averaging.

    Returns:
        Tuple ``(weighted_auc, per_class_auc)`` where ``per_class_auc`` is the
        array returned by ``roc_auc_score`` with ``average=None``.
    """
    raw_weights = np.array([weights_dict[label] for label in labels])
    normalised = raw_weights / raw_weights.sum()

    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    return sum(normalised * per_class_auc), per_class_auc


# Per-cluster metric weights: spreadsheet -> {cluster: unnormalised weight}.
cluster_weights = pd.read_excel("cluster_weights.xlsx")
cluster_weights = cluster_weights.set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [33]:
# The model outputs log-probabilities; a softmax over them yields the class
# probabilities again, which is what the metric expects.
val_log_probs = torch.concat(y_pred_proba_tensors)
y_pred_proba = val_log_probs.softmax(1).detach().cpu().numpy()

weighted_roc_auc(y_val, y_pred_proba, le.classes_, weights_dict)

(0.8951914936208778,
 array([0.89255291, 0.89447621, 0.86742571, 0.90845865, 0.88565006,
        0.94740019, 0.88652493, 0.87224966, 0.86525395, 0.95808388,
        0.86442687, 0.87320917, 0.86953557, 0.98864159, 0.94609204,
        0.84453281, 0.91684849]))

## Прогноз на тестовой выборке

In [23]:
# Preview: one row per id, one column per `date` value, cells hold start_cluster.
test_df.pivot(index="id", columns="date", values="start_cluster").head(3)

date,month_4,month_5,month_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000,{α},{α},{α}
200001,{α},{α},{α}
200002,{other},{other},{other}


Для того чтобы сделать прогноз на тестовой выборке, нужно заполнить стартовый кластер.<br>
В качестве базового подхода заполним все стартовые кластеры самым популярным кластером.

In [35]:
# Submission template; its class columns are overwritten with the model's
# predicted probabilities before it is written back out.
sample_submission_df = pd.read_csv("submissions/sample_submission.csv")

Для тестовой выборки будем использовать только последний месяц

In [36]:
# Embedding rows of the test frame (everything after the training rows),
# restricted to the "month_6" records. Despite its name this is a NumPy array.
last_m_test_df = feats[len(train_df) :][(test_df["date"] == "month_6").to_numpy()]

test_tensor = torch.tensor(last_m_test_df).to(torch.float32).to(DEVICE)

# Inference only: make sure dropout is off and no autograd graph is recorded.
model.eval()
y_pred_proba_tensors = []
with torch.no_grad():
    for i in range(0, len(test_tensor), batch_size):
        x_val_batch = test_tensor[i : i + batch_size]
        y_pred_proba_tensors.append(model(x_val_batch))

# Softmax over the model's log-probabilities gives the class probabilities.
test_pred_proba = torch.concat(y_pred_proba_tensors).softmax(1).detach().cpu().numpy()
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=le.classes_)
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [27]:
# Sanity check: (number of predicted rows, number of classes).
test_pred_proba_df.shape

(100000, 17)

In [28]:
# Peek at the first two rows of predicted class probabilities.
test_pred_proba_df.head(2)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.010447,0.011041,0.036975,0.0287,0.002776,0.000126,0.000576,4.5e-05,0.007295,0.007025,0.010355,0.0005,0.003733,6.347711e-06,0.001099,0.8793,1e-06
1,0.012657,0.669136,0.000862,0.001895,0.000381,0.000119,0.000105,6e-06,0.000797,0.007721,0.001,2.7e-05,0.000432,5.159997e-07,0.000223,0.304621,1.7e-05


In [37]:
# Overwrite the class columns of the template with the predictions.
# NOTE(review): this relies on the template rows being in the same order as
# the "month_6" rows of test_df (and on both frames sharing the same index);
# nothing here checks the ids - confirm before submitting.
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("mlp_submission.csv", index=False)