In [1]:
import pandas as pd
import torch.nn as nn
import torch
from tqdm import tqdm
from torch import Tensor
import numpy as np
import torch.nn.functional as F
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Данные

In [2]:
test = pd.read_csv('../data/test.csv')

In [3]:
def preprocess(df):
    df['activation_date'] = pd.to_datetime(df['activation_date'])

    df['day'] = df['activation_date'].dt.day
    df['month'] = df["activation_date"].dt.month
    df['year'] = df["activation_date"].dt.year
    df['weekday'] = df['activation_date'].dt.weekday
    df["dayofyear"] = df['activation_date'].dt.dayofyear
    df.drop(columns=['activation_date', 'item_id'], inplace=True)
    df['param_1'] = df['param_1'].fillna('')
    df['param_2'] = df['param_2'].fillna('')
    df['param_3'] = df['param_3'].fillna('')
    df['description'] = df['description'].fillna('')
    return df

item_id = test.item_id
test = preprocess(test)

### Transformer

In [38]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.layer_1 = nn.Linear(input_dim, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_1(x)
        x = F.gelu(x)  # Более плавная активация
        x = self.dropout(x)
        return self.layer_2(x)

class AddAndNorm(nn.Module):
    def __init__(self, input_dim, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, residual):
        return self.norm(x + self.dropout(residual))
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(1)].detach()  # Отключаем градиенты
        return self.dropout(x)
    

class TransformerEncoderLayer(nn.Module):
    def __init__(self, input_dim, num_heads, dropout=0.1, positional_encoding=False):
        super().__init__()
        self.input_dim = input_dim
        self.self_attention = nn.MultiheadAttention(input_dim, num_heads, dropout=dropout, batch_first=True)
        self.feed_forward = PositionWiseFeedForward(input_dim, input_dim, dropout=dropout)
        self.add_norm_after_attention = AddAndNorm(input_dim, dropout=dropout)
        self.add_norm_after_ff = AddAndNorm(input_dim, dropout=dropout)
        self.positional_encoding = PositionalEncoding(input_dim) if positional_encoding else None

    def forward(self, key, value, query):
        if self.positional_encoding:
            key = self.positional_encoding(key)
            value = self.positional_encoding(value)
            query = self.positional_encoding(query)

        attn_output, _ = self.self_attention(query, key, value, need_weights=False)

        x = self.add_norm_after_attention(attn_output, query)

        ff_output = self.feed_forward(x)
        x = self.add_norm_after_ff(ff_output, x)

        return x

In [78]:
class MultiModalTransformer(nn.Module):
    def __init__(self, first_dim=768, second_dim=1024, hidden_dim=512, num_transformer_heads=2, positional_encoding=True, dropout=0, mode='mean', device="cuda",  tr_layer_number=1, out_features=128):
        super(MultiModalTransformer, self).__init__()

        self.mode = mode

        self.hidden_dim = hidden_dim

        # Проекционные слои

        self.first_proj = nn.Sequential(
            nn.Conv1d(first_dim, hidden_dim, 1),
            nn.GELU(),
        )

        self.second_proj = nn.Sequential(
            nn.Conv1d(second_dim, hidden_dim, 1),
            nn.GELU(),
        )

        # Механизмы внимания
        self.first_to_second_attn = nn.ModuleList([TransformerEncoderLayer(input_dim=hidden_dim, num_heads=num_transformer_heads, positional_encoding=positional_encoding, dropout=dropout) for i in range(tr_layer_number)
                ])
        self.second_to_first_attn = nn.ModuleList([TransformerEncoderLayer(input_dim=hidden_dim, num_heads=num_transformer_heads, positional_encoding=positional_encoding, dropout=dropout) for i in range(tr_layer_number)
                ])

        self.out = nn.Sequential(
            nn.Linear(hidden_dim*2, out_features) if self.mode == 'mean' else nn.Linear(hidden_dim*4, out_features),
            nn.LayerNorm(out_features),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, 1)
        )

    def forward(self, first_features, second_features):
        # Преобразование размерностей
        first_features = first_features.float()
        second_features = second_features.float()

        first_features = self.first_proj(first_features.permute(0,2,1)).permute(0,2,1)
        second_features = self.second_proj(second_features.permute(0,2,1)).permute(0,2,1)

        # Адаптивная пуллинг до минимальной длины
        min_seq_len = min(first_features.size(1), second_features.size(1))
        first_features = F.adaptive_avg_pool1d(first_features.permute(0,2,1), min_seq_len).permute(0,2,1)
        second_features = F.adaptive_avg_pool1d(second_features.permute(0,2,1), min_seq_len).permute(0,2,1)

        # Трансформерные блоки
        for i in range(len(self.first_to_second_attn)):
            attn_first = self.first_to_second_attn[i](second_features, first_features, first_features)
            attn_second = self.second_to_first_attn[i](first_features, second_features, second_features)
            first_features += attn_first
            second_features += attn_second

        # Статистики
        std_first, mean_first = torch.std_mean(attn_first, dim=1)
        std_second, mean_second = torch.std_mean(attn_second, dim=1)

        # Классификация
        if self.mode == 'mean':
            return self.out(torch.cat([mean_first, mean_first], dim=1))
        else:
            std_first = torch.nan_to_num(std_first, nan=0.0)
            std_second = torch.nan_to_num(std_second, nan=0.0)
            return self.out(torch.cat([mean_first, std_first, mean_second, std_second], dim=1))

In [7]:
import os
jina_list = sorted(os.listdir('../data/jina'), key= lambda x: int(x.replace("jina_test_", "")))

In [6]:
import pickle
import io

class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else:
            return super().find_class(module, name)

In [42]:
model = model = MultiModalTransformer(first_dim=1024, second_dim=768)
checkpoint = torch.load("models/MultiModalTransformer_5_0.82_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [43]:
y_pred = []
jina_list_ind = -1
len_test = test.shape[0]
x = 200
for i, row in tqdm(test.iterrows(), total=len_test):
    # text
    if i % 10000 == 0:
        jina_list_ind += 1
        jina_name = jina_list[jina_list_ind]
        with open("../data/jina/" + jina_name, "rb") as f:  
            jina_emb = CPU_Unpickler(f).load()
    # image
    if i == 32001:
        x = 100
    if i <= 36600 and i % 200 == 0:
        with open("../data/vit/vit_test_jpg_" + str(i-1+200), "rb") as f: 
            vit_emb = CPU_Unpickler(f).load() 
    if i > 36600 and i % 100 == 0:
        try:
            with open("../data/vit/vit_test_jpg_" + str(i-1+100), "rb") as f: 
                vit_emb = CPU_Unpickler(f).load() 
        except:
            vit_emb = [None] * 100
    image_embedding = vit_emb[i % x]

    text_embedding = jina_emb[i % 10000].unsqueeze(0)

    if image_embedding is None:
        image_embedding = torch.zeros(1, 1, 768)
    else:
        if image_embedding.shape[0] != 1:
            image_embedding = image_embedding.unsqueeze(0)
    y_pred.append(float(model(text_embedding, image_embedding)))

  std_first, mean_first = torch.std_mean(attn_first, dim=1)
  std_second, mean_second = torch.std_mean(attn_second, dim=1)
100%|██████████| 508438/508438 [13:00<00:00, 651.50it/s] 


In [44]:
result = np.clip(y_pred, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result}).to_csv("../results/cross-attention_mean_.csv", index=0)

Результат: 0.25820

In [53]:
model = model = MultiModalTransformer(first_dim=1024, second_dim=768, mode='not_mean')
checkpoint = torch.load("models/MultiModalTransformer_5_0.81_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [54]:
y_pred = []
jina_list_ind = -1
len_test = test.shape[0]
x = 200
for i, row in tqdm(test.iterrows(), total=len_test):
    # text
    if i % 10000 == 0:
        jina_list_ind += 1
        jina_name = jina_list[jina_list_ind]
        with open("../data/jina/" + jina_name, "rb") as f:  
            jina_emb = CPU_Unpickler(f).load()
    # image
    if i == 32001:
        x = 100
    if i <= 36600 and i % 200 == 0:
        with open("../data/vit/vit_test_jpg_" + str(i-1+200), "rb") as f: 
            vit_emb = CPU_Unpickler(f).load() 
    if i > 36600 and i % 100 == 0:
        try:
            with open("../data/vit/vit_test_jpg_" + str(i-1+100), "rb") as f: 
                vit_emb = CPU_Unpickler(f).load() 
        except:
            vit_emb = [None] * 100
    image_embedding = vit_emb[i % x]

    text_embedding = jina_emb[i % 10000].unsqueeze(0)

    if image_embedding is None:
        image_embedding = torch.zeros(1, 1, 768)
    else:
        if image_embedding.shape[0] != 1:
            image_embedding = image_embedding.unsqueeze(0)
    y_pred.append(float(model(text_embedding, image_embedding)))

  std_first, mean_first = torch.std_mean(attn_first, dim=1)
  std_second, mean_second = torch.std_mean(attn_second, dim=1)
100%|██████████| 508438/508438 [13:35<00:00, 623.69it/s] 


In [55]:
result = np.clip(y_pred, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result}).to_csv("../results/cross-attention_not_mean_.csv", index=0)

Результат: 0.36499

In [46]:
from torch_geometric.nn import GATConv, RGCNConv, TransformerConv

  from .autonotebook import tqdm as notebook_tqdm


In [47]:
class GraphFusionLayerAtt(nn.Module):
    def __init__(self, hidden_dim, heads=2):
        super().__init__()
        # Проекционные слои для признаков
        self.proj_audio = nn.Linear(hidden_dim, hidden_dim)
        self.proj_text = nn.Linear(hidden_dim, hidden_dim)

        # Графовые слои
        self.gat1 = GATConv(hidden_dim, hidden_dim, heads=heads)
        self.gat2 = GATConv(hidden_dim*heads, hidden_dim)

        self.attention_fusion = nn.Linear(hidden_dim, 1)

        # Финальная проекция
        self.fc = nn.Linear(hidden_dim, hidden_dim)

    def build_complete_graph(self, num_nodes):
        # Создаем полный граф (каждый узел соединен со всеми)
        edge_index = []
        for i in range(num_nodes):
            for j in range(num_nodes):
                if i != j:
                    edge_index.append([i, j])
        return torch.tensor(edge_index).t().contiguous()

    def forward(self, first_stats, second_stats):
        """
        first_stats: [batch_size, hidden_dim]
        second_stats: [batch_size, hidden_dim]
        """
        batch_size = first_stats.size(0)

        # Проекция признаков
        x_first = F.relu(self.proj_audio(first_stats))  # [batch_size, hidden_dim]
        x_second = F.relu(self.proj_text(second_stats))    # [batch_size, hidden_dim]

        # Объединение узлов (1 и 2 попеременно)
        nodes = torch.stack([x_first, x_second], dim=1)  # [batch_size, 2, hidden_dim]
        nodes = nodes.view(-1, nodes.size(-1))        # [batch_size*2, hidden_dim]

        # Построение графа (полный граф для каждого элемента батча)
        edge_index = self.build_complete_graph(2)  # Граф для одной пары 1-2
        edge_index = edge_index.to(first_stats.device)

        # Применение GAT
        x = F.relu(self.gat1(nodes, edge_index))
        x = self.gat2(x, edge_index)

        # Разделяем обратно аудио и текст
        x = x.view(batch_size, 2, -1)  # [batch_size, 2, hidden_dim]

        # Усреднение по модальностям
        # fused = torch.mean(x, dim=1)   # [batch_size, hidden_dim]

        weights = F.softmax(self.attention_fusion(x), dim=1)
        fused = torch.sum(weights * x, dim=1)  # [batch_size, hidden_dim]

        return self.fc(fused)

In [68]:
class MultiModalTransformer(nn.Module):
    def __init__(self, first_dim=768, second_dim=1024, hidden_dim=512, num_transformer_heads=2, positional_encoding=True, dropout=0, mode='mean', device="cuda",  tr_layer_number=1, out_features=128, num_heads=2):
        super(MultiModalTransformer, self).__init__()

        self.mode = mode

        self.hidden_dim = hidden_dim

        # Проекционные слои

        self.first_proj = nn.Sequential(
            nn.Conv1d(first_dim, hidden_dim, 1),
            nn.GELU(),
        )

        self.second_proj = nn.Sequential(
            nn.Conv1d(second_dim, hidden_dim, 1),
            nn.GELU(),
        )

        # Механизмы внимания
        self.first_to_second_attn = nn.ModuleList([TransformerEncoderLayer(input_dim=hidden_dim, num_heads=num_transformer_heads, positional_encoding=positional_encoding, dropout=dropout) for i in range(tr_layer_number)
                ])
        self.second_to_first_attn = nn.ModuleList([TransformerEncoderLayer(input_dim=hidden_dim, num_heads=num_transformer_heads, positional_encoding=positional_encoding, dropout=dropout) for i in range(tr_layer_number)
                ])
        
        # Графовое слияние
        if self.mode == 'mean':
            self.graph_fusion = GraphFusionLayerAtt(hidden_dim, heads=num_heads)
        else:
            self.graph_fusion = GraphFusionLayerAtt(hidden_dim*2, heads=num_heads)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, out_features) if self.mode == 'mean' else nn.Linear(hidden_dim*2, out_features),
            nn.LayerNorm(out_features),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(out_features, 1)
        )

    def forward(self, first_features, second_features):
        # Преобразование размерностей
        first_features = first_features.float()
        second_features = second_features.float()

        first_features = self.first_proj(first_features.permute(0,2,1)).permute(0,2,1)
        second_features = self.second_proj(second_features.permute(0,2,1)).permute(0,2,1)

        # Адаптивная пуллинг до минимальной длины
        min_seq_len = min(first_features.size(1), second_features.size(1))
        first_features = F.adaptive_avg_pool1d(first_features.permute(0,2,1), min_seq_len).permute(0,2,1)
        second_features = F.adaptive_avg_pool1d(second_features.permute(0,2,1), min_seq_len).permute(0,2,1)

        # Трансформерные блоки
        for i in range(len(self.first_to_second_attn)):
            attn_first = self.first_to_second_attn[i](second_features, first_features, first_features)
            attn_second = self.second_to_first_attn[i](first_features, second_features, second_features)
            first_features += attn_first
            second_features += attn_second

        # Статистики
        std_first, mean_first = torch.std_mean(attn_first, dim=1)
        std_second, mean_second = torch.std_mean(attn_second, dim=1)

        # Графовое слияние статистик
        if self.mode == 'mean':
            h_ta = self.graph_fusion(mean_first, mean_second)
        else:
            std_first = torch.nan_to_num(std_first, nan=0.0)
            std_second = torch.nan_to_num(std_second, nan=0.0)
            h_ta = self.graph_fusion(torch.cat([mean_first, std_first], dim=1), torch.cat([mean_second, std_second], dim=1))

        # Классификация
        return self.classifier(h_ta)

In [49]:
model = model = MultiModalTransformer(first_dim=1024, second_dim=768)
checkpoint = torch.load("models/MultiModalTransformer_3_0.81_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [50]:
y_pred = []
jina_list_ind = -1
len_test = test.shape[0]
x = 200
for i, row in tqdm(test.iterrows(), total=len_test):
    # text
    if i % 10000 == 0:
        jina_list_ind += 1
        jina_name = jina_list[jina_list_ind]
        with open("../data/jina/" + jina_name, "rb") as f:  
            jina_emb = CPU_Unpickler(f).load()
    # image
    if i == 32001:
        x = 100
    if i <= 36600 and i % 200 == 0:
        with open("../data/vit/vit_test_jpg_" + str(i-1+200), "rb") as f: 
            vit_emb = CPU_Unpickler(f).load() 
    if i > 36600 and i % 100 == 0:
        try:
            with open("../data/vit/vit_test_jpg_" + str(i-1+100), "rb") as f: 
                vit_emb = CPU_Unpickler(f).load() 
        except:
            vit_emb = [None] * 100
    image_embedding = vit_emb[i % x]

    text_embedding = jina_emb[i % 10000].unsqueeze(0)

    if image_embedding is None:
        image_embedding = torch.zeros(1, 1, 768)
    else:
        if image_embedding.shape[0] != 1:
            image_embedding = image_embedding.unsqueeze(0)
    y_pred.append(float(model(text_embedding, image_embedding)))

  std_first, mean_first = torch.std_mean(attn_first, dim=1)
  std_second, mean_second = torch.std_mean(attn_second, dim=1)
100%|██████████| 508438/508438 [19:49<00:00, 427.57it/s]  


In [51]:
result = np.clip(y_pred, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result}).to_csv("../results/cross-attention_mean_graph_fusion.csv", index=0)

Результат: 0.24969

In [57]:
model = model = MultiModalTransformer(first_dim=1024, second_dim=768, mode='not_mean')
checkpoint = torch.load("models/MultiModalTransformer_6_0.82_checkpoint.pth", map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [69]:
y_pred = []
jina_list_ind = -1
len_test = test.shape[0]
x = 200
for i, row in tqdm(test.iterrows(), total=len_test):
    # text
    if i % 10000 == 0:
        jina_list_ind += 1
        jina_name = jina_list[jina_list_ind]
        with open("../data/jina/" + jina_name, "rb") as f:  
            jina_emb = CPU_Unpickler(f).load()
    # image
    if i == 32001:
        x = 100
    if i <= 36600 and i % 200 == 0:
        with open("../data/vit/vit_test_jpg_" + str(i-1+200), "rb") as f: 
            vit_emb = CPU_Unpickler(f).load() 
    if i > 36600 and i % 100 == 0:
        try:
            with open("../data/vit/vit_test_jpg_" + str(i-1+100), "rb") as f: 
                vit_emb = CPU_Unpickler(f).load() 
        except:
            vit_emb = [None] * 100
    image_embedding = vit_emb[i % x]

    text_embedding = jina_emb[i % 10000].unsqueeze(0)

    if image_embedding is None:
        image_embedding = torch.zeros(1, 1, 768)
    else:
        if image_embedding.shape[0] != 1:
            image_embedding = image_embedding.unsqueeze(0)
    y_pred.append(float(model(text_embedding, image_embedding)))

  0%|          | 0/508438 [00:00<?, ?it/s]

  std_first, mean_first = torch.std_mean(attn_first, dim=1)
  std_second, mean_second = torch.std_mean(attn_second, dim=1)
100%|██████████| 508438/508438 [27:20<00:00, 310.01it/s]  


In [77]:
result = np.clip(y_pred, 0, 1)
pd.DataFrame({'item_id': item_id, 'deal_probability': result}).to_csv("../results/cross-attention_not_mean_graph_fusion.csv", index=0)