In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from torch_geometric.nn import GCNConv
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import numpy as np
import scipy.sparse as sp
import pandas as pd
import argparse

device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
# 设置环境变量
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4"
import gc
gc.collect()

0

In [3]:
class BertGCN(nn.Module):
    def __init__(self, num_vocab, bert_model_name='bert-base-uncased', gcn_hidden_dim=128, num_classes=2, num_words=5000, m=0.7):
        super(BertGCN, self).__init__()
        self.m = m
        self.num_words = num_words
        self.bert_model = AutoModel.from_pretrained(bert_model_name)
        self.gcn = GCNConv(1024, gcn_hidden_dim)  # 1024 是 BERT 的 hidden size
        self.classifier_gcn = nn.Linear(gcn_hidden_dim, num_classes)
        self.classifier_bert = nn.Linear(1024, num_classes)
        self.word_embeddings = nn.Parameter(torch.zeros(num_vocab, 1024)) # 初始化词节点的特征为零向量

    def forward(self, edge_index, edge_weight, input_ids, attention_mask, doc_indices, word_indices):
        # 获取文档节点的BERT特征
        bert_outputs = self.bert_model(input_ids, attention_mask)
        doc_embeddings = bert_outputs.last_hidden_state[:, 0, :]  # [CLS] token 的嵌入表示

        word_embeddings = self.word_embeddings.to(doc_embeddings.device)
        
        # GCN部分
        combined_embeddings = torch.cat([doc_embeddings, word_embeddings], dim=0)
        
        gcn_embeddings = self.gcn(combined_embeddings, edge_index, edge_weight=edge_weight)
        
        # 仅提取文档节点的GCN特征
        gcn_doc_embeddings = gcn_embeddings[:len(doc_indices)]
        
        # 分类器输出
        gcn_logits = self.classifier_gcn(gcn_doc_embeddings)
        gcn_probs = F.softmax(gcn_logits, dim=1)
        
        # BERT部分的预测（仅基于文档节点）
        bert_logits = self.classifier_bert(doc_embeddings)
        bert_probs = F.softmax(bert_logits, dim=1)
        
        # 平衡BERT和GCN的预测结果
        final_probs = self.m * gcn_probs + (1 - self.m) * bert_probs
        return final_probs

In [4]:

data_path = '/root/Mcqueen/kaggle_c/MyBertGCN/data/train.csv'
bert_model_name='/root/Mcqueen/kaggle_c/model/tuned_deberta_v3large/output_8/checkpoint-980'
epochs=20
batch_size=2
lr=1e-3
m=0.7


# 加载图数据
adj = sp.load_npz('./graph/adj.npz')
features = sp.load_npz('./graph/features.npz').todense()
labels = torch.load('./graph/labels.pt').to(device) - 1



In [5]:
adj = adj.tocoo()  # 将csr格式存储的稀疏矩阵转换成coo格式
edge_index = torch.tensor(np.vstack((adj.row, adj.col)), dtype=torch.long).to(device)
edge_weight = torch.tensor(adj.data, dtype=torch.float).to(device)

# 读取数据集
df = pd.read_csv(data_path)
num_docs = len(df)
num_words = adj.shape[0] - num_docs - 1 # 词汇表的大小
doc_indices = torch.arange(num_docs, dtype=torch.long).to(device)  + 1
word_indices = torch.arange(num_words, dtype=torch.long).to(device) + num_docs + 1


In [6]:
print(edge_index.shape)
print(adj.data[:20])  # 这里的data是稀疏矩阵的值，实际上是节点之间的similariy

print(num_docs)
print(num_words)

torch.Size([2, 5910356])
[0.04388138 0.04347523 0.05358129 0.0647622  0.05047925 0.04459067
 0.05354018 0.05184324 0.04772471 0.06099653 0.01646671 0.04941215
 0.05743867 0.01493714 0.04330815 0.0381083  0.10602278 0.0164276
 0.13424944 0.05489831]
17307
5000


In [7]:
tokenizer = AutoTokenizer.from_pretrained("/root/Mcqueen/kaggle_c/pretrained_llm/deberta-v3-large")
all_encodings = tokenizer(df['full_text'].values.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
# train_encodings = tokenizer(df['full_text'].values[train_indices].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
# val_encodings = tokenizer(df['full_text'].values[val_indices].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')

for key in all_encodings.keys():
    all_encodings[key] = all_encodings[key].to(device)




In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, edge_index, edge_weight, y, input_ids, attention_mask, doc_indices, word_indices):
        self.edge_index = edge_index
        self.edge_weight = edge_weight
        self.y = y
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.doc_indices = doc_indices
        self.word_indices = word_indices

    def __len__(self):
        return len(self.doc_indices)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if idx.start is None or idx.stop is None:
                raise ValueError("slice start and stop cannot be None")
            step = idx.step if idx.step is not None else 1
            idx = list(range(idx.start, idx.stop, step))
        idx = idx.tolist()
        print("idx",idx)
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]
        doc_idx = [item+1 for item in idx]  # 因为文章都排在前面

        # 创建index到老index的映射
        selected_docs = doc_idx
        all_indices = selected_docs + self.word_indices.tolist()
        idx_map = {old_idx: new_idx + 1 for new_idx, old_idx in enumerate(all_indices)}

        # 处理边，使得不应该存在的边无效，有效的边进行端点的重新映射

        num_docs = len(self.doc_indices)

        print(num_docs)

        new_edge_index = self.edge_index.clone()
        new_edge_index[new_edge_index > num_docs] -= (num_docs - len(idx))  # 处理word节点
        # 处理doc节点
        temp = new_edge_index[self.edge_index <= num_docs].cpu().numpy()
        mapped_indices = np.vectorize(lambda x: idx_map.get(x, -1))(temp)
        new_edge_index[self.edge_index <= num_docs] = torch.tensor(mapped_indices, device=new_edge_index.device)
        mask = (new_edge_index == -1).any(dim=0)
        filtered_new_edge_index = new_edge_index[:, ~mask]

        # 更新edge_weight
        filtered_edge_weight = self.edge_weight[~mask]  # 要传递梯度
        
        # 获取新的doc和word索引
        new_doc_indices = torch.tensor([idx_map[i] for i in doc_idx], dtype=torch.long)
        new_word_indices = torch.tensor([idx_map[i.item()] for i in self.word_indices], dtype=torch.long)

        return CustomDataset(filtered_new_edge_index, filtered_edge_weight, self.y[doc_idx], input_ids, attention_mask, new_doc_indices, new_word_indices)



data_all = CustomDataset(edge_index, edge_weight, labels, all_encodings['input_ids'], all_encodings['attention_mask'], doc_indices, word_indices)
train_indices, val_indices = train_test_split(np.arange(num_docs), test_size=0.1)
print(train_indices)

data_train = data_all[train_indices]
data_val = data_all[val_indices]


print(edge_weight.device)
print(all_encodings['input_ids'])

In [None]:
# print(len(data_all))
print(edge_index[:, :10])
# print(data_all.edge_index[:, :100])
# print((data_all.edge_index==1).sum())
print(data_all[[0]].edge_index[:, :10])

In [10]:
import random
from sklearn.metrics import cohen_kappa_score
from torch.utils.tensorboard import SummaryWriter
import time
formatted_time = time.strftime("%Y%m%H%M%S", time.localtime())

writer = SummaryWriter(f'runs/{formatted_time}')

def qwk(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')


def train(model, data_train=data_train, data_eval=data_val, batch_size=2, eval_batch_size=4, num_epochs=5, lr=0.001):
    global_step = 0
    optimizer = Adam(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        model.train()
        print(f'Epoch {epoch} begin ...')
        total_loss = 0.0
        epoch_step = 0
        all_pred = []
        idx_list = list(range(len(data_train)))
        random.shuffle(idx_list)
        for i in range(0, len(idx_list), batch_size):
            epoch_step += 1
            global_step += 1
            if i + batch_size < len(idx_list):
                idxs = idx_list[i: i+batch_size]
            else:
                idxs = idx_list[i:]
            optimizer.zero_grad()
            batch = data_train[idxs]
            input_ids, attention_mask, doc_indices, edge_index, edge_weight, word_indices, labels = batch.input_ids, batch.attention_mask, batch.doc_indices, batch.edge_index, batch.edge_weight, batch.word_indices, batch.y
            out = model(edge_index, edge_weight, input_ids, attention_mask, doc_indices, word_indices)
            pred = out.argmax(dim=1)
            all_pred += pred.tolist()
            loss = F.cross_entropy(out, labels)
            loss.backward()
            optimizer.step()
            writer.add_scalar('Loss/Train', loss.item(), global_step)
            print('new iteration')
            total_loss += loss.item()
            if epoch_step % 1 == 0 or epoch_step == 1:
                print(f'    Epoch {epoch} Iteration {epoch_step} Avg train loss: {(total_loss/epoch_step):.3f}')
        print(f'Epoch {epoch} Train loss: {(total_loss / (len(idx_list)//batch_size + 1)):.3f}\n')
        epoch_qwk = qwk(data_train.y.tolist(), all_pred)
        print(f'QWK score: {epoch_qwk:.3f}')
        writer.add_scalar('QWK/Train', epoch_qwk, epoch)
    print
    evaluate(model, data_eval=data_eval, batch_size=eval_batch_size, curr_epoch=epoch)


def evaluate(model, data_eval, batch_size, curr_epoch=None):
    print('\nEvaluate ...')
    model.eval()
    total_loss = 0
    total_correct = 0
    idx_list = list(range(len(data_eval)))
    random.shuffle(idx_list)
    all_pred = []
    eval_step = 0
    for i in range(0, len(idx_list), batch_size):
        eval_step += 1
        if i + batch_size < len(idx_list):
            idxs = idx_list[i: i+batch_size]
        else:
            idxs = idx_list[i:]
        batch = data_train[idxs]
        input_ids, attention_mask, doc_indices, edge_index, edge_weight, word_indices, labels = batch.input_ids, batch.attention_mask, batch.doc_indices, batch.edge_index, batch.edge_weight, batch.word_indices, batch.y
        with torch.no_grad():
            out = model(edge_index, edge_weight, input_ids, attention_mask, doc_indices, word_indices)
            loss = F.cross_entropy(out, labels)
            total_loss += loss.item()
            pred = out.argmax(dim=1)
            all_pred += pred.tolist()
            correct = (pred == batch.y).sum().item()
            total_correct += correct
            if eval_step % 100 == 0 or eval_step == 1:
                print(f'Avg eval loss: {(total_loss/eval_step):.3f}, Avg eval acc: {(total_correct/(batch_size*eval_step)):.3f}')
    avg_loss = total_loss / (len(idx_list)//batch_size + 1)
    accuracy = total_correct / len(idx_list)
    eval_qwk = qwk(data_eval.y.tolist(), all_pred)
    print(data_eval.y.tolist(), all_pred)
    print(f'QWK score: {eval_qwk:.3f}')
    if curr_epoch:
        writer.add_scalar('Loss/Val', avg_loss, curr_epoch)
        writer.add_scalar('Accuracy/Val', accuracy, curr_epoch)
        writer.add_scalar('QWK/Val', eval_qwk, curr_epoch)
    return avg_loss, accuracy

In [16]:
# 初始化模型
model = BertGCN(num_vocab=num_words, bert_model_name=bert_model_name, num_classes=len(set(labels.tolist())), m=m, num_words=num_words)
model = model.to(device)

# 初始化优化器
optimizer = Adam(model.parameters(), lr=0.001)

# 打印优化器参数名称和形状
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Parameter name: {name}, shape: {param.shape}")

total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params} ({total_params*4/(1024*1024):.2f} MB)')

Total parameters: 439270284 (1675.68 MB)


In [None]:
torch.backends.cudnn.enabled = False
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

train(model=model, data_train=data_train, batch_size=2, num_epochs=5, lr=0.0003)




In [None]:
torch.cuda.empty_cache()
gc.collect()

0

In [15]:
evaluate(model=model, data_eval=data_val[0:10], batch_size=2)


Evaluate ...
Avg eval loss: 1.550, Avg eval acc: 0.500
[2, 1, 2, 1, 2, 2, 1, 2, 2, 1] [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
QWK score: 0.000


(1.4546708861986797, 0.3)

In [17]:
print(model.word_embeddings[300:310])

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:3',
       grad_fn=<SliceBackward0>)


In [None]:
qwk([4, 1, 1, 4, 3, 2, 1, 1, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

0.0