In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, Subset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from collections import Counter
from torchtext.vocab import vocab
from torch.cuda.amp import GradScaler, autocast

In [1]:
# Load the tab-separated training set; later cells read its 'text' and 'label' columns.
train_df = pd.read_csv('/kaggle/input/newstextclassification/train_set.csv', sep='\t')

In [None]:
def tokenizer(text):
    """Split ``text`` into tokens on whitespace.

    The anonymised corpus is already space-delimited, so tokenising it is
    nothing more than ``str.split()``.
    """
    tokens = text.split()
    return tokens

def build_vocab(train, min_freq):
    """Build a torchtext vocabulary from the ``'text'`` entry of ``train``.

    Args:
        train: DataFrame (or mapping) whose ``'text'`` entry iterates over
            whitespace-delimited strings.
        min_freq: Minimum token count, forwarded to ``vocab``.

    Returns:
        The object returned by ``vocab``, created with the ``'<pad>'`` and
        ``'<unk>'`` specials and with ``'<unk>'`` as the default index, so
        out-of-vocabulary tokens map to ``'<unk>'``.
    """
    # Count document by document instead of first joining the whole corpus into
    # one giant string: counts and first-seen order are the same, without
    # holding a second copy of the corpus in memory.
    counts = Counter()
    for text in train['text']:
        counts.update(tokenizer(text))
    # ``vocab`` takes a token -> frequency mapping; most_common() yields it in
    # descending-frequency order.
    sorted_tokens = dict(counts.most_common())
    v = vocab(sorted_tokens, min_freq=min_freq, specials=['<pad>', '<unk>'])
    v.set_default_index(v['<unk>'])
    return v

# NOTE(review): an earlier cell already loads this same file into train_df, so
# this second read looks redundant.
train_df = pd.read_csv('/kaggle/input/newstextclassification/train_set.csv', sep='\t')
min_freq = 5  # ignore tokens that occur fewer than 5 times
my_vocab = build_vocab(train_df, min_freq)

In [None]:
class MyDataset(Dataset):
    """Labelled text dataset backed by the ``text`` and ``label`` columns of a DataFrame."""

    def __init__(self, dataframe):
        # Materialise both columns as plain Python lists once, up front.
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx``.

        Token ids are looked up in the module-level ``my_vocab``.
        """
        token_ids = my_vocab(tokenizer(self.texts[idx]))
        label = int(self.labels[idx])
        return token_ids, label
    
def collate_batch(batch):
    """Collate ``(token_ids, label)`` samples into padded tensors.

    Returns:
        ``(padded_text, labels, lengths)`` where ``padded_text`` is an int64
        tensor padded per sample (batch-first) with the ``'<pad>'`` index of
        the module-level ``my_vocab``, ``labels`` is a float tensor and
        ``lengths`` is an int64 tensor of the unpadded sequence lengths.
    """
    sequences = [torch.tensor(ids, dtype=torch.int64) for ids, _ in batch]
    labels = torch.tensor([label for _, label in batch], dtype=torch.float)
    # batch_first=True pads sample by sample, giving a (batch, max_len) tensor.
    padded = pad_sequence(sequences, padding_value=my_vocab["<pad>"], batch_first=True)
    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.int64)
    return padded, labels, lengths

In [None]:
class BiLSTM(nn.Module):
    """Embedding -> (bi)directional LSTM -> linear classifier for padded token batches."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """
        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            output_dim: Number of output logits.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Dropout probability applied to the embeddings, between
                LSTM layers, and to the final hidden state.
        """
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # nn.LSTM applies dropout only between stacked layers and warns
            # when it is non-zero for a single-layer network.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )
        # The classifier input is one final hidden state per direction, so its
        # width must follow the ``bidirectional`` flag instead of assuming 2.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits for a batch-first batch of padded token ids.

        Args:
            text: Integer tensor of token ids, one padded sequence per row.
            text_lengths: Unpadded length of each row; moved to CPU because
                ``pack_padded_sequence`` is called with CPU lengths.
        """
        embedded = self.dropout(self.embedding(text))

        # Packing makes the LSTM stop at each sequence's true end, so the final
        # hidden state is not polluted by padding positions.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is used; the per-step outputs are not
        # unpacked because nothing reads them.
        _, (hidden, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # Last two entries: forward and backward states of the top layer.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(text, lengths)``.
        dataloader: Iterable of ``(text, labels, lengths)`` batches that
            supports ``len()``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, labels.long())``.
        device: Device (``torch.device`` or string) the batches are moved to.

    Returns:
        Sum of batch losses divided by ``len(dataloader)``, or ``0.0`` when the
        loader yields no batches.
    """
    model.train()

    # float16 autocast and loss scaling are CUDA features. On the CPU fallback
    # they are switched off explicitly instead of relying on PyTorch to warn
    # and disable them.
    use_amp = torch.device(device).type == 'cuda'
    # A fresh scaler is created on every call, so the loss scale restarts each epoch.
    scaler = GradScaler(enabled=use_amp)

    epoch_loss = 0.0
    for text, labels, lengths in tqdm(dataloader, desc="Train", leave=False):
        text, labels, lengths = text.to(device), labels.to(device), lengths.to(device)

        optimizer.zero_grad()
        with autocast(enabled=use_amp, dtype=torch.float16):
            predictions = model(text, lengths)
            # Labels arrive as float from the collate function; the loss needs class indices.
            loss = criterion(predictions, labels.long())

        # With the scaler disabled these calls reduce to backward() / optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

    num_batches = len(dataloader)
    # Guard against an empty loader instead of raising ZeroDivisionError.
    return epoch_loss / num_batches if num_batches else 0.0
    
    
def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Prints the macro-averaged F1 score and returns
    ``(mean batch loss, macro F1)``, where the mean is the summed batch loss
    divided by ``len(dataloader)``.
    """
    model.eval()

    total_loss = 0
    predicted = []
    expected = []
    with torch.no_grad():
        for batch in dataloader:
            text, labels, lengths = (item.to(device) for item in batch)

            logits = model(text, lengths)
            total_loss += criterion(logits, labels.long()).item()

            # The predicted class is the index of the largest logit in each row.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    f1 = f1_score(expected, predicted, average='macro')
    print(f"模型在未见数据集上的f1分数: {f1}")
    return total_loss / len(dataloader), f1

In [None]:
# Select the GPU when available, seed PyTorch's RNG and wrap the training frame.
# NOTE(review): the next cell repeats these same four statements.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 123
torch.manual_seed(seed)
dataset = MyDataset(train_df)

In [None]:
# Device, seed and dataset (same four statements as the previous cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 123
torch.manual_seed(seed)
dataset = MyDataset(train_df)

# Model hyper-parameters; passed positionally to BiLSTM when the model is built.
INPUT_DIM = len(my_vocab)  # vocabulary size -> number of embedding rows
EMBEDDING_DIM = 100
HIDDEN_DIM = 256  # LSTM hidden size
OUTPUT_DIM = 14  # number of output logits
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
lr = 0.0003  # learning rate handed to the Adam optimizer

# One stratified split that holds out 10% of the samples, seeded for repeatability.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=seed)
epochs = 40  # upper bound on epochs; the training loop may stop earlier
batch_size = 32

In [None]:
# Build the model, wrap it in DataParallel and move it to the selected device.
model = BiLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model = nn.DataParallel(model)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Early-stopping state: `patience` counts consecutive epochs without an F1
# improvement; training stops once it reaches `stop_count`.
patience = 0
best_loss = float('inf')  # not read anywhere in this cell; stopping is driven by F1
best_f1 = float('0')
stop_count = 15
# sss is configured with n_splits=1, so this outer loop runs once.
for train_index, test_index in sss.split(dataset.texts, dataset.labels):
    train_loader = DataLoader(Subset(dataset, train_index), batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    test_loader = DataLoader(Subset(dataset, test_index), batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
    
    for epoch in range(epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        valid_loss, f1 = evaluate(model, test_loader, criterion, device)
        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}')

        # Early stopping on the held-out macro-F1: checkpoint on every improvement.
        if f1 > best_f1:
            best_f1 = f1
            patience = 0
            # Save the wrapped module's weights (model.module) so the state_dict
            # keys carry no DataParallel "module." prefix.
            torch.save(model.module.state_dict(),  f'BiLSTM_{f1:.3f}_{valid_loss:.3f}.pth')
        else:
            patience += 1
            if patience >= stop_count:
                print("early stop!")
                break

In [None]:
# Ask PyTorch to release the cached GPU memory it is no longer using.
torch.cuda.empty_cache()

In [1]:
import pandas as pd
from collections import Counter
from torchtext.vocab import vocab
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split, DataLoader, Dataset, Subset
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score
import os
import json

In [17]:
def tokenizer(text):
    """Split ``text`` into tokens on whitespace.

    The anonymised corpus is already space-delimited, so tokenising it is
    nothing more than ``str.split()``.
    """
    tokens = text.split()
    return tokens

def build_vocab(min_freq, train=None, token_path='/kaggle/input/newstextclassification/sorted_token.json'):
    """Build the torchtext vocabulary from cached token counts or from raw text.

    Args:
        min_freq: Minimum token count, forwarded to ``vocab``.
        train: Optional DataFrame (or mapping) whose ``'text'`` entry iterates
            over whitespace-delimited strings. Used only when ``token_path``
            does not exist.
        token_path: JSON file holding a token -> count mapping.

    Returns:
        The object returned by ``vocab``, created with the ``'<pad>'`` and
        ``'<unk>'`` specials and with ``'<unk>'`` as the default index, so
        out-of-vocabulary tokens map to ``'<unk>'``.

    Raises:
        FileNotFoundError: If ``token_path`` is missing and ``train`` was not
            supplied. Previously this case failed with an opaque ``TypeError``
            because ``train`` was hard-coded to ``None``.
    """
    if os.path.exists(token_path):
        # json.load preserves the key order stored in the file.
        with open(token_path, 'r', encoding='utf-8') as fp:
            sorted_tokens = json.load(fp)
    elif train is not None:
        # Count document by document to avoid joining the corpus into one string.
        counts = Counter()
        for text in train['text']:
            counts.update(tokenizer(text))
        # ``vocab`` takes a token -> frequency mapping in descending-frequency order.
        sorted_tokens = dict(counts.most_common())
    else:
        raise FileNotFoundError(
            f"token count file not found at {token_path!r} and no training data was supplied"
        )
    v = vocab(sorted_tokens, min_freq=min_freq, specials=['<pad>', '<unk>'])
    v.set_default_index(v['<unk>'])
    return v

min_freq = 5  # ignore tokens that occur fewer times than this
my_vocab = build_vocab(min_freq)

In [18]:
# Display the vocabulary size; the same value is used as INPUT_DIM for the model.
len(my_vocab)

6038

In [25]:
class TestDataset(Dataset):
    """Unlabelled text dataset backed by the ``text`` column of a DataFrame."""

    def __init__(self, dataframe):
        # Keep the raw strings; they are tokenised lazily on access.
        self.texts = dataframe['text'].tolist()

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the token ids of text ``idx``, looked up in the module-level ``my_vocab``."""
        tokens = tokenizer(self.texts[idx])
        return my_vocab(tokens)
    
def collate_batch(batch):
    """Collate unlabelled token-id lists into padded tensors.

    Returns:
        ``(padded_text, lengths)`` where ``padded_text`` is an int64 tensor
        padded per sample (batch-first) with the ``'<pad>'`` index of the
        module-level ``my_vocab`` and ``lengths`` is an int64 tensor of the
        unpadded sequence lengths.
    """
    sequences = [torch.tensor(ids, dtype=torch.int64) for ids in batch]
    # batch_first=True pads sample by sample, giving a (batch, max_len) tensor.
    padded = pad_sequence(sequences, padding_value=my_vocab["<pad>"], batch_first=True)
    lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.int64)
    return padded, lengths

In [20]:
class BiLSTM(nn.Module):
    """Embedding -> (bi)directional LSTM -> linear classifier for padded token batches."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """
        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            output_dim: Number of output logits.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Dropout probability applied to the embeddings, between
                LSTM layers, and to the final hidden state.
        """
        super(BiLSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # nn.LSTM applies dropout only between stacked layers and warns
            # when it is non-zero for a single-layer network.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )
        # The classifier input is one final hidden state per direction, so its
        # width must follow the ``bidirectional`` flag instead of assuming 2.
        # Parameter names and shapes are unchanged for bidirectional models, so
        # existing checkpoints still load.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits for a batch-first batch of padded token ids.

        Args:
            text: Integer tensor of token ids, one padded sequence per row.
            text_lengths: Unpadded length of each row; moved to CPU because
                ``pack_padded_sequence`` is called with CPU lengths.
        """
        embedded = self.dropout(self.embedding(text))

        # Packing makes the LSTM stop at each sequence's true end, so the final
        # hidden state is not polluted by padding positions.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is used; the per-step outputs are not
        # unpacked because nothing reads them.
        _, (hidden, _) = self.lstm(packed_embedded)

        if self.lstm.bidirectional:
            # Last two entries: forward and backward states of the top layer.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(final_hidden))

In [21]:
# Model hyper-parameters for inference; passed positionally to BiLSTM in test_a.
# NOTE(review): these presumably have to match the values used when the
# checkpoint was trained, otherwise load_state_dict will reject it — confirm.
INPUT_DIM = len(my_vocab)  # vocabulary size -> number of embedding rows
EMBEDDING_DIM = 100
HIDDEN_DIM = 256  # LSTM hidden size
OUTPUT_DIM = 14  # number of output logits
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5


epochs = 40  # not read by the inference cells visible below
batch_size = 32  # DataLoader batch size used in test_a

In [22]:
# Load the tab-separated test set; its 'text' column is what TestDataset reads.
test_df = pd.read_csv('/kaggle/input/newstextclassification/test_a.csv', sep='\t')
# Fall back to the CPU when CUDA is unavailable, matching the training cells,
# instead of hard-coding 'cuda' and failing later when the model is moved.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [28]:
def test_a(path, dataset):
    model = BiLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

    
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
    

    model.load_state_dict(torch.load(path))
    model = nn.DataParallel(model)

    model.to(device)
    model.eval()
    
    rest = []
    
    with torch.no_grad():
        for text, lengths in tqdm(dataloader, desc="Train", leave=False):

            text, lengths = text.to(device), lengths.to(device)

            predictions = model(text, lengths)

            correct = predictions.argmax(dim=1, keepdim=True).squeeze(1)
            rest.extend(correct.tolist())
    return rest

# Wrap the test frame and run inference with the checkpoint from the attached dataset.
dataset = TestDataset(test_df)
res = test_a('/kaggle/input/bimodel/BiLSTM_0.957_0.132.pth', dataset)

                                                          

In [29]:
# Write the predictions as a single-column CSV named 'label', without the index.
sub = pd.DataFrame({'label': res})
sub.to_csv('sub.csv', index=False)

In [30]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,label
0,1
1,2
2,8
3,5
4,0
