In [115]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForTokenClassification, AdamW
import json
import numpy as np
from tqdm import tqdm


In [116]:
# Cell 2: Data loading with pandas
def load_data(filename):
    return pd.read_json(filename, lines=True)

train_data = load_data('train.jsonl')
dev_data = load_data('test1.jsonl')
dev_data = dev_data.rename(columns= {'senences': 'sentences'})

In [117]:
dev_data

Unnamed: 0,sentences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588
...,...,...
60,ОБСЕ назвала референдум о статусе Крыма незако...,644
61,Египетского студента могут выслать из страны з...,645
62,Геннадий Онищенко отправлен в отставку\nГеннад...,646
63,Племянник Алишера Усманова разбился в ДТП\nВид...,647


In [118]:
def get_label_dict(data):
    label_set = set()
    for annotations in data['ners']:
        for _, _, label in annotations:
            label_set.add(label)
    return {label: idx for idx, label in enumerate(label_set)}

label_dict = get_label_dict(train_data)
label_dict['O'] = 29
inverse_label_dict = {v: k for k, v in label_dict.items()}


In [119]:
def load_model(model_name, num_labels):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForTokenClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    return tokenizer, model

tokenizer, model = load_model('bert-base-multilingual-cased', len(label_dict))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [120]:
def prepare_data(data, tokenizer, label_dict=None, max_len=128, include_labels=True):
    input_ids = []
    attention_masks = []
    labels = []
    
    for _, row in data.iterrows():
        sentence = row['sentences']
        tokenized_info = tokenizer(sentence, max_length=max_len, padding='max_length', truncation=True)
        input_ids.append(tokenized_info['input_ids'])
        attention_masks.append(tokenized_info['attention_mask'])
        
        if include_labels and 'ners' in row:
            label_array = [label_dict['O']] * max_len  # Initialize with 'O' for non-entity
            for start, end, label in row['ners']:
                for i in range(start, min(end, max_len)):
                    label_array[i] = label_dict[label]
            labels.append(label_array)
    
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    if include_labels:
        labels = torch.tensor(labels)
        return TensorDataset(input_ids, attention_masks, labels)
    return TensorDataset(input_ids, attention_masks)

train_dataset = prepare_data(train_data, tokenizer, label_dict)
dev_dataset = prepare_data(dev_data, tokenizer, include_labels=False)

In [121]:
batch_size = 32
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)


In [122]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.classifier.parameters(), lr=5e-5)

def train_and_evaluate(model, train_dataloader, dev_dataloader, optimizer, device):
    model.train()
    for epoch in range(1):  # Number of epochs
        for batch in tqdm(train_dataloader, desc="Training"):
            batch = tuple(t.to(device) for t in batch)
            inputs, masks, labels = batch
            model.zero_grad()
            outputs = model(inputs, token_type_ids=None, attention_mask=masks, labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()

train_and_evaluate(model, train_dataloader, dev_dataloader, optimizer, device)

Training:   0%|          | 0/17 [00:00<?, ?it/s]

Training: 100%|██████████| 17/17 [02:29<00:00,  8.82s/it]


In [123]:
label_dict

{'FACILITY': 0,
 'LANGUAGE': 1,
 'PERCENT': 2,
 'PERSON': 3,
 'FAMILY': 4,
 'DISTRICT': 5,
 'LOCATION': 6,
 'EVENT': 7,
 'PROFESSION': 8,
 'MONEY': 9,
 'ORDINAL': 10,
 'IDEOLOGY': 11,
 'CRIME': 12,
 'STATE_OR_PROVINCE': 13,
 'WORK_OF_ART': 14,
 'AGE': 15,
 'AWARD': 16,
 'DATE': 17,
 'PENALTY': 18,
 'NATIONALITY': 19,
 'NUMBER': 20,
 'TIME': 21,
 'DISEASE': 22,
 'LAW': 23,
 'RELIGION': 24,
 'PRODUCT': 25,
 'ORGANIZATION': 26,
 'COUNTRY': 27,
 'CITY': 28,
 'O': 29}

In [124]:
def generate_predictions(model, dataloader, data_ids):
    model.eval()
    predictions = []
    for i, batch in enumerate(tqdm(dataloader, desc="Predicting")):
        inputs, masks = batch[0].to(device), batch[1].to(device)
        with torch.no_grad():
            logits = model(inputs, token_type_ids=None, attention_mask=masks).logits
        preds = torch.argmax(logits, dim=2)
        batch_predictions = preds.cpu().numpy().tolist()
        
        # Iterate over batch predictions and associate each with its corresponding data ID
        for j, pred in enumerate(batch_predictions):
            ners = [[0, 0, inverse_label_dict[p]] for p in pred if p != label_dict["O"]]
            predictions.append({"id": data_ids[i * len(batch) + j], "ners": ners})
    
    return predictions

# Get ids from dev_data to pass to the prediction function
dev_ids = dev_data['id'].tolist()
predictions = generate_predictions(model, dev_dataloader, dev_ids)


Predicting:   0%|          | 0/3 [00:00<?, ?it/s]

Predicting: 100%|██████████| 3/3 [00:05<00:00,  1.71s/it]


In [125]:
with open('test.jsonl', 'w') as file:
    for prediction in predictions:
        file.write(json.dumps(prediction) + "\n")

# Zip the file for submission
!zip test.zip test.jsonl

19354.97s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
updating: test.jsonl (deflated 94%)
