<a href="https://colab.research.google.com/github/electronic-carpet/lqbtq-project/blob/main/Copy_of_task1_tamil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import pandas as pd

# Folder that holds the shared-task CSV files. Keeping it in one constant means
# the notebook can be pointed at another location (e.g. a mounted Drive folder)
# without editing every read_csv call.
DATA_DIR = Path("/")

train_ta = pd.read_csv(DATA_DIR / "HT_Span_Tam_train.csv")  # tamil span train
test_ta = pd.read_csv(DATA_DIR / "CN_Tamil_Test_without_CN.csv")  # rows to predict spans for

# Fail fast if a file does not carry the columns the later cells read, instead
# of surfacing a KeyError halfway through tokenisation or submission writing.
missing_train_cols = {"text", "span"} - set(train_ta.columns)
assert not missing_train_cols, f"train CSV is missing columns: {sorted(missing_train_cols)}"
missing_test_cols = {"Id", "text"} - set(test_ta.columns)
assert not missing_test_cols, f"test CSV is missing columns: {sorted(missing_test_cols)}"

train_ta.head()


Unnamed: 0,Id,text,span,position,class
0,1,மகிழ்ச்சியே கொடு,[],[],None of the above
1,2,இந்த கேவலத்தை பேட்டி எடுக்குற கருமம்டா இந்த அச...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...","[4, 86]",Homophobia
2,3,திருட்டு நாய்கள் ஓரினச்சேர்க்கையில் ஈடுபடுபவர்...,"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18...","[0, 42]",Homophobia
3,4,இவர்களுக்கும் பொதுமக்களை போல் தண்டனை சட்டம் கொ...,[],[],None of the above
4,5,Venkat Raman போடா மெண்டல் 9 துங்ககிட்ட நீ அசி...,"13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,2...","[13, 120]",Transphobia


In [None]:
import ast

def parse_span(span_str):
    """Convert one raw ``span`` cell into a list of character indices.

    Parameters
    ----------
    span_str : str | list | tuple | object
        The cell value. In the CSV it is either ``"[]"`` or a comma-separated
        run of integers such as ``"4,5,6"``; it may also be an already-parsed
        list/tuple (when this function is applied a second time) or a missing
        value such as NaN.

    Returns
    -------
    list
        The indices as a list. Missing, malformed or non-sequence values give
        an empty list.
    """
    # Already parsed (e.g. the cell was re-run on a converted column): keep the
    # values. ast.literal_eval rejects list objects, which previously turned
    # every span into [] on a second run.
    if isinstance(span_str, (list, tuple)):
        return list(span_str)

    try:
        parsed = ast.literal_eval(span_str)
    except (ValueError, SyntaxError, TypeError):
        # NaN / None / free text: there is no usable span in this cell.
        return []

    # "4,5,6" evaluates to a tuple and "[]" to a list; normalise both to list
    # so every row has the same type.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # A lone index such as "7" evaluates to a bare int; wrap it so callers can
    # always iterate over the result.
    if isinstance(parsed, int) and not isinstance(parsed, bool):
        return [parsed]

    # Any other literal (string, dict, None, ...) is not a span.
    return []

# Replace each raw span cell with the value parse_span produces for it.
train_ta["span"] = train_ta["span"].map(parse_span)


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/mt5-small",
)

def tokenize_and_align(example, max_length=128):
    """Tokenise one example and derive a per-token span label.

    Parameters
    ----------
    example : mapping
        Must provide ``"text"`` (the string to tokenise) and ``"span"`` (an
        iterable of character indices, or ``None`` when there is no span).
    max_length : int, default 128
        Length every sequence is truncated/padded to.

    Returns
    -------
    The tokenizer output with ``"offset_mapping"`` removed and a ``"labels"``
    entry added. Each label is ``1`` when the token's character range contains
    at least one span index, ``0`` when it contains none, and ``-100`` for
    tokens whose offset range is empty (``start == end``).
    """
    text = example["text"]
    spans = example["span"]
    # Set membership replaces a scan over the whole span list for every token;
    # a missing span is treated as "no characters marked".
    span_chars = set(spans) if spans is not None else set()

    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_offsets_mapping=True,
    )

    labels = []
    for start, end in enc["offset_mapping"]:
        if start == end:
            # Zero-width offsets carry no characters of the text; -100 marks
            # them as positions without a 0/1 label.
            labels.append(-100)
        elif span_chars.isdisjoint(range(start, end)):
            labels.append(0)
        else:
            labels.append(1)

    enc["labels"] = labels
    # Offsets were only needed to compute the labels.
    enc.pop("offset_mapping")
    return enc


In [None]:
from datasets import Dataset

# Wrap the pandas frame in a Dataset and run tokenize_and_align over every row.
train_dataset = Dataset.from_pandas(train_ta).map(tokenize_and_align)

# Ask the dataset to return torch tensors when rows are read.
train_dataset.set_format("torch")


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

from transformers import set_seed

# Seed the random number generators BEFORE the model is built. The Trainer's
# own seed is only applied once the Trainer is constructed, i.e. after any
# weights that are not in the checkpoint have already been initialised
# (presumably the token-classification head — the load report for this
# seq2seq checkpoint lists only the unused decoder weights).
SEED = 42
set_seed(SEED)

# Two labels: 0 = token outside the span, 1 = token inside the span.
model = AutoModelForTokenClassification.from_pretrained(
    "google/mt5-small",
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir="./tamil_span",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="no",  # no checkpoints are written during training
    seed=SEED,  # same seed for shuffling / dropout inside the Trainer
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()


Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

MT5ForTokenClassification LOAD REPORT from: google/mt5-small
Key                                                                       | Status     | 
--------------------------------------------------------------------------+------------+-
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.0.SelfAttention.v.weight     | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.{0, 1, 2}.layer_norm.weight  | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.0.SelfAttention.o.weight     | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.2.DenseReluDense.wi_1.weight | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.1.EncDecAttention.v.weight   | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.2.DenseReluDense.wi_0.weight | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.1.EncDecAttention.k.weight   | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.layer.1.EncDecAttention.o.weight   | UNEXPECTED | 
decoder.block.{0, 1, 2, 3, 4, 5, 6, 7}.

Step,Training Loss
50,2.958562
100,2.494804
150,2.146623


TrainOutput(global_step=150, training_loss=2.5333296712239584, metrics={'train_runtime': 13.1378, 'train_samples_per_second': 91.339, 'train_steps_per_second': 11.417, 'total_flos': 17403584716800.0, 'train_loss': 2.5333296712239584, 'epoch': 3.0})

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Put the model's weights on that device; predict_span moves its inputs there too.
model.to(device)

def predict_span(text, max_length=128):
    """Predict which character indices of ``text`` belong to a span.

    Parameters
    ----------
    text : str
        Text to analyse. Empty or non-string values (e.g. NaN from a CSV)
        yield an empty result instead of being passed to the tokenizer.
    max_length : int, default 128
        Number of tokens the input is truncated/padded to. Characters that
        fall beyond the truncation point are never predicted.

    Returns
    -------
    list[int]
        Sorted, de-duplicated character indices covered by every token whose
        predicted label is 1.
    """
    if not isinstance(text, str) or not text:
        return []

    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_offsets_mapping=True,
    )

    # The offsets are only used to map tokens back to characters; they are
    # removed before the remaining tensors are passed to the model.
    offsets = enc.pop("offset_mapping")[0].tolist()
    enc = {k: v.to(device) for k, v in enc.items()}

    # torch.no_grad() only disables gradient tracking. Without eval() the
    # dropout layers stay in training mode and the same text can get
    # different predictions on every call.
    model.eval()
    with torch.no_grad():
        preds = model(**enc).logits.argmax(-1)[0].tolist()

    spans = set()
    for (start, end), p in zip(offsets, preds):
        # Zero-width offsets cover no characters, so they are skipped even
        # when the model labels them 1.
        if p == 1 and start != end:
            spans.update(range(start, end))

    return sorted(spans)


In [None]:
# One prediction per test text, in the same order as the rows of test_ta.
predicted = [predict_span(text) for text in test_ta["text"]]

# Pair every prediction with the Id of the row it came from.
# NOTE(review): each span is written as a Python list (e.g. "[4, 5, 6]"),
# whereas the training CSV stores spans as bare comma-separated indices
# ("4,5,6") — confirm which format the submission is expected to use.
submission = pd.DataFrame({"Id": test_ta["Id"], "span": predicted})

submission.to_csv("task1_tamil_submission1.csv", index=False)
print("Tamil Task-1 submission 1 saved")


Tamil Task-1 submission 1 saved
