In [None]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
from google.colab import drive

# Locations of the training and test TSV files inside the mounted Drive.
train_tsv_path = "/content/drive/MyDrive/power-tr-train.tsv"
test_tsv_path = "/content/drive/MyDrive/power-tr-test.tsv"

# Mount Google Drive so the paths above are readable from this runtime.
drive.mount("/content/drive")

# Load TSV files into pandas DataFrames
def load_tsv(filepath):
    """
    Read a tab-separated file from disk into a pandas DataFrame.

    Args:
        filepath (str): Location of the TSV file to read.
    Returns:
        pd.DataFrame: The parsed table, read with pandas' default settings
            apart from the tab delimiter.
    """
    frame = pd.read_csv(filepath, delimiter="\t")
    return frame

# Read both splits from Drive (training first, then test).
train_data, test_data = load_tsv(train_tsv_path), load_tsv(test_tsv_path)

# Preview the first rows of each frame as a quick sanity check.
for heading, frame in (
    ("Training Data Sample:", train_data),
    ("\nTest Data Sample:", test_data),
):
    print(heading)
    print(frame.head())


Mounted at /content/drive
Training Data Sample:
        id                           speaker sex  \
0  tr18146  ca2031caa4032c51980160359953d507   M   
1  tr18147  4cee0addb3c69f6866869b180f90d45f   M   
2  tr18148  b3d7f76d74ec268492f8190ca123a6b2   M   
3  tr18149  722efac7138c8197a9d1e97eed3a8b18   M   
4  tr18150  fcc61122f3553c57ae207adeb1a1af84   M   

                                                text  \
0  Yeni yasama döneminin ülkemiz için, milletimiz...   
1  Sayın Başkan, değerli milletvekilleri; bugün, ...   
2  Sayın Başkanım, öncelikle yüce Meclisin Başkan...   
3  24’üncü Dönem Meclis Başkanlığına seçilmenizde...   
4  Usul tartışmasında 2 kişi lehte 2 kişi aleyhte...   

                                             text_en  label  
0  Mr. President, dear lawmakers, I salute you, a...      0  
1  Mr. President, members of lawmakers, as I spea...      0  
2  Mr. President, I'm here to share with you the ...      0  
3  Mr. President, under the principles determined...  

In [None]:
# Report per-column null counts for both frames.
print(f"Missing values in training data:\n{train_data.isna().sum()}")
print(f"\nMissing values in test data:\n{test_data.isna().sum()}")

# Keep only labelled rows, renumber them from zero, and store labels as ints.
train_data = (
    train_data
    .dropna(subset=["label"])
    .reset_index(drop=True)
    .astype({"label": int})
)

# Class balance of the training labels.
label_distribution = train_data["label"].value_counts()
print("\nLabel Distribution in Training Data:", label_distribution, sep="\n")

# Number of speeches per speaker; only the ten most frequent are shown.
speaker_distribution = train_data["speaker"].value_counts()
print("\nTop 10 Speakers by Number of Speeches:", speaker_distribution.head(10), sep="\n")

# Character length of every speech, stored as a new column on train_data,
# followed by its summary statistics.
train_data["text_length"] = train_data["text"].map(len)
text_length_stats = train_data["text_length"].describe()
print("\nText Length Statistics:", text_length_stats, sep="\n")


Missing values in training data:
id         0
speaker    0
sex        0
text       0
text_en    0
label      0
dtype: int64

Missing values in test data:
id         0
text       0
text_en    0
sex        0
dtype: int64

Label Distribution in Training Data:
label
1    8932
0    8452
Name: count, dtype: int64

Top 10 Speakers by Number of Speeches:
speaker
a71f9077d9a1f752c5dacedc685f635d    38
f10f82aed6a46d9359e3eeac1219fc4f    36
5f44812e833b6098ca80819dd3b2d393    36
dd2f23731c92b8f013a0a3254dc5c7e0    35
17cafa8a85e8cb4b16bfcab3b22d7344    33
469cfc5f4fb196946f22855dcbe1f480    29
9411ca9d3a016b53ca63489d72dd09bd    29
bb1bc131de452aede8c90daeccdfc84d    28
9a8d8dcf16b826416474e9895aed20bb    27
3bf0b2c7ad560c037bc1ad9f967e798f    27
Name: count, dtype: int64

Text Length Statistics:
count    17384.000000
mean      3254.661413
std       3041.760645
min        486.000000
25%        912.000000
50%       2511.000000
75%       4314.250000
max      19952.000000
Name: text_length, dtype: 

In [None]:
# Hold out 10% of the labelled rows for validation. The split is stratified
# on the label column and uses a fixed seed so the partition is repeatable.
# NOTE(review): rows are split individually, so speeches by the same speaker
# may end up on both sides of the split - confirm this is acceptable.
train_split, validation_split = train_test_split(
    train_data, test_size=0.1, random_state=42, stratify=train_data["label"]
)

# Report how many rows landed in each part.
print(f"Training set size: {len(train_split)}")
print(f"Validation set size: {len(validation_split)}")


Training set size: 15645
Validation set size: 1739


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
def preprocess_data(data, tokenizer, is_test=False, max_length=512):
    """
    Convert a pandas DataFrame into a tokenized Hugging Face Dataset.

    Args:
        data (pd.DataFrame): Frame with a 'text' column and, unless
            ``is_test`` is True, a 'label' column.
        tokenizer: Hugging Face tokenizer; it is called on batches of the
            'text' column.
        is_test (bool): If True, the data is treated as unlabeled and no
            'label' column is required or exposed.
        max_length (int): Length passed to the tokenizer for padding and
            truncation. Defaults to 512, the value that used to be
            hard-coded, so existing calls behave as before.
    Returns:
        Dataset: Tokenized dataset whose output format is set to torch for
            'input_ids', 'attention_mask' and (when not ``is_test``) 'label'.
    Raises:
        ValueError: If a required column is missing from ``data``.
    """
    # Fail fast: without this check a missing column only surfaces after the
    # whole frame has been converted (and possibly tokenized).
    required_columns = ["text"] if is_test else ["text", "label"]
    missing_columns = [name for name in required_columns if name not in data.columns]
    if missing_columns:
        raise ValueError(
            f"Input data is missing required column(s): {missing_columns}"
        )

    def tokenize_function(examples):
        # Pad/truncate every example to exactly `max_length` tokens.
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    dataset = Dataset.from_pandas(data)
    dataset = dataset.map(tokenize_function, batched=True)

    # Expose only the model inputs (plus labels for supervised splits).
    output_columns = ["input_ids", "attention_mask"]
    if not is_test:
        output_columns.append("label")
    dataset.set_format(type="torch", columns=output_columns)

    return dataset

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the labelled splits (train first, then validation) ...
train_dataset, validation_dataset = (
    preprocess_data(labelled_split, tokenizer)
    for labelled_split in (train_split, validation_split)
)
# ... and finally the unlabeled test frame.
test_dataset = preprocess_data(test_data, tokenizer, is_test=True)


Map:   0%|          | 0/15645 [00:00<?, ? examples/s]

Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

Map:   0%|          | 0/1990 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(pred):
    """
    Score a batch of model predictions against the reference labels.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax over axis 1 is
            taken as the predicted class).
    Returns:
        dict: Keys 'accuracy', 'precision', 'recall' and 'f1'; the last three
            are computed with ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(axis=1)

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    # The fourth returned element (support) is not reported; zip stops after
    # the three named metrics.
    prf_support = precision_recall_fscore_support(y_true, y_pred, average="binary")
    scores.update(zip(("precision", "recall", "f1"), prf_support))
    return scores


In [None]:
# Load the pretrained encoder named by `model_name` with a two-class
# classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Hyperparameters and bookkeeping options for the fine-tuning run.
# NOTE(review): no `metric_for_best_model` is given, so the "best" checkpoint
# restored at the end is presumably chosen by validation loss - confirm that
# this is the intended selection criterion.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_ratio=0,
    num_train_epochs=4,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=64,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging: every 50 steps, no external experiment tracker.
    logging_steps=50,
    report_to="none",
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Bundle the model, run configuration, tokenized splits and metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Launch fine-tuning; the returned training summary is shown as the cell output.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3913,0.364858,0.835538,0.93553,0.730425,0.820352
2,0.2806,0.344831,0.855664,0.945908,0.762864,0.844582
3,0.2143,0.304977,0.887867,0.930949,0.844519,0.88563
4,0.1938,0.32498,0.899367,0.914648,0.887025,0.900625


TrainOutput(global_step=3132, training_loss=0.2933504153637746, metrics={'train_runtime': 1996.5377, 'train_samples_per_second': 31.344, 'train_steps_per_second': 1.569, 'total_flos': 1.64654898444288e+16, 'train_loss': 0.2933504153637746, 'epoch': 4.0})

In [None]:
import json

# Score the model currently held by the trainer on its validation split.
results = trainer.evaluate()
print("Validation Results:", results)

# Persist the metrics dictionary as pretty-printed JSON.
with open("task2_text_xlm.json", "w") as metrics_file:
    metrics_file.write(json.dumps(results, indent=4))


Validation Results: {'eval_loss': 0.3049771189689636, 'eval_accuracy': 0.8878665899942496, 'eval_precision': 0.9309494451294698, 'eval_recall': 0.8445190156599552, 'eval_f1': 0.8856304985337243, 'eval_runtime': 14.5446, 'eval_samples_per_second': 119.563, 'eval_steps_per_second': 1.925, 'epoch': 4.0}


In [None]:
# Run inference on the tokenized test split.
test_predictions = trainer.predict(test_dataset)

# The highest-scoring class in each row becomes the predicted label, which is
# stored as a new column on the test frame.
predicted_labels = test_predictions.predictions.argmax(axis=1)
test_data["predictions"] = predicted_labels

# Export id, text and prediction (no index column) for submission.
test_data.to_csv(
    "power_task_predictions.csv",
    columns=["id", "text", "predictions"],
    index=False,
)
print("Predictions saved to 'power_task_predictions.csv'.")


Predictions saved to 'power_task_predictions.csv'.
