# NLP: Sentiment Analysis
Choosing and training a model to perform sentiment analysis on catalan text.

In [187]:
import pandas as pd
import numpy as np

from datasets import Dataset, concatenate_datasets

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [188]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

## The data

Check out `data-preprocessing.ipynb` for details about how this initial dataset was chosen, and how the subset dataset was labeled.

In [189]:
comments = pd.read_csv("comments.csv")

In [190]:
labeled = pd.read_csv("comments_subset_labeled.tsv", sep="\t")

### Defined the labeled dataset

In [191]:
comments_labeled = comments.join(labeled.set_index('comment_id'), on='comment_id', rsuffix='labeled')

In [192]:
comments_labeled = comments_labeled[~comments_labeled['label'].isnull()]

In [193]:
labels = {'positive':       0,
          'neutral':        1,
          'negative':       2,
          'very-negative':  3}

In [194]:
comments_labeled['labels'] = comments_labeled['label'].apply(lambda text_label: labels[text_label])

In [195]:
comments_labeled = comments_labeled[['input_text', 'labels']]

In [196]:
comments_labeled.head()

Unnamed: 0,input_text,labels
5,[ARTICLE TITLE] Pas endavant del conveni del C...,2
9,[ARTICLE TITLE] Espot confia en el suport d’Es...,3
15,[ARTICLE TITLE] Més de 174.000 euros per a un ...,3
22,[ARTICLE TITLE] Més de 174.000 euros per a un ...,2
25,[ARTICLE TITLE] Més de 174.000 euros per a un ...,2


In [197]:
comments_labeled['labels'].value_counts()

labels
2    114
1     59
3     53
0     24
Name: count, dtype: int64

In [198]:
num_custom_labels = 4

### Define the unlabeled dataset

In [199]:
comments['labels'] = np.nan

In [200]:
len(comments)

2413

In [201]:
comments_unlabeled = comments[~comments['comment_id'].isin(labeled['comment_id'])][['input_text']]

In [202]:
len(comments_unlabeled)

2163

In [203]:
comments_unlabeled.head()

Unnamed: 0,input_text
0,[ARTICLE TITLE] SDP advoca per un espai transv...
1,[ARTICLE TITLE] SDP advoca per un espai transv...
2,[ARTICLE TITLE] Acord en cercar “consens” i “c...
3,[ARTICLE TITLE] Acord en cercar “consens” i “c...
4,[ARTICLE TITLE] Acord en cercar “consens” i “c...


## Prepare the dataset for the model

### Splitting into training and testing

In [204]:
train, val = train_test_split(
    comments_labeled,
    test_size=0.2,
    random_state=42,
    stratify=comments_labeled['labels']  # Ensures proportional class distribution
)

In [205]:
print(f"Length of training dataset: {len(train)}\nLength of validation dataset: {len(val)}")
print(f"Total length of dataset: {len(train) + len(val)}")

Length of training dataset: 200
Length of validation dataset: 50
Total length of dataset: 250


## Fine-tuning

We'll get the Bert model and Tokenizer from the transformers package.

In [206]:
mbert = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(mbert)
model = BertForSequenceClassification.from_pretrained(mbert, num_labels=num_custom_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining the datasets in the correct HuggingFace format

In [207]:
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)

### Tokenize the text

In [208]:
def tokenize(record):
    return tokenizer(record['input_text'], padding='max_length', truncation=True, max_length=128)

In [209]:
train_dataset

Dataset({
    features: ['input_text', 'labels', '__index_level_0__'],
    num_rows: 200
})

In [210]:
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 200/200 [00:00<00:00, 4165.11 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3414.78 examples/s]


### Rename the columns

### Set format for PyTorch

In [211]:
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [212]:
train_dataset

Dataset({
    features: ['input_text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

### Define training arguments

In [213]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,  # Limit the total amount of checkpoints
    seed=42
)



### Define metrics

In [214]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

### Initialize the trainer

In [215]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [216]:
print(train_dataset['input_text'])

["[ARTICLE TITLE] El fons de reserva de jubilació guanya 8,4 milions a l'agost Hauríem de treure els fons de pensions dels mercats financers, els fons de pensions dels Estats Units, Europa i Àsia, estan profundament involucrats en pràcticament totes les formes de capitalisme financer depredador que afecten el planeta.", '[ARTICLE TITLE] Cultura destina 46.000 euros a la campanya per difondre la llei del català La llei del català? Si per votar si o no al referèndum el que cal saber és anglès!', '[ARTICLE TITLE] L’Elisi assegura que l’acord d’associació “no comportarà un tsunami d’immigració” Andorra amb les seves particularitats ha arribat on és per la seva personalitat , propia i única, sols sense ningú, no deixem de perdre la nostra identitat, fem acords de col.laboració, portem les coses al nostre terreny, no ens venguem a Europa,haurem de fer els que ens impossin i manem nosaltres, reflexioneu!!!!', '[ARTICLE TITLE] El català fa tombar els concursos per trobar sotsoficial i caporals

### Training the model !

In [217]:
trainer.train()


 80%|████████  | 56/70 [00:59<00:09,  1.54it/s]

{'loss': 1.3991, 'grad_norm': 27.554386138916016, 'learning_rate': 1e-05, 'epoch': 0.77}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [01:02<00:09,  1.54it/s]
[A

{'eval_loss': 1.3047295808792114, 'eval_accuracy': 0.46, 'eval_runtime': 0.6158, 'eval_samples_per_second': 81.193, 'eval_steps_per_second': 1.624, 'epoch': 1.0}



 80%|████████  | 56/70 [01:10<00:09,  1.54it/s]

{'loss': 1.2975, 'grad_norm': 4.896843433380127, 'learning_rate': 2e-05, 'epoch': 1.54}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [01:15<00:09,  1.54it/s]
[A

{'eval_loss': 1.213774561882019, 'eval_accuracy': 0.52, 'eval_runtime': 0.6158, 'eval_samples_per_second': 81.196, 'eval_steps_per_second': 1.624, 'epoch': 2.0}



 80%|████████  | 56/70 [01:21<00:09,  1.54it/s]

{'loss': 1.2357, 'grad_norm': 3.9402825832366943, 'learning_rate': 3e-05, 'epoch': 2.31}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [01:28<00:09,  1.54it/s]
[A

{'eval_loss': 1.1799027919769287, 'eval_accuracy': 0.48, 'eval_runtime': 0.6192, 'eval_samples_per_second': 80.749, 'eval_steps_per_second': 1.615, 'epoch': 3.0}



 80%|████████  | 56/70 [01:31<00:09,  1.54it/s]

{'loss': 1.1814, 'grad_norm': 7.534204483032227, 'learning_rate': 4e-05, 'epoch': 3.08}



 80%|████████  | 56/70 [01:39<00:09,  1.54it/s]

{'loss': 1.1062, 'grad_norm': 6.692944049835205, 'learning_rate': 5e-05, 'epoch': 3.85}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [01:41<00:09,  1.54it/s]
[A

{'eval_loss': 1.2643542289733887, 'eval_accuracy': 0.34, 'eval_runtime': 0.6234, 'eval_samples_per_second': 80.199, 'eval_steps_per_second': 1.604, 'epoch': 4.0}



 80%|████████  | 56/70 [01:50<00:09,  1.54it/s]

{'loss': 0.9785, 'grad_norm': 5.559157371520996, 'learning_rate': 1.6666666666666667e-05, 'epoch': 4.62}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [01:56<00:09,  1.54it/s]
[A

{'eval_loss': 1.245222568511963, 'eval_accuracy': 0.46, 'eval_runtime': 0.6643, 'eval_samples_per_second': 75.266, 'eval_steps_per_second': 1.505, 'epoch': 5.0}



100%|██████████| 65/65 [01:08<00:00,  1.05s/it]

{'train_runtime': 68.1226, 'train_samples_per_second': 14.679, 'train_steps_per_second': 0.954, 'train_loss': 1.1678079971900353, 'epoch': 5.0}





TrainOutput(global_step=65, training_loss=1.1678079971900353, metrics={'train_runtime': 68.1226, 'train_samples_per_second': 14.679, 'train_steps_per_second': 0.954, 'total_flos': 65778945024000.0, 'train_loss': 1.1678079971900353, 'epoch': 5.0})

### Save the trained model

In [218]:
trainer.save_model('./fine-tuned-mbert-sentiment')

# Pseudo-labeling on Unlabeled Data

In [219]:
len(comments_unlabeled)

2163

### Define the Dataset

In [220]:
unlabeled_dataset = Dataset.from_pandas(comments_unlabeled)

### Tokenize

In [221]:
unlabeled_dataset = unlabeled_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 2163/2163 [00:00<00:00, 4070.27 examples/s]


### Define torch format

In [222]:
unlabeled_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

### Predict the labels for the unlabeled data

In [223]:
predictions = trainer.predict(unlabeled_dataset)
probs = torch.nn.functional.softmax(torch.tensor(predictions.predictions), dim=1)
max_probs, pseudo_labels = torch.max(probs, dim=1)

100%|██████████| 34/34 [00:28<00:00,  1.18it/s]


In [224]:
probs = max_probs.numpy()
pseudo_labels = pseudo_labels.numpy()

### Define confidence threshold

In [235]:
confidence_threshold = 0.6
confident_indices = np.where(probs > confidence_threshold)[0]

In [236]:
print(f"Confident pseudo-labeled samples: {len(confident_indices)}")

Confident pseudo-labeled samples: 23


### Select confident pseudo-labeled data

In [227]:
pseudo_labeled_dataset = unlabeled_dataset.select(confident_indices)
pseudo_labeled_dataset = pseudo_labeled_dataset.add_column('labels', pseudo_labels[confident_indices])

Flattening the indices: 100%|██████████| 23/23 [00:00<00:00, 3760.68 examples/s]


# Combining labeled and pseudo-labeled dataset

In [228]:
expanded_train_dataset = concatenate_datasets([train_dataset, pseudo_labeled_dataset])

In [229]:
print(f"Expanded training samples: {len(expanded_train_dataset)}")

Expanded training samples: 223


# Fine tune on expanded dataset

In [230]:
trainer_expanded = Trainer(
    model=model,
    args=training_args,
    train_dataset=expanded_train_dataset,
    eval_dataset=val_dataset,  # Use the same validation set
    compute_metrics=compute_metrics,
)

In [231]:
trainer_expanded.train()


 80%|████████  | 56/70 [02:39<00:09,  1.54it/s]

{'loss': 1.1164, 'grad_norm': 4.487930774688721, 'learning_rate': 1e-05, 'epoch': 0.71}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [02:44<00:09,  1.54it/s]
[A

{'eval_loss': 1.2011468410491943, 'eval_accuracy': 0.48, 'eval_runtime': 0.7232, 'eval_samples_per_second': 69.136, 'eval_steps_per_second': 1.383, 'epoch': 1.0}



 80%|████████  | 56/70 [02:51<00:09,  1.54it/s]

{'loss': 1.0554, 'grad_norm': 7.591151237487793, 'learning_rate': 2e-05, 'epoch': 1.43}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [02:59<00:09,  1.54it/s]
[A

{'eval_loss': 1.2449299097061157, 'eval_accuracy': 0.38, 'eval_runtime': 0.6702, 'eval_samples_per_second': 74.609, 'eval_steps_per_second': 1.492, 'epoch': 2.0}



 80%|████████  | 56/70 [03:02<00:09,  1.54it/s]

{'loss': 1.0543, 'grad_norm': 6.536264419555664, 'learning_rate': 3e-05, 'epoch': 2.14}



 80%|████████  | 56/70 [03:11<00:09,  1.54it/s]

{'loss': 0.983, 'grad_norm': 17.37132453918457, 'learning_rate': 4e-05, 'epoch': 2.86}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [03:13<00:09,  1.54it/s]
[A

{'eval_loss': 1.3240058422088623, 'eval_accuracy': 0.42, 'eval_runtime': 0.6691, 'eval_samples_per_second': 74.722, 'eval_steps_per_second': 1.494, 'epoch': 3.0}



 80%|████████  | 56/70 [03:22<00:09,  1.54it/s]

{'loss': 0.8187, 'grad_norm': 15.7627534866333, 'learning_rate': 5e-05, 'epoch': 3.57}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [03:28<00:09,  1.54it/s]
[A

{'eval_loss': 1.5696197748184204, 'eval_accuracy': 0.42, 'eval_runtime': 0.6794, 'eval_samples_per_second': 73.595, 'eval_steps_per_second': 1.472, 'epoch': 4.0}



 80%|████████  | 56/70 [03:33<00:09,  1.54it/s]

{'loss': 0.9785, 'grad_norm': 16.168893814086914, 'learning_rate': 2.5e-05, 'epoch': 4.29}



 80%|████████  | 56/70 [03:42<00:09,  1.54it/s]

{'loss': 0.6826, 'grad_norm': 11.880638122558594, 'learning_rate': 0.0, 'epoch': 5.0}



[A

[A[A                               
                                               
 80%|████████  | 56/70 [03:45<00:09,  1.54it/s]
[A

{'eval_loss': 1.238420009613037, 'eval_accuracy': 0.44, 'eval_runtime': 0.6908, 'eval_samples_per_second': 72.377, 'eval_steps_per_second': 1.448, 'epoch': 5.0}



100%|██████████| 70/70 [01:17<00:00,  1.11s/it]

{'train_runtime': 77.3608, 'train_samples_per_second': 14.413, 'train_steps_per_second': 0.905, 'train_loss': 0.9555482864379883, 'epoch': 5.0}





TrainOutput(global_step=70, training_loss=0.9555482864379883, metrics={'train_runtime': 77.3608, 'train_samples_per_second': 14.413, 'train_steps_per_second': 0.905, 'total_flos': 73343523701760.0, 'train_loss': 0.9555482864379883, 'epoch': 5.0})

In [232]:
trainer_expanded.save_model('./fine-tuned-mbert-sentiment-expanded')