In [1]:
!pip install torchtext==0.6



In [2]:
from torchtext.data import Field, TabularDataset, BucketIterator,LabelField
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe

In [3]:
# Essential libraries
import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [4]:
print(torch.__version__)

2.0.1+cu118


In [5]:
import pandas as pd
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the original dataset
df = pd.read_csv('missom_annotated.csv')
df = df.drop(columns=['post_id','how_annotated'])

empty_cells =  df.isnull().sum()
print(empty_cells)
df.dropna(inplace = True)



text                      0
label_minority_coping     0
label_prej_event          0
label_exp_reject          0
label_identity_conceal    0
label_internal_stigma     0
label_dysphoria           0
label_minority_stress     0
dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

import random
import numpy as np
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train_ratio = 0.70
valid_ratio = 0.15
test_ratio = 0.15

train_df, remaining_df = train_test_split(df, test_size=1 - train_ratio, random_state=random.seed(SEED))
valid_df, test_df = train_test_split(remaining_df, test_size=test_ratio/(valid_ratio + test_ratio), random_state=random.seed(SEED))

# Save the dataframes to CSV files
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
valid_df.to_csv('valid.csv', index=False)

In [7]:
# Labels in the dataset
label_fields = [
    'label_minority_coping',
    'label_prej_event',
    'label_exp_reject',
    'label_identity_conceal',
    'label_internal_stigma',
    'label_dysphoria',
    'label_minority_stress'
]

# Create label fields for each label in multi-label classification
label_fields_objs = [LabelField(dtype=torch.float) for _ in label_fields]

In [8]:
# Define your own preprocessing steps, e.g., tokenization, lowercase, etc.
TEXT = Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
LABEL = LabelField(sequential=False, dtype=torch.float)

# Load the data
train_data,valid_data, test_data = TabularDataset.splits(
    path= r'/content/',
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='csv',
    fields=[('text', TEXT)] + [(label, label_field) for label, label_field in zip(label_fields, label_fields_objs)],
    skip_header=True
)

print('Number of training examples: {}'.format(len(train_data)))
print('Number of Validation examples: {}'.format(len(valid_data)))
print('Number of testing examples: {}'.format(len(test_data)))
print(train_data[0])

Number of training examples: 4052
Number of Validation examples: 868
Number of testing examples: 869
<torchtext.data.example.Example object at 0x7b265cca93c0>


In [9]:
print(vars(train_data.examples[0]))

{'text': ['Ok', 'so', ',', 'I', 'have', 'n’t', 'really', '’', 'liked', '’', 'anyone', '.', 'At', 'most', 'it', 'was', 'just', 'a', 'mutual', 'friendship', 'that', 'we', 'thought', 'might', 'go', 'somewhere', 'if', 'we', 'dated', '.', 'It', 'does', 'n’t', '.', 'The', 'most', 'I', '’ve', 'had', 'was', 'a', 'slight', 'crush', 'on', 'my', 'best', 'friend', 'in', '4th', 'grade', '.', 'But', 'I', 'never', 'told', 'her', '.', 'I', '’ve', 'struggled', 'with', 'my', 'appearance', 'for', 'a', 'while', 'now', ',', 'never', 'showing', 'my', 'face', 'to', 'anyone', 'online', ',', 'or', 'my', 'voice', 'for', 'that', 'matter', '.', 'And', 'hiding', 'myself', 'in', 'bulky', 'hoodies', '.', 'I', '’ve', 'just', 'recently', 'started', 'to', 'think', 'I', 'look', 'pretty', '.', 'I', 'really', 'want', 'a', 'gf', '/', 'bf', ',', 'but', 'no', 'matter', 'the', 'gender', ',', 'no', 'one', 'my', 'age', 'seems', 'attractive', 'to', 'me', '.', 'I', 'think', 'some', 'of', 'them', 'are', '*', 'pretty', '*', 'in', '

In [10]:

TEXT.build_vocab(train_data, min_freq=1)

# Build vocab for label fields
for label_field in label_fields_objs:
    label_field.build_vocab(train_data)

In [11]:
batch_size = 32

#For GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    device = device,sort_key=lambda x: len(x.text),sort_within_batch = False)

In [12]:
import torch.nn as nn
class recurrent_nn(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text): #used when we feed examples.

        #text = [sent len, batch size]
        embedded = self.embedding(text)


        output, hidden = self.rnn(embedded)

        assert torch.equal(output[-1,:,:], hidden.squeeze(0))

        return self.fc(hidden.squeeze(0))

In [13]:
input_dim = len(TEXT.vocab)
embedding_dim = 100

hidden_dim = 256
output_dim = OUTPUT_DIM = len(label_fields)

model = recurrent_nn(input_dim, embedding_dim, hidden_dim, output_dim)

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,293,247 trainable parameters


In [15]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr = 1e-3)
loss_func = nn.BCEWithLogitsLoss()
model = model.to(device)
loss_func = loss_func.to(device)

In [16]:
def multi_label_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / (len(label_fields) * len(correct))
    return acc

def train(model, iterator, optimizer, loss_func):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text)
        sigmoid_preds = torch.sigmoid(predictions)
        for i, label_field in enumerate(label_fields_objs):
                label_name = label_fields[i]
                output_1 = (sigmoid_preds[:, i] > 0.5).int()

                loss = loss_func(predictions[:, i], getattr(batch, label_name))
                acc = multi_label_accuracy(predictions[:, i], getattr(batch, label_name))

                loss.backward(retain_graph=True)

                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text)

            sigmoid_preds = torch.sigmoid(predictions)

            for i, label_field in enumerate(label_fields_objs):
                label_name = label_fields[i]
                output_1 = (sigmoid_preds[:, i] > 0.5).int()

                loss = criterion(predictions[:, i], getattr(batch, label_name))
                acc = multi_label_accuracy(predictions[:, i], getattr(batch, label_name))

                epoch_loss += loss.item()
                epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [17]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time/60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    return elapsed_mins, elapsed_secs

In [18]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, loss_func)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_func)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnn-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 2.472 | Train Acc: 88.34%
	 Val. Loss: 2.589 |  Val. Acc: 87.71%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 2.242 | Train Acc: 89.50%
	 Val. Loss: 2.564 |  Val. Acc: 88.22%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 2.229 | Train Acc: 89.55%
	 Val. Loss: 2.555 |  Val. Acc: 88.30%
Epoch: 04 | Epoch Time: 0m 7s
	Train Loss: 2.230 | Train Acc: 89.64%
	 Val. Loss: 2.538 |  Val. Acc: 88.38%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 2.237 | Train Acc: 89.54%
	 Val. Loss: 2.528 |  Val. Acc: 88.49%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 2.235 | Train Acc: 89.59%
	 Val. Loss: 2.526 |  Val. Acc: 88.49%
Epoch: 07 | Epoch Time: 0m 10s
	Train Loss: 2.233 | Train Acc: 89.60%
	 Val. Loss: 2.508 |  Val. Acc: 88.54%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 2.231 | Train Acc: 89.63%
	 Val. Loss: 2.502 |  Val. Acc: 88.60%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 2.218 | Train Acc: 89.66%
	 Val. Loss: 2.498 |  Val. Acc: 88.62%
Epoch: 10 | Epoch Time: 0m 

In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
def calc_report(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    y_pred = [[] for _ in range(len(label_fields))]
    y_true = [[] for _ in range(len(label_fields))]
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            predictions = model(text)
            sigmoid_preds = torch.sigmoid(predictions)
            for i, label_field in enumerate(label_fields_objs):
                label_name = label_fields[i]
                output_1 = (sigmoid_preds[:, i] > 0.5).int()
                y_pred[i].extend(output_1.tolist())
                y_true[i].extend(getattr(batch, label_name).tolist())
                loss = criterion(predictions[:, i], getattr(batch, label_name))
                acc = multi_label_accuracy(predictions[:, i], getattr(batch, label_name))
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        all_reports = []
        for i, label_field in enumerate(label_fields_objs):
            rep = classification_report(y_true[i], y_pred[i], labels=[1, 0], digits=2)
            print('_________'*8)
            print(label_fields[i])
            print(rep)



In [20]:
calc_report(model, test_iterator, loss_func)

________________________________________________________________________
label_minority_coping
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       112
           0       0.87      0.99      0.93       757

    accuracy                           0.87       869
   macro avg       0.44      0.50      0.46       869
weighted avg       0.76      0.87      0.81       869

________________________________________________________________________
label_prej_event
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        74
           0       0.91      1.00      0.95       795

    accuracy                           0.91       869
   macro avg       0.46      0.50      0.48       869
weighted avg       0.84      0.91      0.87       869

________________________________________________________________________
label_exp_reject
              precision    recall  f1-score   support

           1    