In [3]:
import pandas as pd
import numpy as np
import os
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime

In [4]:
# column info
# data from https://www.kaggle.com/datasets/doanquanvietnamca/liar-dataset?resource=download
'''
Column 1: the ID of the statement ([ID].json).
Column 2: the label.
Column 3: the statement.
Column 4: the subject(s).
Column 5: the speaker.
Column 6: the speaker's job title.
Column 7: the state info.
Column 8: the party affiliation.
Column 9-13: the total credit history count, including the current statement.
9: barely true counts.
10: false counts.
11: half true counts.
12: mostly true counts.
13: pants on fire counts.
Column 14: the context (venue / location of the speech or statement).
'''

"\nColumn 1: the ID of the statement ([ID].json).\nColumn 2: the label.\nColumn 3: the statement.\nColumn 4: the subject(s).\nColumn 5: the speaker.\nColumn 6: the speaker's job title.\nColumn 7: the state info.\nColumn 8: the party affiliation.\nColumn 9-13: the total credit history count, including the current statement.\n9: barely true counts.\n10: false counts.\n11: half true counts.\n12: mostly true counts.\n13: pants on fire counts.\nColumn 14: the context (venue / location of the speech or statement).\n"

In [5]:
train = pd.read_csv("train_cleaned.csv").drop("Unnamed: 0",axis=1)
test = pd.read_csv("test_cleaned.csv").drop("Unnamed: 0",axis=1)
val = pd.read_csv("val_cleaned.csv").drop("Unnamed: 0",axis=1)
all_data = pd.concat([train, test, val], ignore_index=True)

In [28]:
# remove rows where statement < 3 words
def remove_short_and_long_statements(df):
    df["len"] = df["statement"].str.split().str.len()
    df = df[df["len"] >= 3]
    df = df[df["len"] <= 100]
    df = df.drop("len", axis = 1)
    return df

In [7]:
train = remove_short_and_long_statements(train)
test = remove_short_and_long_statements(test)
val = remove_short_and_long_statements(val)

# Method 1

In [8]:
def add_text_features(df):
    new_statements = [ ]
    for index, row in df.iterrows():
        speaker = " ".join(row["speaker"].split("-"))
        new_text = f'I am {speaker}, a {row["speaker_job"]} and a {row["party"]} from {row["state"]}. I am at {row["context"]}. '
        new_statements.append(new_text + row["statement"])
    
    df["statement"] = new_statements
    
    return df
        

In [9]:
train = add_text_features(train)
test = add_text_features(test)
val = add_text_features(val)

In [10]:
import tensorflow as tf
import numpy as np
import os
import tensorflow_hub as hub
import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optimizer
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
tf.get_logger().setLevel('ERROR')

In [12]:
label_to_val = {
        "pants-fire": 0,
        "false" : 1,
        "barely-true": 2,
        "half-true": 3,
        "mostly-true": 4,
        "true" : 5
}
train["label"] = train['label'].map(label_to_val)
test["label"] = test['label'].map(label_to_val)
val["label"] = val['label'].map(label_to_val)

In [13]:
from sklearn.model_selection import train_test_split

# load data
num_classes = len(train["label"].unique())

# train data
train = train[["statement", "label"]]
x_train = train["statement"]
y_train = tf.keras.utils.to_categorical(train["label"].values, num_classes=num_classes)

# test data
test = test[["statement", "label"]]
x_test = test["statement"]
y_test = tf.keras.utils.to_categorical(test["label"].values, num_classes=num_classes)

# validation data
val = val[["statement", "label"]]
x_val = val["statement"]
y_val = tf.keras.utils.to_categorical(val["label"].values, num_classes=num_classes)

In [14]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8' 
# bert_model_name = 'small_bert/bert_en_uncased_L-12_H-128_A-2'

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

In [15]:
preprocessor = hub.KerasLayer(tfhub_handle_preprocess)
encoder = hub.KerasLayer(tfhub_handle_encoder)

2024-11-14 13:57:49.090295: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
def get_embeddings(sentences):
    '''return BERT-like embeddings of input text
    Args:
     - sentences: list of strings
    Output:
      - BERT-like embeddings: tf.Tensor of shape=(len(sentences), 768)
    '''
    preprocessed_text = preprocessor(sentences)
    return encoder(preprocessed_text)['pooled_output']

In [17]:
# define a model as the preprocessor and encoder layers followed by a dropout and a dense layer with a 
# softmax activation function and an output space dimensionality equal to the number of classes we want to predict
num_classes = 6
i = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
x = preprocessor(i)
x = encoder(x)
x = tf.keras.layers.Dropout(0.1, name="dropout")(x['pooled_output']) # 0.2
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(x)

model = tf.keras.Model(i, x)

In [18]:
# compile and fit model
# if the metric does not improve for at least 3 epochs (patience), the training is interrupted and the weights 
# from the epoch where the validation loss showed the best value (i.e. lowest) are restored
n_epochs = 20

METRICS = [
      tf.keras.metrics.CategoricalAccuracy(name="accuracy"), 
      tf.keras.metrics.CategoricalCrossentropy(name="cross_entropy")
]

earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", 
                                                      patience = 3,
                                                      restore_best_weights = True)

model.compile(optimizer = "adam",
              loss = "categorical_crossentropy",
              metrics = METRICS)

model_fit = model.fit(x_train, 
                      y_train, 
                      epochs = n_epochs,
                      validation_data = (x_test, y_test),
                      callbacks = [earlystop_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [20]:
# run model on validation set
def predict_class(statements):
    '''predict class of input text
    '''
    return [np.argmax(pred) for pred in model.predict(statements)]

# predict_class(reviews)

In [21]:
# test set performance
from sklearn.metrics import classification_report

y_pred = predict_class(val["statement"])
print(classification_report(val["label"], y_pred))

print("MAE", mae(y_pred, val["label"]))

              precision    recall  f1-score   support

           0       0.57      0.17      0.26       116
           1       0.23      0.32      0.27       263
           2       0.21      0.19      0.20       237
           3       0.21      0.28      0.24       248
           4       0.31      0.29      0.30       251
           5       0.22      0.14      0.17       169

    accuracy                           0.24      1284
   macro avg       0.29      0.23      0.24      1284
weighted avg       0.27      0.24      0.24      1284



In [24]:
def mae(pred, actual):
    return np.sum(np.abs(pred - actual)) / len(pred)

In [25]:
mae(y_pred, val["label"])

1.3714953271028036

# Method 2

In [26]:
train = pd.read_csv("train_cleaned.csv").drop("Unnamed: 0",axis=1)
test = pd.read_csv("test_cleaned.csv").drop("Unnamed: 0",axis=1)
val = pd.read_csv("val_cleaned.csv").drop("Unnamed: 0",axis=1)

In [29]:
train = remove_short_and_long_statements(train)
test = remove_short_and_long_statements(test)
val = remove_short_and_long_statements(val)

In [30]:
def add_num_features(df):
    new_statements = [ ]
    for index, row in df.iterrows():
        speaker = " ".join(row["speaker"].split("-"))
        new_text = f'I am {speaker}, a {row["speaker_job"]} and a {row["party"]} from {row["state"]}. I am at {row["context"]}. I have said {int(row["barely_true_counts"])} barely true statements, {int(row["false_counts"])} statements, {int(row["half_true_counts"])} half true statements, {int(row["mostly_true_counts"])} mostly true statements, and {int(row["pants_on_fire_counts"])} completely false statements. '
        new_statements.append(new_text + row["statement"])
    df["statement"] = new_statements
    return df

In [31]:
train = add_num_features(train)
test = add_num_features(test)
val = add_num_features(val)

In [32]:
label_to_val = {
        "pants-fire": 0,
        "false" : 1,
        "barely-true": 2,
        "half-true": 3,
        "mostly-true": 4,
        "true" : 5
}
train["label"] = train['label'].map(label_to_val)
test["label"] = test['label'].map(label_to_val)
val["label"] = val['label'].map(label_to_val)

In [33]:
preprocessor = hub.KerasLayer(tfhub_handle_preprocess)
encoder = hub.KerasLayer(tfhub_handle_encoder)

In [34]:
# define a model as the preprocessor and encoder layers followed by a dropout and a dense layer with a 
# softmax activation function and an output space dimensionality equal to the number of classes we want to predict
num_classes = 6
i = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
x = preprocessor(i)
x = encoder(x)
x = tf.keras.layers.Dropout(0.2, name="dropout")(x['pooled_output']) # 0.2
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(x)

model = tf.keras.Model(i, x)

In [35]:
# compile and fit model
# if the metric does not improve for at least 3 epochs (patience), the training is interrupted and the weights 
# from the epoch where the validation loss showed the best value (i.e. lowest) are restored
n_epochs = 20

METRICS = [
      tf.keras.metrics.CategoricalAccuracy(name="accuracy"), 
      tf.keras.metrics.CategoricalCrossentropy(name="cross_entropy")
]

earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", 
                                                      patience = 3,
                                                      restore_best_weights = True)

model.compile(optimizer = "adam",
              loss = "categorical_crossentropy",
              metrics = METRICS)

model_fit = model.fit(x_train, 
                      y_train, 
                      epochs = n_epochs,
                      validation_data = (x_test, y_test),
                      callbacks = [earlystop_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [36]:
y_pred = predict_class(val["statement"])
print(classification_report(val["label"], y_pred))

print("MAE", mae(y_pred, val["label"]))

              precision    recall  f1-score   support

           0       0.35      0.23      0.28       116
           1       0.25      0.15      0.19       263
           2       0.16      0.03      0.06       237
           3       0.22      0.53      0.31       248
           4       0.28      0.31      0.30       251
           5       0.20      0.15      0.17       169

    accuracy                           0.24      1284
   macro avg       0.24      0.24      0.22      1284
weighted avg       0.24      0.24      0.22      1284

MAE 1.3870716510903427


# Method 3

In [221]:
train = pd.read_csv("train_cleaned.csv").drop("Unnamed: 0",axis=1)
test = pd.read_csv("test_cleaned.csv").drop("Unnamed: 0",axis=1)
val = pd.read_csv("val_cleaned.csv").drop("Unnamed: 0",axis=1)
all_data = pd.concat([train, test, val], ignore_index=True)

In [222]:
train = remove_short_and_long_statements(train)
test = remove_short_and_long_statements(test)
val = remove_short_and_long_statements(val)

In [223]:
train = add_text_features(train)
test = add_text_features(test)
val = add_text_features(val)

In [163]:
import torch
from transformers import BertTokenizer, BertModel

In [206]:
model_name = 'bert-base-uncased'  # You can choose other BERT models
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
tokens = tokenizer.encode_plus()

In [275]:
def get_embeddings(texts: list[str], batch_size: int =10) -> torch.tensor:
  """Returns mean vector for the last hidden layer from the Bert Model

  Args:
    texts: (list[str]) - List of sentences/paragraph
    batch_size: int - Number of sentences in a single batch

  Returns:
    torch.tensor -- embedding for each sentence/paragraph in texts
  """
  torch.cuda.empty_cache()

  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = BertModel.from_pretrained('bert-base-uncased')

  # determine number of buckets
  num_buckets = math.ceil(len(texts) // batch_size)

  # split data into buckets and generate embeddings
  vectors = []
  for bucket in np.array_split(texts, num_buckets):
    model = model.to('cuda') # Move the model to GPU
    with torch.no_grad():
      tokens = tokenizer(bucket.tolist(), return_tensors="pt", padding=True, truncation=True).to('cuda')
      embeddings = model(**tokens).last_hidden_state.mean(dim=1).detach().cpu()
      model = model.cpu()
    vectors.append(embeddings)

  return torch.concatenate(vectors)

In [None]:
def get_num_features(df, embeddings):
  num_cols = ["barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts",
              "pants_on_fire_counts"]
  num_features = torch.tensor(df[num_cols].astype("float32").values)

  return torch.cat((embeddings, num_features), dim=1)

In [None]:
train_bert = get_embeddings(train["statement"].to_list())
test_bert = get_embeddings(test["statement"].to_list())
val_bert = get_embeddings(val["statement"].to_list())

In [None]:
train_x = get_num_features(train, train_bert)
test_x = get_num_features(test, test_bert)
val_x = get_num_features(val, val_bert)

In [None]:
label_to_val = {
        "pants-fire": 0,
        "false" : 1,
        "barely-true": 2,
        "half-true": 3,
        "mostly-true": 4,
        "true" : 5
}
train["label"] = train['label'].map(label_to_val)
test["label"] = test['label'].map(label_to_val)
val["label"] = val['label'].map(label_to_val)

In [None]:
train_y = torch.tensor(train["label"].to_numpy())
test_y = torch.tensor(test["label"].to_numpy())
val_y = torch.tensor(val["label"].to_numpy())

In [None]:
def get_trainloaders(train, test, val, batch_size = 32):
    train_dataloader = torch.utils.data.DataLoader(
                train,
                batch_size=batch_size,
                shuffle=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
                val,
                batch_size=batch_size,
                shuffle=True,
    )
    test_dataloader = torch.utils.data.DataLoader(
                test,
                batch_size=batch_size,
                shuffle=True,
    )

    return train_dataloader, val_dataloader, test_dataloader

In [None]:
class Data(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = torch.reshape(y, (y.shape[0], 1))

    def __len__(self):
        # number of samples
        return self.X.shape[0]

    def __numfeatures__(self):
        # number of features/columns
        return self.X.shape[1]

    def __getitem__(self, index):
        # returns sample at index
        return self.X[index].float(), self.y[index].float()

In [None]:
train_dataset = Data(train_x, train_y)
test_dataset = Data(test_x, test_y)
val_dataset = Data(val_x, val_y)

train_dataloader, val_dataloader, test_dataloader = get_trainloaders(train, test, val, batch_size = 32)

In [None]:
from torch import nn

In [None]:
class MulticlassModel(nn.Module):
  def __init__(self, input_size, num_classes):
    super().__init__()

    self.X_size = input_size
    self.layers = nn.Sequential(
              nn.Linear(input_size, 64),
              nn.ReLU(),
              nn.Linear(64, 32),
              nn.ReLU(),
              nn.Linear(32, num_classes)
            )

  def forward(self, x):
    return self.layers(x)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = train_x.shape[1]
num_classes = 6
model = MulticlassModel(input_size, num_classes).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Fit the model
# torch.manual_seed(42)

def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100
    return acc

epochs = 400

train_x, train_y = train_x.to(device), train_y.to(device)
val_x, val_y = val_x.to(device), val_y.to(device)

for epoch in range(epochs):
    model.train()

    # 1. Forward pass
    y_logits = model(train_x) # model outputs raw logits
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # go from logits -> prediction probabilities -> prediction labels

    # 2. Calculate loss and accuracy
    loss = loss_fn(y_logits, train_y)
    acc = accuracy_fn(y_true=train_y,
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model.eval()
    with torch.inference_mode():
      # 1. Forward pass
      test_logits = model(val_x)
      test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)
      # 2. Calculate test loss and accuracy
      test_loss = loss_fn(test_logits, val_y)
      test_acc = accuracy_fn(y_true=val_y,
                             y_pred=test_pred)

    # Print out what's happening
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Acc: {acc:.2f}% | Test Loss: {test_loss:.5f}, Test Acc: {test_acc:.2f}%")

In [None]:
# Make predictions
test_x, test_y = test_x.to(device), test_y.to(device)

model.eval()
with torch.inference_mode():
    y_logits = model(test_x)

y_pred_probs = torch.softmax(y_logits, dim=1)

# Turn prediction probabilities into prediction labels
y_preds = y_pred_probs.argmax(dim=1)

# Compare first 10 model preds and test labels
print(f"Predictions: {y_preds[:10]}\nLabels: {test_y[:10]}")
print(f"Test accuracy: {accuracy_fn(y_true=test_y, y_pred=y_preds)}%")

In [None]:
def mae(pred, actual):
    return np.abs(pred.cpu() - actual.cpu()).sum().item() / len(pred)

In [None]:
mae(y_preds, test_y)