In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import BertTokenizer

In [2]:
# loading dataset files
train_df = pd.read_csv('train.tsv', delimiter='\t', header=None)
valid_df = pd.read_csv('valid.tsv', delimiter='\t', header=None)
test_df = pd.read_csv('test.tsv', delimiter='\t', header=None)

print("unique labels training data:", train_df[1].unique())

unique labels training data: ['false' 'half-true' 'mostly-true' 'true' 'barely-true' 'pants-fire']


In [3]:
# Map labels to integers
label_mapping = {
    'false': 0,
    'half-true': 0,
    'mostly-true': 1,
    'true': 1,
    'barely-true': 0,
    'pants-fire': 0
}

train_df[1] = train_df[1].map(label_mapping)
valid_df[1] = valid_df[1].map(label_mapping)
test_df[1] = test_df[1].map(label_mapping)

# dropping NANs just in case
train_df.dropna(subset=[1], inplace=True)
valid_df.dropna(subset=[1], inplace=True)
test_df.dropna(subset=[1], inplace=True)

# extract labels and text after cleaning data
train_texts = train_df[2].astype(str).values
train_labels = train_df[1].astype(int).values
valid_texts = valid_df[2].astype(str).values
valid_labels = valid_df[1].astype(int).values
test_texts = test_df[2].astype(str).values
test_labels = test_df[1].astype(int).values



In [4]:
# sequence padding and tokenising > after extracting texts and labels
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
valid_sequences = tokenizer.texts_to_sequences(valid_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

train_padded = pad_sequences(train_sequences, maxlen=100, padding='post', truncating='post')
valid_padded = pad_sequences(valid_sequences, maxlen=100, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=100, padding='post', truncating='post')


In [6]:
# Diagnostic: Check if any padded sequences are zeros
print("Zero-length train sequences:", np.sum(np.sum(train_padded, axis=1) == 0))
print("Zero-length valid sequences:", np.sum(np.sum(valid_padded, axis=1) == 0))
print("Zero-length test sequences:", np.sum(np.sum(test_padded, axis=1) == 0))


Zero-length train sequences: 0
Zero-length valid sequences: 0
Zero-length test sequences: 0


In [7]:
# defining CNNs model for training and validating
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=100),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           41088     
                                                                 
 max_pooling1d (MaxPooling1  (None, 19, 128)           0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 15, 128)           82048     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 24)                3

In [8]:
# train CNNs Model and evaluate it
history = model.fit(
    train_padded, train_labels,
    epochs=10,
    validation_data=(valid_padded, valid_labels),
    verbose=2
)

results = model.evaluate(test_padded, test_labels, verbose=2)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")


Epoch 1/10
320/320 - 12s - loss: 0.6437 - accuracy: 0.6439 - val_loss: 0.5995 - val_accuracy: 0.6729 - 12s/epoch - 38ms/step
Epoch 2/10
320/320 - 10s - loss: 0.5600 - accuracy: 0.7021 - val_loss: 0.6033 - val_accuracy: 0.6822 - 10s/epoch - 31ms/step
Epoch 3/10
320/320 - 10s - loss: 0.3710 - accuracy: 0.8374 - val_loss: 0.7670 - val_accuracy: 0.6488 - 10s/epoch - 32ms/step
Epoch 4/10
320/320 - 11s - loss: 0.1104 - accuracy: 0.9609 - val_loss: 1.2219 - val_accuracy: 0.6347 - 11s/epoch - 33ms/step
Epoch 5/10
320/320 - 10s - loss: 0.0223 - accuracy: 0.9940 - val_loss: 1.6718 - val_accuracy: 0.6145 - 10s/epoch - 32ms/step
Epoch 6/10
320/320 - 10s - loss: 0.0047 - accuracy: 0.9991 - val_loss: 1.9825 - val_accuracy: 0.6301 - 10s/epoch - 30ms/step
Epoch 7/10
320/320 - 10s - loss: 0.0035 - accuracy: 0.9996 - val_loss: 2.0265 - val_accuracy: 0.6301 - 10s/epoch - 31ms/step
Epoch 8/10
320/320 - 10s - loss: 0.0035 - accuracy: 0.9995 - val_loss: 2.1139 - val_accuracy: 0.6293 - 10s/epoch - 32ms/step
