In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np

In [4]:
# Loading and preparing data
train_df = pd.read_csv('/content/train.tsv', delimiter='\t', header=None)
valid_df = pd.read_csv('/content/valid.tsv', delimiter='\t', header=None)
test_df = pd.read_csv('/content/test.tsv', delimiter='\t', header=None)

# Create a universal set of labels from all datasets
all_labels = pd.concat([train_df[1], valid_df[1], test_df[1]])
unique_labels = sorted(all_labels.unique())
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Convert labels using the universal mapping
train_labels = to_categorical(train_df[1].map(label_mapping).values, num_classes=len(unique_labels))
valid_labels = to_categorical(valid_df[1].map(label_mapping).values, num_classes=len(unique_labels))
test_labels = to_categorical(test_df[1].map(label_mapping).values, num_classes=len(unique_labels))

# Print to verify
print("Label Mapping:", label_mapping)
print("Sample Encoded Train Labels:", train_labels[:5])

# Print shape of the labels arrays
print("Train Labels Shape:", train_labels.shape)

# Check the first few labels to ensure they contain exactly one '1' per row
for i in range(5):
    print(f"Label {i}: {train_labels[i]} - Sum: {np.sum(train_labels[i])}")

Label Mapping: {'barely-true': 0, 'false': 1, 'half-true': 2, 'mostly-true': 3, 'pants-fire': 4, 'true': 5}
Sample Encoded Train Labels: [[0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
Train Labels Shape: (8919, 6)
Label 0: [0. 1. 0. 0. 0. 0.] - Sum: 1.0
Label 1: [0. 0. 1. 0. 0. 0.] - Sum: 1.0
Label 2: [0. 0. 0. 1. 0. 0.] - Sum: 1.0
Label 3: [0. 1. 0. 0. 0. 0.] - Sum: 1.0
Label 4: [0. 0. 1. 0. 0. 0.] - Sum: 1.0


In [5]:
# Print unique labels from the dataset to ensure they are correct
print("Unique labels in training data:", train_df[0].unique())


Unique labels in training data: ['2635.json' '10540.json' '324.json' ... '12269.json' '9658.json'
 '3951.json']


In [6]:
print(train_df.head())

           0            1                                                  2   \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                                   3               4                     5   \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

         6           7     8     9    

In [7]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df[2])
train_sequences = tokenizer.texts_to_sequences(train_df[2])
train_padded = pad_sequences(train_sequences, maxlen=500)
valid_sequences = tokenizer.texts_to_sequences(valid_df[2])
valid_padded = pad_sequences(valid_sequences, maxlen=500)
test_sequences = tokenizer.texts_to_sequences(test_df[2])
test_padded = pad_sequences(test_sequences, maxlen=500)

# Build the model
model = Sequential([
    Embedding(10000, 64, input_length=500),
    LSTM(64),
    Dense(len(unique_labels), activation='softmax')  # Adjust the output layer
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Training
history = model.fit(train_padded, train_labels, epochs=10, validation_data=(valid_padded, valid_labels))

# Evaluation
test_loss, test_accuracy = model.evaluate(test_padded, test_labels)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 64)           640000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
Total params: 673414 (2.57 MB)
Trainable params: 673414 (2.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 4.557751655578613, Test Accuracy: 0.22178374230861664
