In [1]:
# We use the LIAR dataset introduced in this paper: https://aclanthology.org/P17-2067/
# Dataset download link: https://www.cs.ucsb.edu/~william/data/liar_dataset.zip

In [2]:
# Mount Google Drive at /content/drive; the dataset paths used by the cells
# below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import os
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [4]:
# Read the raw training split (tab-separated, no header row) and preview the first rows
train_tsv_path = os.path.join(
    r'/content/drive/MyDrive/Teaching/Online/DeepLearning/Assignment2/liar_dataset',
    'train.tsv',
)
liar_data_train = pd.read_csv(train_tsv_path, sep='\t', header=None)
liar_data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [5]:
# Column 1 holds the label of each statement; show the distinct values
distinct_labels = set(liar_data_train[1].to_list())
print(distinct_labels)

{'true', 'mostly-true', 'half-true', 'barely-true', 'pants-fire', 'false'}


In [6]:
# Categories: each label maps to its position in this list,
# so 'pants-fire' is 0 and 'true' is 5
cat_dict = {
    label: index
    for index, label in enumerate(
        ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
    )
}

In [10]:
# Load all splits of data
# Write an API to load training, validation, and test sets
def load_dataset(split_type='train', data_dir=None):
    '''
    Load one split of the LIAR dataset and return its statements and labels.

    Parameters
    ----------
    split_type : str
        Which split to read: 'train', 'valid', or 'test'. The file
        '<split_type>.tsv' is read from data_dir.
    data_dir : str, optional
        Directory that contains the .tsv files. When omitted, the Google
        Drive folder used throughout this notebook is used, so existing
        calls keep working unchanged.

    Returns
    -------
    X : list
        Values of column 2 (the statement), one entry per row.
    y : list of int
        Values of column 1 (the label) mapped to integers,
        from 0 ('pants-fire') to 5 ('true').

    Raises
    ------
    ValueError
        If split_type is not 'train', 'valid', or 'test'.
    KeyError
        If a row carries a label that is not one of the six known categories.
    '''
    known_splits = ('train', 'valid', 'test')
    if split_type not in known_splits:
        raise ValueError(
            "split_type must be one of {}, got {!r}".format(known_splits, split_type))

    if data_dir is None:
        data_dir = r'/content/drive/MyDrive/Teaching/Online/DeepLearning/Assignment2/liar_dataset'

    # Columns 1 and 2 are forced to str: without this, a file whose labels are
    # only 'true'/'false' is parsed as booleans and the label lookup below fails.
    # NOTE(review): pandas' default quote handling may merge rows when a
    # statement contains an unbalanced double quote -- compare the row counts
    # with the dataset README and consider quoting=csv.QUOTE_NONE if they differ.
    liar_data = pd.read_csv(os.path.join(data_dir, '{}.tsv'.format(split_type)),
                            sep='\t',
                            header=None,
                            dtype={1: str, 2: str})

    # Kept local so the function works even if the notebook-level mapping
    # has not been defined yet.
    cat_dict = {'pants-fire': 0,
                'false': 1,
                'barely-true': 2,
                'half-true': 3,
                'mostly-true': 4,
                'true': 5}

    X = liar_data[2].to_list()
    y = [cat_dict[label] for label in liar_data[1].to_list()]
    return X, y

# Read the three splits; each call returns (statements, integer labels)
X_train, y_train = load_dataset(split_type='train')
X_valid, y_valid = load_dataset(split_type='valid')
X_test, y_test = load_dataset(split_type='test')

In [13]:
# Text preprocessing: turn every statement into a fixed-length sequence of word indices.

# max_length is a tunable hyperparameter: the length every sequence is padded to
max_length = 20

# Build the vocabulary from the training statements only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Vocabulary size: one more than the number of distinct words seen in training
# (presumably so index 0 stays free for padding -- confirm against the Keras docs)
vocab_size = len(tokenizer.word_index) + 1

# Integer-encode each split with the vocabulary fitted on the training split
encoded_train = tokenizer.texts_to_sequences(X_train)
encoded_valid = tokenizer.texts_to_sequences(X_valid)
encoded_test = tokenizer.texts_to_sequences(X_test)

# Pad at the end ('post') so every split has sequences of length max_length.
# NOTE(review): the truncation side for longer statements is left at the
# library default -- confirm it keeps the part of the statement you want.
# These assignments replace the raw text lists with padded arrays.
X_train = pad_sequences(encoded_train, padding='post', maxlen=max_length)
X_valid = pad_sequences(encoded_valid, padding='post', maxlen=max_length)
X_test = pad_sequences(encoded_test, padding='post', maxlen=max_length)

In [16]:
# Convert categories to one-hot encoding.
# The width is pinned to the number of known categories: without num_classes,
# to_categorical infers it from the largest label present in each split, so a
# split that happens to lack the highest label would get a narrower matrix
# than the others.
num_classes = len(cat_dict)
y_train = to_categorical(y_train, num_classes=num_classes)
y_valid = to_categorical(y_valid, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Start your solutions below