In [1]:
import pandas as pd
import torch, gc
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# gc.collect()
# torch.cuda.empty_cache()


In [2]:
import os
import numpy as np
import re
import pickle

In [3]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [4]:
# dataset = pd.read_csv('../FakeNewsNet/Preprocessed_FakeNewsNet.csv', quotechar='"', index_col=0, encoding="latin-1")

In [8]:
# import nvidia_smi

# nvidia_smi.nvmlInit()

# deviceCount = nvidia_smi.nvmlDeviceGetCount()
# for i in range(deviceCount):
#     handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
#     info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
#     print("Device {}: {}, Memory : ({:.2f}% free): {}(total), {} (free), {} (used)".format(i, nvidia_smi.nvmlDeviceGetName(handle), 100*info.free/info.total, info.total, info.free, info.used))

# nvidia_smi.nvmlShutdown()

## Read all LIAR data

In [4]:
original_titles = ['statement_ID', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title', 'state_info','party_affiliation', 'barely_true',
                  'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

file_dir = '../dataset/liar_dataset_/'

def dataset_development(dir_, new_title):
    dataset = pd.read_csv(dir_, sep = '\t', header=None, index_col = False)
#     titles = ['col_' + str(i) for i in range(1, len(list(dataset.columns))+1)]
    titles = [i for i in range(len(list(dataset.columns)))]
    title_dict = {i:j for i,j in zip(titles, new_title)}
    dataset = dataset.rename(columns=title_dict)
#     dataset.columns = new_title
    
    return dataset

train_data = dataset_development(file_dir + 'train.tsv', original_titles)
valid_data = dataset_development(file_dir + 'valid.tsv', original_titles)
test_data = dataset_development(file_dir + 'test.tsv', original_titles)

In [5]:
train_data['y'] = train_data['label'].apply(lambda x: 0 if x == 'pants-fire' else 1 if x == 'false'
                                         else 2 if x == 'half-true' else 3 if x == 'mostly-true' else 4)

In [6]:
valid_data['y'] = valid_data['label'].apply(lambda x: 0 if x == 'pants-fire' else 1 if x == 'false'
                                         else 2 if x == 'half-true' else 3 if x == 'mostly-true' else 4)
test_data['y'] = test_data['label'].apply(lambda x: 0 if x == 'pants-fire' else 1 if x == 'false'
                                         else 2 if x == 'half-true' else 3 if x == 'mostly-true' else 4)

In [7]:
print(train_data.shape)
print(valid_data.shape)
print(test_data.shape)

(10240, 15)
(1284, 15)
(1267, 15)


## Data Preprocessing

In [7]:
def clean_text(df):
    all_text_contents = list()
#     lines = df["text"].values.tolist()
    lines = df["statement"].astype('string').tolist()
    for text in lines:
        text = text.lower()
        
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        
        emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        text = emoji.sub(r'', text)
        
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"i've", "i have", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"that's", "that is", text)        
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"they're", "they are", text)        
        text = re.sub(r"you're", "you are", text)   
        text = re.sub(r"you've", "you have", text)   
        text = re.sub(r"we've", "we have", text) 
        text = re.sub(r"they've", "they have", text) 
#         text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text) 
        text = re.sub(r"\'ll", " will", text)  
        text = re.sub(r"\'ve", " have", text)  
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"didn't", "did not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"couldn't", "could not", text)
        text = re.sub(r"have't", "have not", text)
        
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", " ", text)
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
#         stop_words = set(stopwords.words("english"))
#         stop_words.discard("not")
#         words = [w for w in words if not w in stop_words]
        words = ' '.join(words)
        all_text_contents.append(words)
    return all_text_contents

In [8]:
# def tokenizing_and_more(corpus):
#     texts = []
#     for text in corpus:
# #         lemmatized_words = []
#         tokenized_words = []
# #         for word in text.split(" "):
#         for line in text:
# #             if word not in set(stopwords.words('english')):
# #                 lemmatized_word = wordnet_lemmatizer.lemmatize(word)
#             tokenized_words = 
#             tokenized_words.append(word)
#         texts.append(tokenized_words)
            
#     return texts

In [8]:
train_cleansed_statements = clean_text(train_data)
len(train_cleansed_statements)

10240

In [9]:
train_data['preprocessed_statement'] = train_cleansed_statements
train_data['tokenized_preprocessed_statement'] = train_data['preprocessed_statement'].apply(lambda x: word_tokenize(x))

In [10]:
train_data['word_count'] = train_data['tokenized_preprocessed_statement'].apply(lambda x: len(x))
train_data['word_count'].max()

456

In [12]:
train_data.head()

Unnamed: 0,statement_ID,label,statement,subject,speaker,speaker_job_title,state_info,party_affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context,y,preprocessed_statement,tokenized_preprocessed_statement,word_count
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,1,says the annies list political group supports ...,"[says, the, annies, list, political, group, su...",12
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,2,when did the decline of coal start it started ...,"[when, did, the, decline, of, coal, start, it,...",24
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,3,hillary clinton agrees with john mccain by vot...,"[hillary, clinton, agrees, with, john, mccain,...",19
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,1,health care reform legislation is likely to ma...,"[health, care, reform, legislation, is, likely...",12
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,2,the economic turnaround started at the end of ...,"[the, economic, turnaround, started, at, the, ...",10


In [None]:
# train_data.to_csv('C:/Users/Jiwi/Documents/NLP/Fakes News Detection/dataset/liar_dataset_/preprocessed_train.csv')

## Bert Embedding

In [None]:
# Reference for long texts: https://github.com/jamescalam/transformers/blob/main/course/language_classification/04_window_method_in_pytorch.ipynb

In [None]:
''' Take a look at flair's  TransformerWordEmbeddings!!!! '''

In [11]:
from transformers import BertTokenizer, BertModel

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
device

device(type='cuda')

In [13]:
# list(tokenizer.vocab.keys())[5000:5020]
model = BertModel.from_pretrained('bert-base-cased', output_hidden_states = True).to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
print("Size of Vocabulary with which BERT was pre-trained on:", len(list(tokenizer.vocab.keys())))

Size of Vocabulary with which BERT was pre-trained on: 28996


In [91]:
# input_ = '[CLS] ' +  train_cleansed_statements[0] + ' [SEP]'
# tokenized_test = tokenizer.tokenize(input_)
# tokenized_test
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_test)
# indexed_tokens

In [93]:
# tokenized_test

# segments_ids = [1] * len(tokenized_test)
# print('segments_ids length:', len(segments_ids))
# print('tokenized text length:', len(tokenized_test))

In [None]:
''' ------------- Some important notes in BERT embeddings ------------- '''

'''hash signs preceding some of these subwords are just our tokenizer’s way to denote that this subword 
or character is part of a larger word and preceded by another subword.'''

## Input data for bert embeddings

In [14]:


# text = "Replace me by any text you'd like because I hate dirty space here."
# encoded_input = tokenizer(text, return_tensors='pt', do_lower_case =True)  
# encoded_input

def bert_tokenized_text(corpus):
    embedding_info = dict()
    for idx, corp in enumerate(corpus):
        input_ = '[CLS] ' +  corp + '.' + ' [SEP]'
        tokenized_texts = tokenizer.tokenize(input_)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_texts)
        segments_ids = [idx] * len(tokenized_texts)
        
        assert len(tokenized_texts) == len(segments_ids)
        assert len(tokenized_texts) == len(indexed_tokens)
        
        embedding_info['Sentence_' + str(idx)] = {'Tokenized corpus': tokenized_texts, 
                                                 'Indexed Tokens': indexed_tokens, 'Segment IDs': segments_ids}
    return embedding_info
        
Embedding_input_info_liar_train = bert_tokenized_text(train_cleansed_statements)

In [39]:
print(list(Embedding_input_info_liar_train.items())[0], end='')

('Sentence_0', {'Tokenized corpus': ['[CLS]', 'says', 'the', 'an', '##nies', 'list', 'political', 'group', 'supports', 'third', 'trim', '##ester', 'abortion', '##s', 'on', 'demand', '.', '[SEP]'], 'Indexed Tokens': [101, 1867, 1103, 1126, 16133, 2190, 1741, 1372, 6253, 1503, 13373, 12831, 12030, 1116, 1113, 4555, 119, 102], 'Segment IDs': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})

### Find highest number of size of indexed tokens:

In [26]:
sentence_token_id_sizes = []
for tups in list(Embedding_input_info_liar_train.items()):
    sentence_info = tups[1]
    indexed_tokens_size = len(sentence_info['Indexed Tokens'])
    sentence_token_id_sizes.append(indexed_tokens_size)

In [27]:
print('Highest Size of Sentence Token ID:', max(sentence_token_id_sizes))

Highest Size of Sentence Token ID: 596


In [25]:
# print(list(Embedding_input_info_liar_train.items())[:1][0][1], end='')

In [26]:
# sample_tensor = torch.Tensor([list(Embedding_input_info_liar_train.items())[:1][0][1]['Indexed Tokens']]).cuda(device)

In [19]:
# model = BertModel.from_pretrained('bert-base-cased', output_hidden_states = True).to(device)

### sample bert testing

In [19]:
# print(Embedding_input_info_liar_train, end='')

In [20]:
# first_embedding = Embedding_input_info_liar_train['Sentence_0']['Indexed Tokens']

In [30]:
# zero_tensor = torch.zeros(10).reshape(1, -1).to(device)
# zero_tensor.shape

torch.Size([1, 10])

In [23]:
# token_tensor.shape
# zero_tensor = torch.zeros(10).reshape(1, -1).to(device)
# token_tensor_padded = torch.cat([token_tensor, zero_tensor], dim=1).to(device)
# token_tensor_padded.shape

In [18]:
# token_tensor

In [20]:
# token_tensor_padded = token_tensor_padded.type(torch.LongTensor)

In [22]:
# token_tensor_padded.shape

In [19]:
# # token_tensor = torch.LongTensor([first_embedding]).cuda(device)
# with torch.no_grad():
#     outputs = model(token_tensor_padded.cuda(device))
#     hidden_states = outputs[2]
#     token_embeddings = torch.stack(hidden_states, dim=0)
#     token_embeddings = torch.squeeze(token_embeddings, dim=1)
# #         print(token_embeddings.size())
#     token_padded_embeddings = token_embeddings.permute(1,0,2)

In [16]:
# token_padded_embeddings[-1]

## Embedding Creation Part

In [21]:
from tqdm import tqdm
import time

In [53]:
# zero_tensor = torch.zeros(10).reshape(1, -1).type(torch.LongTensor).to(device)
# token_tensor_padded = torch.cat([token_tensor, zero_tensor], dim=1).to(device)

In [64]:
model.eval()
def create_bert_embeddings(input_dictionary):
#     model.eval()
    max_seq_len = 512
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     print('Device:', device)
#     current_device = device
    # sep_indicator = torch.LongTensor([102]).cuda(device)
    sep_indicator = torch.LongTensor([102]).to(device)
    layer_embeddings = []

    for idx, tups in enumerate(tqdm(list(input_dictionary.items()))):
#         token_ids, segment_ids = tups[1]['Indexed Tokens'], tups[1]['Segment IDs']
        token_ids = tups[1]['Indexed Tokens']
    #     model.eval()
#         tokens_tensor, segment_id_tensor = torch.LongTensor([token_ids]).to(device), torch.Tensor([segment_ids]).to(device)
        tokens_tensor = torch.LongTensor([token_ids]).to(device)
#         print('tokens_tensor:', tokens_tensor.is_cuda)
        if tokens_tensor.shape[1] > max_seq_len:

            tokens_tensor = tokens_tensor[:,:max_seq_len-1].to(device)
#             print('tokens_tensor 1:', tokens_tensor.is_cuda)
            tokens_tensor = torch.cat((tokens_tensor, sep_indicator.unsqueeze(0)), dim=-1).to(device)
#             print('tokens_tensor 2:', tokens_tensor.is_cuda)

        else:
            zero_tensor = torch.zeros(max_seq_len-tokens_tensor.shape[1]).reshape(1, -1).to(device)
#             print('zero_tensor :', zero_tensor.is_cuda)
            tokens_tensor = torch.cat([tokens_tensor, zero_tensor], dim=1).type(torch.LongTensor).to(device)


    #         print(tokens_tensor, tokens_tensor.shape)
#         print('tokens_tensor:', tokens_tensor.is_cuda)
    #     print('tensor type:', type(tokens_tensor))

        # one of these tokens tensors has size: torch.Size([1, 596])

        # Run the text through BERT, and collect all of the hidden states produced from all 12 layers.
#         print('current device:', device)
        with torch.no_grad():
            # the third item will be the hidden states from all layers.
            
            outputs = model(tokens_tensor)
            
            
            tokens_tensor.to('cpu')
            hidden_states = outputs[2]

            token_embeddings = torch.stack(hidden_states, dim=0)
            token_embeddings = torch.squeeze(token_embeddings, dim=1).permute(1,0,2)
    #         print(token_embeddings.size())
#             token_embeddings = token_embeddings.permute(1,0,2)

            # sum of the outputs from last 4 layers
            summed_tok_vecs = []

            for token in token_embeddings:
    #             print('embeddings from last 4 layers:', token[-4:])
    #             print('=' * 100)
#                 summed_tok_vecs.append(torch.sum(token[-4:], dim=0))
#                 sum_vec = torch.sum(token[-4:], dim=0)
    #             print('summed vector:',  sum_vec)
    #             print('shape of summed vector:', sum_vec.shape)
#                 summed_tok_vecs.append(sum_vec)
                summed_tok_vecs.append(torch.sum(token[-4:], dim=0))


            layer_embeddings.append(summed_tok_vecs)
            time.sleep(0.5)
            
    return layer_embeddings

def assemble_and_bert_embeddings(layer_embeddings, destination_dir):
    stacked_vectors_ = []
    for i in range(len(layer_embeddings)):
        vecs_to_stack = tuple([embed_vect for embed_vect in layer_embeddings[i]])

        stacked_vectors = torch.stack(vecs_to_stack, dim=0)
        stacked_vectors_.append(stacked_vectors)
        
#     with open(destination_dir + 'huggingface_bert_train_embeddings.pickle', 'wb') as f:
#         pickle.dump(stacked_vectors_, f)
        
    return stacked_vectors_
    
def save_bert_embeddings(stacked_vectors):
    destination_dir = '../dataset/liar_dataset_/huggingface_bert_embeddings/'
    with open(destination_dir + 'huggingface_bert_train_embeddings.pickle', 'wb') as f:
        pickle.dump(stacked_vectors, f)
    
def main():
    
    layer_embeddings = create_bert_embeddings(Embedding_input_info_liar_train)
    token_stacked_embeddings = assemble_and_bert_embeddings(layer_embeddings, '../dataset/liar_dataset_/huggingface_bert_embeddings/')
    save_bert_embeddings(token_stacked_embeddings)
    

if __name__ == "__main__":
    main()
            

 60%|██████████████████████████████████████████████▌                              | 6191/10240 [56:15<36:47,  1.83it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.00 GiB total capacity; 9.52 GiB already allocated; 19.75 MiB free; 9.57 GiB reserved in total by PyTorch)

In [67]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [68]:
from GPUtil import showUtilization as gpu_usage
gpu_usage()

| ID | GPU | MEM |
------------------


## Check the saved embeddings

In [8]:
destination_dir = '../dataset/liar_dataset_/huggingface_bert_embeddings/'
with open(destination_dir + 'huggingface_bert_train_embeddings.pickle', 'rb') as pickle_file:
    loaded_train_embeddings = pickle.load(pickle_file)

In [9]:
len(loaded_train_embeddings)

5

In [10]:
print(len(loaded_train_embeddings))
print(loaded_train_embeddings[0].shape, loaded_train_embeddings[1].shape)
print(loaded_train_embeddings[-1].shape)

5
torch.Size([512, 768]) torch.Size([512, 768])
torch.Size([512, 768])


In [35]:
# pkl_file = open(destination_dir + 'number.pickle', 'rb')
# data = pickle.load(pkl_file)
# print(data)
# print(type(data))
# pkl_file.close()

## Getting all embeddings together

In [50]:
# stacked_vectors_ = []
# for i in range(len(sample_embedding)):
#     vecs_to_stack = tuple([embed_vect for embed_vect in sample_embedding[i]])

#     stacked_vectors = torch.stack(vecs_to_stack, dim=0)
#     stacked_vectors_.append(stacked_vectors)
    

In [51]:
# print(len(stacked_vectors_))
# print(stacked_vectors_[0].shape, stacked_vectors_[1].shape)
# print(stacked_vectors_[2].shape)

In [None]:
with open('../dataset/liar_dataset_/huggingface_bert_embeddings/huggingface_bert_train_embeddings.pickle', 'wb') as f:
    pickle.dump(stacked_vectors_, f)

# Flair's Embedding package (another option for embedding)

In [7]:
# preprocessed_train_data = pd.read_csv('../dataset/liar_dataset_/preprocessed_train.csv', sep = '\t')

In [17]:
train_data.head()

Unnamed: 0,statement_ID,label,statement,subject,speaker,speaker_job_title,state_info,party_affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context,preprocessed_statement,tokenized_preprocessed_statement,word_count
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,says the annies list political group supports ...,"[says, the, annies, list, political, group, su...",12
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,when did the decline of coal start it started ...,"[when, did, the, decline, of, coal, start, it,...",24
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,hillary clinton agrees with john mccain by vot...,"[hillary, clinton, agrees, with, john, mccain,...",19
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,health care reform legislation is likely to ma...,"[health, care, reform, legislation, is, likely...",12
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,the economic turnaround started at the end of ...,"[the, economic, turnaround, started, at, the, ...",10


In [99]:
train_data['label'].unique()

array(['false', 'half-true', 'mostly-true', 'true', 'barely-true',
       'pants-fire'], dtype=object)

### give labels their numerical values

In [18]:
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence

In [19]:
bert_embedding = TransformerWordEmbeddings('bert-base-uncased', subtoken_pooling='mean', layers='-1,-2,-3,-4')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
sentence_1 = Sentence(list(train_data['preprocessed_statement'])[0])
type(sentence_1)

flair.data.Sentence

In [35]:
# [tok for tok in sentence_1]
bert_embedding.embed(sentence_1)

# print(sentence_1[0].embedding)

[Sentence: "says the annies list political group supports third trimester abortions on demand"   [− Tokens: 12]]

In [20]:
sentence_1[0].embedding

tensor([], device='cuda:0')

# Loop for embedding

In [26]:
from tqdm import tqdm
import time

In [27]:
# bert_embedding.embed(sentence_1)
corpus_embeddings = []
for corpus in tqdm(list(train_data['preprocessed_statement'])):
    corpus = Sentence(corpus)
    bert_embedding.embed(corpus)
    token_embeddings = []
    for token in corpus:
    #     print(token.embedding.shape)
        token_embeddings.append(token.embedding.unsqueeze(0).cuda(device))

    token_embeddings = torch.cat(token_embeddings, dim=0).cuda(device)
    corpus_embeddings.append(token_embeddings)
    time.sleep(0.5)


100%|██████████████████████████████████████████████████████████████████████████| 10240/10240 [1:29:57<00:00,  1.90it/s]


In [31]:
with open('../dataset/liar_dataset_/bert_embeddings/bert_train_embeddings.pickle', 'wb') as f:
    pickle.dump(corpus_embeddings, f)

In [29]:
corpus_embeddings[-1].shape

torch.Size([30, 768])

In [31]:
token_embeddings.shape

torch.Size([12, 768])

### putting results from layers together

In [53]:
# sample_embedding[0]

# token_vecs_cat = []

# for token in sample_embedding[0]:
    
#     # `token` is a [12 x 768] tensor

#     # Concatenate the vectors (that is, append them together) from the last 
#     # four layers.
#     # Each layer vector is 768 values, so `cat_vec` is length 3,072.
# #     print(token[-1].shape)
#     cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    
#     # Use `cat_vec` to represent `token`.
#     token_vecs_cat.append(cat_vec)

In [21]:
# # confirming outputs of everything
# print('Number of layers:', len(sample_embedding[0]))  # 13 layers
# print('Number of batches:',len(sample_embedding[0][0]))  # 1 sentence
# print('Number of tokens:',len(sample_embedding[0][0][0]))  # 17 tokens
# print('Number of hidden units:',len(sample_embedding[0][0][0][0]))

### Make the data into batches (TO be done once all embeddings are made)

## CNN Model

In [11]:
import torchvision
import torchvision.transforms as transforms

In [12]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
#         self.conv1 = nn.Conv2d(1, 10, 3
          # (in_channel, out_channel, kernel size)  Input dims: (batch size, Num channels in, Height_in, Width_in)
                                            # Output dims: (batch size, Num channels out, Height_out, Width_out)   could start with filter size of 3 & 5
#         self.pool = nn.MaxPool2d(2, 2)  # (kernel_size, stride)  Input dims: (batch size, Num channels in, Height_in, Width_in)   
        self.emb_size = 768
        self.conv1 = nn.Conv1d(self.emb_size, 50, kernel_size=2, stride=2)      # Input dims: (batch size, Num channels in, L_in)  Output dims: (batch size, Num channels out, L_out)
        self.pool1 = nn.MaxPool1d(2, stride=2)
        self.conv2 = nn.Conv1d(50, 20, kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool1d(2, stride=2)
        self.fc1 = nn.Linear(20 * 32, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 5)  # 5 classes

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()

In [13]:
criterion = nn.CrossEntropyLoss()  # used for multi-class classification, nn.BCEWithLogitsLoss() is used when there are binary classes 
                                    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

## Train

In [None]:
'''get embeddings and labels as tuples'''

### verify dimensions of input

In [14]:
# sample_train_true_y = list(train_data['y'])[:5]
# sample_train_true_y
# zero_vector = [np.zeros(5) for _ in range(len(sample_train_true_y))]

# sample_train_true_ys = []
# for vector, idx in zip(zero_vector, sample_train_true_y):

#     vector[idx] = 1 
#     sample_train_true_ys.append(vector)
# true_y
# zero_vector[1] = 1
# zero_vector
sample_train_true_y = [np.array([i]) for i in list(train_data['y'])[:5]]
# print(sample_train_true_y)
train_dataset = [(x, y) for x, y in zip(loaded_train_embeddings, sample_train_true_y)]
# train_dataset
x, y = train_dataset[0]
x.shape

In [48]:
from sklearn.metrics import accuracy_score, f1_score, precision_score
import matplotlib.pyplot as plt

In [None]:
# for x, y in enumerate(range(20)):
#     plt.plot(x, y)

In [None]:
# def training_loss_plotting(loss_value):
    

In [28]:
train_batch_losses, val_batch_losses = [], []

for epoch in range(2):  # loop over the dataset multiple times (epochs)

    running_epoch_loss = 0.0
    
    # Here we are looping over each batch 
    for i, data in enumerate(train_dataset[:1], 0):   # in my case, I made the trainData as a list of tuples of (embedding(shape:[512, 768]), label)
        # get the inputs; data is a list of [inputs, labels]
        net.train()     
        inputs, labels = data
        inputs = inputs.unsqueeze(0).permute(0, 2, 1)
   
#         print(inputs.shape)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)   # In this case the input has shape: (batch size, Num channels in, Height_in, Width_in)   
                                 # shape: torch.Size([1, 5])
        
#         print(outputs.shape)
        
        softmax_outputs = F.softmax(outputs, dim=1) # shape: torch.Size([1, 5])
        
        pred_y = torch.argmax(sample_softmax, dim=1)
#         print('softmax_outputs:', softmax_outputs, softmax_outputs.shape)
#         print('pred_y:', pred_y.item(), pred_y.shape)

#         loss = criterion(outputs, labels)
  
        train_loss = criterion(softmax_outputs, torch.LongTensor(labels))
        
#         train_losses.append(train_loss)
        
#         print('loss:', loss)
        train_loss.backward()  # calculates gradients
        optimizer.step()   # Updating the weights

        # print statistics
        running_epoch_loss += train_loss.item()
        if i % 1 == 1:    # print every 200 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_epoch_loss / 2000))
            running_epoch_loss = 0.0
    train_batch_losses.append(running_epoch_loss)
            
        # Do the same with Validation data:
#     va
    val_running_loss = 0.0
    for i, data in enumerate(valid_dataset, 0):   # in my case, I made the trainData as a list of tuples of (embedding(shape:[512, 768]), label)
        # get the inputs; data is a list of [inputs, labels]
        net.eval()     
        valid_inputs, valid_labels = data
        valid_inputs = valid_inputs.unsqueeze(0).permute(0, 2, 1)
        
        val_outputs = net(inputs)
        val_softmax_outputs = F.softmax(val_outputs, dim=1)
        val_pred_y = torch.argmax(val_softmax_outputs, dim=1)
        
        val_loss = criterion(val_softmax_outputs, torch.LongTensor(valid_labels))
        
        val_running_loss += val_loss/item()
        
    val_batch_losses.append(val_running_loss)
        

print('Finished Training')

pred_y: 4 torch.Size([1])
pred_y: 4 torch.Size([1])
Finished Training


In [26]:
sample_softmax = torch.Tensor([[0.1828, 0.2236, 0.1699, 0.1922, 0.2315]])
s_s_argnax = torch.argmax(sample_softmax, dim=1)

print(s_s_argnax)

tensor([4])


## Save model

In [30]:
PATH = '../dataset/liar_dataset_/BERT_CNN_weights/cnn_bert_1.pth'

In [182]:
PATH = '../dataset/liar_dataset_/BERT_CNN_weights/cnn_bert_1.pth'
torch.save(net.state_dict(), PATH)

## load saved weights to model

In [31]:
net = Net()
net.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [37]:
train_dataset[0]

(tensor([[-1.4252, -3.8400, -3.8484,  ..., -5.4332,  1.9479,  3.5162],
         [ 0.1662, -7.5079,  2.8423,  ..., -3.7434,  6.5641,  2.5978],
         [-2.9217, -5.5550, -1.3506,  ..., -1.5238,  1.6359,  2.7358],
         ...,
         [-0.3025, -1.8622,  0.0407,  ..., -9.4621,  3.7189, -0.0241],
         [ 0.3599, -3.6430, -0.7354,  ..., -9.5790,  2.8197, -1.4477],
         [-2.5329, -3.1569, -0.9558,  ..., -8.0567,  2.8855, -1.3816]]),
 array([1]))

In [None]:
'''net.eval() '''  # used to evaluate or test 

## Test model

In [49]:
def test_model(saved_model, dataset):
    pred_test_ys, true_test_ys = [], []
    saved_model.eval()  # This means we are testing the model 
    with torch.no_grad():  # exclude gradient computations.
        for i, data in enumerate(dataset, 0):
            inputs, true_test_y = data
            inputs = inputs.unsqueeze(0)
            inputs = inputs.permute(0, 2, 1)
            pred_classes = saved_model(inputs)
            softmax_test_output = F.softmax(pred_classes)

            pred_test_y = torch.argmax(softmax_test_output, dim=1)
            pred_test_ys.append(pred_test_y.item())
            true_test_ys.append(true_test_y)
        
    test_accuracy = accuracy_score(true_test_ys, pred_test_ys)
    test_precision = precision_score(true_test_ys, pred_test_ys, average='micro')
    test_f1 = f1_score(true_test_ys, pred_test_ys, average='micro')
    
    return test_accuracy, test_precision, test_f1

In [50]:
test_accuracy, test_precision, test_f1 = test_model(net, train_dataset)

  softmax_test_output = F.softmax(pred_classes)


In [44]:
test_accuracy

0.6

In [51]:
test_precision

0.6

In [45]:
test_f1

0.6