In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
import torchvision.transforms as transforms
import numpy as np
from sklearn.metrics import f1_score

# Preprocessing Trainset Testset Validset

In [2]:
#Trainset
data_file='train.tsv'
df=pd.read_csv(data_file,sep='\t',header=None)
#df=df.sort_values(by=1)

In [3]:
df_train=df[[1,2]]
df_train.columns =['category', 'news'] 
df_train.head()

Unnamed: 0,category,news
0,false,Says the Annies List political group supports ...
1,half-true,When did the decline of coal start? It started...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
3,false,Health care reform legislation is likely to ma...
4,half-true,The economic turnaround started at the end of ...


In [4]:
df_train.category.value_counts()

half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: category, dtype: int64

In [5]:
labels=df_train.category.unique()

In [6]:
#create label dictionary
label_dict={}

for i,label in enumerate(labels):
    label_dict[label]=i

In [7]:
label_dict

{'false': 0,
 'half-true': 1,
 'mostly-true': 2,
 'true': 3,
 'barely-true': 4,
 'pants-fire': 5}

In [8]:
df_train['label']=df_train.category.replace(label_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['label']=df_train.category.replace(label_dict)


In [9]:
len(label_dict)

6

In [10]:
df_train.head()

Unnamed: 0,category,news,label
0,false,Says the Annies List political group supports ...,0
1,half-true,When did the decline of coal start? It started...,1
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",2
3,false,Health care reform legislation is likely to ma...,0
4,half-true,The economic turnaround started at the end of ...,1


In [11]:
#testset
data_file='test.tsv'
df_test=pd.read_csv(data_file,sep='\t',header=None)
#df_test=df_test.sort_values(by=2)

In [12]:
df_test=df_test[[1,2]]
df_test.columns =['category', 'news'] 
df_test['label']=df_test.category.replace(label_dict)

df_test.head()

Unnamed: 0,category,news,label
0,true,Building a wall on the U.S.-Mexico border will...,3
1,false,Wisconsin is on pace to double the number of l...,0
2,false,Says John McCain has done nothing to help the ...,0
3,half-true,Suzanne Bonamici supports a plan that will cut...,1
4,pants-fire,When asked by a reporter whether hes at the ce...,5


In [13]:
#validset
data_file='valid.tsv'

df_valid=pd.read_csv(data_file,sep='\t',header=None)
#df_valid=df_valid.sort_values(by=2)

df_valid=df_valid[[1,2]]
df_valid.columns =['category', 'news'] 

df_valid['label']=df_valid.category.replace(label_dict)

df_valid.head()

Unnamed: 0,category,news,label
0,barely-true,We have less Americans working now than in the...,4
1,pants-fire,"When Obama was sworn into office, he DID NOT u...",5
2,false,Says Having organizations parading as being so...,0
3,half-true,Says nearly half of Oregons children are poor.,1
4,half-true,On attacks by Republicans that various program...,1


In [14]:
df_train.shape

(10240, 3)

In [15]:
df_test.shape

(1267, 3)

In [16]:
df_valid.shape

(1284, 3)

In [17]:
frames = [df_train, df_test, df_valid]
result = pd.concat(frames)


In [18]:
#Average length of each news statement 
result["Length"]= result["news"].str.len() 
result.Length.max()


3192

In [19]:
result

Unnamed: 0,category,news,label,Length
0,false,Says the Annies List political group supports ...,0,82
1,half-true,When did the decline of coal start? It started...,1,141
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",2,105
3,false,Health care reform legislation is likely to ma...,0,78
4,half-true,The economic turnaround started at the end of ...,1,54
...,...,...,...,...
1279,half-true,"For the first time in more than a decade, impo...",1,104
1280,mostly-true,Says Donald Trump has bankrupted his companies...,2,82
1281,true,"John McCain and George Bush have ""absolutely n...",3,80
1282,false,A new poll shows 62 percent support the presid...,0,196


In [20]:
len(result['news'][0])

3

In [21]:
result.shape

(12791, 4)

In [22]:
result.groupby(['category']).count()

Unnamed: 0_level_0,news,label,Length
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
barely-true,2103,2103,2103
false,2507,2507,2507
half-true,2627,2627,2627
mostly-true,2454,2454,2454
pants-fire,1047,1047,1047
true,2053,2053,2053


# Loading Tokenizer and Encoding Data

In [23]:
from transformers import BertTokenizer # part of BERT model 
from torch.utils.data import TensorDataset #setup dataset in a pytorch environment
from torch import Tensor #change number to tensor because run on GPU

In [24]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True) #use bert-as-service to encode data

In [25]:
#from words to encodded 
#this is a dictionary 
encoded_train=tokenizer.batch_encode_plus(
    df_train.news.values,
    add_special_tokens=True, #know where sentences end and where sentences begin
    return_attention_mask=True, #  fix input.  need to make them the same length.
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt' #currently using pytorch 
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [26]:
#from words to encodded 
encoded_test=tokenizer.batch_encode_plus(
    df_test.news.values,
    add_special_tokens=True,
    return_attention_mask=True, # we use fix input. we need to make them the same length
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt' #currently using pytorch 
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [27]:
#from words to encodded
encoded_valid=tokenizer.batch_encode_plus(
    df_valid.news.values,
    add_special_tokens=True,
    return_attention_mask=True, # we use fix input. we need to make them the same length
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt' #currently using pytorch 
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [28]:
input_ids_train=encoded_train['input_ids'] #represent each word as a number; encoded_train is a dictionary
attention_mask_train=encoded_train['attention_mask']
labels_train=torch.tensor(df_train.label.values) #change labels into tensors in pytorch environment

In [29]:
input_ids_test=encoded_test['input_ids']
attention_mask_test=encoded_test['attention_mask'] #input_ids_test and attention mask have same demensions
labels_test=torch.tensor(df_test.label.values) #change labels into tensors in pytorch environment

In [30]:
input_ids_valid=encoded_valid['input_ids']
attention_mask_valid=encoded_valid['attention_mask'] #input_ids_valid and attention mask have same demensions
labels_valid=torch.tensor(df_valid.label.values) #change labels into tensors in pytorch environment

In [31]:
#using dataset in Pytorch environment 
traindataset=TensorDataset(input_ids_train,#number
                           attention_mask_train,
                           labels_train) #tensor

In [32]:
testdataset=TensorDataset(input_ids_test,
                           attention_mask_test,
                           labels_test)

In [33]:
validdataset=TensorDataset(input_ids_valid,
                           attention_mask_valid,
                           labels_valid)

In [34]:
len(traindataset)# check 

10240

In [35]:
len(testdataset) #check

1267

In [36]:
len(validdataset) #check

1284

In [37]:
attention_mask_test.shape

torch.Size([1267, 256])

In [38]:
encoded_train['attention_mask']

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

# Importing BERT pretained model

In [39]:
from transformers import BertForSequenceClassification 

In [40]:
#use bert-base-uncased save time. dont use bert large 
model=BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=6,output_attentions=False,output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# DataLoader

In [41]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


In [42]:
dataloader_train=DataLoader(
    traindataset,
    sampler=RandomSampler(traindataset),#randomize dataset during training
    batch_size=4
)

In [43]:
dataloader_test=DataLoader(
    testdataset,
    sampler=SequentialSampler(testdataset),
    batch_size=32
)

In [44]:
dataloader_valid=DataLoader(
    validdataset,
    sampler=SequentialSampler(validdataset),
    batch_size=32
)

# Optimizer and Scheduler

In [45]:
#optimizer define learning rate
from transformers import AdamW, get_linear_schedule_with_warmup

In [46]:
optimizer=AdamW(model.parameters(),lr=2e-5,eps=1e-8) #lr:2e-5>5e-5

In [47]:
epochs=5
scheduler=get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=len(dataloader_train)*epochs)
#len(dataloader_train)*epochs : how many times learning rate needs to be changed 

# Setting up Performance Metrics

In [48]:
def f1_score_function(pred, labels):
    pred_flatten=np.argmax(pred,axis=1).flatten()
    labels_flat=labels.flatten()
    return f1_score(labels_flat,pred_flatten,average='weighted') #use macro, depends on situation

# Training Loop

In [49]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
model.to(device)
print(device)

cuda


In [50]:
#seed values, safe!
import random

random.seed(17)
np.random.seed(17)
torch.manual_seed(17)
torch.cuda.manual_seed_all(17)

In [51]:
def evaluate(dataloader_test):
    model.eval()
    loss_val_total=0
    predictions,true_vals=[],[]
    
    for batch in tqdm(dataloader_test):
        batch=tuple(b.to(device) for b in batch)
        inputs={'input_ids':batch[0],'attention_mask':batch[1],'labels':batch[2]}
        
        with torch.no_grad():
            outputs=model(**inputs)
            
        loss=outputs[0]
        logits=outputs[1]
        loss_val_total+=loss.item()
        
        logits=logits.detach().cpu().numpy()
        label_ids=inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg=loss_val_total/len(dataloader_test)
    
    predictions=np.concatenate(predictions,axis=0)
    true_vals=np.concatenate(true_vals,axis=0)
    
    return loss_val_avg,predictions, true_vals

In [52]:
for epoch in tqdm(range(1,epochs+1)):
    model.train()
    loss_train_total=0
    progress_bar=tqdm(dataloader_train,
                     desc='Epoch {:1d}'.format(epoch),
                     leave=False)
    for batch in progress_bar:
        model.zero_grad()
        batch=tuple(b.to(device) for b in batch)
        inputs={
            'input_ids':batch[0],
            'attention_mask':batch[1],
            'labels':batch[2]
        }
        outputs=model(**inputs)
        
        loss=outputs[0]
        loss_train_total+=loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss':'{:.3f}'.format(loss.item()/len(batch))})
    
    torch.save(model.state_dict(),f'Models/BERT_test{epoch}.model')
    
    tqdm.write('\nEpoch {epoch}')
    
    loss_train_avg=loss_train_total/len(dataloader_train)
    tqdm.write(f'Training Loss: {loss_train_avg}')
    
    val_loss,predictions,true_vals=evaluate(dataloader_test)
    val_f1=f1_score_function(predictions,true_vals)
    tqdm.write(f'Test Loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')
        
        
        
        

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=2560.0, style=ProgressStyle(description_wid…


Epoch {epoch}
Training Loss: 1.7210439275484533


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Test Loss: 1.6652259528636932
F1 Score (weighted): 0.26667281948074734


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=2560.0, style=ProgressStyle(description_wid…


Epoch {epoch}
Training Loss: 1.571524359099567


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Test Loss: 1.7038526713848114
F1 Score (weighted): 0.2613241042528043


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=2560.0, style=ProgressStyle(description_wid…


Epoch {epoch}
Training Loss: 1.178377306467155


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Test Loss: 1.9510326087474823
F1 Score (weighted): 0.2747776114679362


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=2560.0, style=ProgressStyle(description_wid…


Epoch {epoch}
Training Loss: 0.7352157044588239


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Test Loss: 2.637904095649719
F1 Score (weighted): 0.25306513004651426


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=2560.0, style=ProgressStyle(description_wid…


Epoch {epoch}
Training Loss: 0.4450014728774477


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Test Loss: 3.218305027484894
F1 Score (weighted): 0.25635908004619845

