<a href="https://colab.research.google.com/github/gandharvsuri/Real-or-Not-/blob/master/Real_or_Not%3F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
!pip install transformers



In [50]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch
import tensorflow as tf

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [51]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [52]:
#Load Data

train_data = pd.read_csv('/content/drive/My Drive/Real or Not?/train.csv')
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [53]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [54]:
train_data["keyword"].fillna(value = "NotKnown",  inplace = True)
train_data["location"].fillna(value = "NotKnown",  inplace = True)

In [55]:
print(train_data.shape[0])

7613


In [56]:
text = train_data.text.values
target = train_data.target.values

In [57]:
from transformers import BertTokenizer

tokenizer =   BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case = True)

In [58]:
#testing out the tokenizer

print("Original text : ",text[0])
print("Tokenized : ",tokenizer.tokenize(text[0]))
print("Tok ids : ", tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[0])))

Original text :  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Tokenized :  ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']
Tok ids :  [2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035]


In [59]:
#Convert to BERT format

input_ids = []
for tweet in text:
  encoded_sent = tokenizer.encode(
      tweet,
      add_special_tokens = True,
      max_length = 200,
      pad_to_max_length = True,
      truncation = True
      #return_tensors = 'pt'
  )

  input_ids.append(encoded_sent)
  
print("Original tweet : ",text[0])
print("Encoded : ",input_ids[0])

Original tweet :  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Encoded :  [101, 2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [60]:
attention_masks = []

for tweet in input_ids:
  att_mask = [int(token_id>0) for token_id in tweet]
  attention_masks.append(att_mask)

In [61]:
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids,target,random_state = 2018, test_size = 0.1)

train_masks, val_masks, _, _ = train_test_split(attention_masks,target,random_state = 2018,test_size = 0.1)

In [62]:
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)


In [63]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)


In [90]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [65]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [66]:
from transformers import get_linear_schedule_with_warmup

epochs = 4

total_steps = len(train_dataloader)*epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7fd7ca8883c8>

In [67]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [35]:
import random 

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_val = []
min_val_acc = 0.0

for e in range(epochs):

  #+++++++++++++++++++++++++++++++#
        #TRAINING LOOP#
  #+++++++++++++++++++++++++++++++#
  print("Running Training......")
  print("======Epoch {:}/{:}======".format(e+1,epochs))

  running_loss = 0.0

  model.train()

  for step,batch in enumerate(train_dataloader):

    b_input_ids = batch[0].to(device)
    b_input_masks = batch[1].to(device)
    b_labels = batch[2].to(device)

    model.zero_grad()
    optimizer.zero_grad()

    output = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks, labels = b_labels)

    loss = output[0]

    running_loss += loss.item()
    loss.backward()

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()

    scheduler.step()

  avg_training_loss = running_loss/len(train_dataloader)

  loss_val.append(avg_training_loss)

  print("Training Loss : {:}".format(avg_training_loss))
  print("Running Validation.....")


  #++++++++++++++++++++++++++++++#
  #       VALIDATION
  #++++++++++++++++++++++++++++++#
  val_acc = 0.0
  model.eval()

  for batch in val_dataloader:

    b_input_ids = batch[0].to(device)
    b_input_masks = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
      output = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks)

    logits = output[0]
    
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
        
    # Calculate the accuracy for this batch of test sentences.
    val_acc += flat_accuracy(logits,label_ids)
 
  print("Validation accuracy : {:}".format(val_acc/len(val_dataloader)))

  print("")

  if(val_acc > min_val_acc):
    min_val_loss = val_loss;
    print("Saving Model.....")
    torch.save(model.state_dict(),'/content/drive/My Drive/Real or Not?/model.pt')

Running Training......
Training Loss : 0.04418663308132952
Running Validation.....
Validation accuracy : 0.8255208333333334

Saving Model.....
Running Training......
Training Loss : 0.22215062920693396
Running Validation.....
Validation accuracy : 0.8247395833333333

Saving Model.....
Running Training......
Training Loss : 0.22657466982871224
Running Validation.....
Validation accuracy : 0.8255208333333334

Saving Model.....
Running Training......
Training Loss : 0.22796820802071363
Running Validation.....
Validation accuracy : 0.8247395833333333

Saving Model.....


In [91]:
model.load_state_dict(torch.load('/content/drive/My Drive/Real or Not?/model.pt'))

<All keys matched successfully>

In [75]:
test_data = pd.read_csv('/content/drive/My Drive/Real or Not?/test.csv')
sample_submission = pd.read_csv('/content/drive/My Drive/Real or Not?/sample_submission.csv')
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [70]:
test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [84]:
test_text = test_data.text.values

test_input_ids = []

for tweet in test_text:
  encoded_sent = tokenizer.encode(
      tweet,
      add_special_tokens = True,
      max_length = 200,
      pad_to_max_length = True,
      truncation = True
      #return_tensors = 'pt'
  )

  test_input_ids.append(encoded_sent)


test_attention_masks = []

for tweet in test_input_ids:
  att_mask = [int(token_id>0) for token_id in tweet]
  test_attention_masks.append(att_mask)

In [80]:
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [85]:
pred_inputs = torch.tensor(test_input_ids)
pred_masks = torch.tensor(test_attention_masks)
labels = np.array(sample_submission["target"])
pred_labels = torch.tensor(labels)

batch_size = 16

pred_data = TensorDataset(pred_inputs, pred_masks, pred_labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler = pred_sampler, batch_size = batch_size)

In [93]:
preds = []
true_labels = []

model.eval()

for batch in pred_dataloader:
  b_input_ids = batch[0].to(device)
  b_input_masks = batch[1].to(device)
  b_labels = batch[2].to(device)

  with torch.no_grad():
    output = model(b_input_ids, token_type_ids = None, attention_mask = b_input_masks)

    logits = output[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    preds.append(logits)
    true_labels.append(label_ids)



In [97]:
flat_predictions = [item for sublist in preds for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

In [98]:
flat_predictions

array([1, 1, 1, ..., 1, 1, 1])

In [99]:
sample_submission['target'] = flat_predictions
sample_submission.to_csv('/content/drive/My Drive/Real or Not?/submission.csv', index = False)