In [1]:
!pip install tweet-preprocessor



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))


Found GPU at: /device:GPU:0


In [4]:
!pip install pytorch-transformers
!pip install transformers



In [5]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import preprocessor as p

from transformers import XLNetTokenizer, XLNetForSequenceClassification, DebertaTokenizer, DebertaForSequenceClassification, RobertaTokenizerFast, RobertaForSequenceClassification, ElectraTokenizer, ElectraForSequenceClassification
from transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline


In [6]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a GPU exists: get_device_name(0) raises on a
# CPU-only runtime, which would defeat the CPU fallback chosen above.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"


'Tesla T4'

In [7]:
df = pd.read_csv("../data/Constraint_Train.csv")
val_df = pd.read_csv("../data/Constraint_Val.csv")
test_df = pd.read_csv("../data/english_test_with_labels.csv")
test_label_df = pd.read_csv('../data/english_test_with_labels.csv')


In [8]:
df.shape, val_df.shape, test_df.shape

((6420, 3), (2140, 3), (2140, 3))

In [9]:
def preprocess(row):
  """Return the row's 'tweet' text after cleaning it with `p.clean`.

  `row` is anything indexable by the key 'tweet' (e.g. a dataframe row).
  Two further clean-up steps from earlier experiments (stripping '\\xa0' and
  removing parenthesised numbers) were commented out and remain disabled.
  """
  return p.clean(row['tweet'])


In [10]:
# df['tweet'] = df.apply(lambda x: preprocess(x), 1)
# val_df['tweet'] = val_df.apply(lambda x: preprocess(x), 1)
# test_df['tweet'] = test_df.apply(lambda x: preprocess(x), 1)


In [11]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [12]:
df.iloc[1, 1]

'States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths. https://t.co/YASGRTT4ux'

In [13]:
test_df.head(2)

Unnamed: 0,id,tweet,label
0,1,Our daily update is published. States reported...,real
1,2,Alfalfa is the only cure for COVID-19.,fake


In [14]:
def map_label(row):
  """Encode a row's 'label' value as a class id: 0 for 'real', 1 for anything else."""
  if row['label'] == 'real':
    return 0
  return 1

# Store the integer class id of every row (see map_label) in a new
# 'label_encoded' column on the two labelled splits.
df['label_encoded'] = df.apply(map_label, axis=1)
val_df['label_encoded'] = val_df.apply(map_label, axis=1)
# test_df['label_encoded'] = test_df.apply(lambda x: map_label(x), 1)


In [15]:
train_sentences = df.tweet.values
train_token_ids = df.id.values
val_sentences = val_df.tweet.values
val_token_ids = val_df.id.values
test_sentences = test_df.tweet.values
test_token_ids = test_df.id.values


In [16]:
# Build the model inputs straight from the dataframes instead of from the
# previous value of *_sentences, so that re-running this cell cannot append
# the marker suffix a second time.
# NOTE(review): the markers are appended at the END of each text; BERT-style
# encoders usually expect [CLS] first. Confirm this matches how the saved
# checkpoints were fine-tuned before changing the layout.
train_sentences = [sentence + " [SEP] [CLS]" for sentence in df.tweet.values]
train_labels = df.label_encoded.values
val_sentences = [sentence + " [SEP] [CLS]" for sentence in val_df.tweet.values]
val_labels = val_df.label_encoded.values
test_sentences = [sentence + " [SEP] [CLS]" for sentence in test_df.tweet.values]


In [17]:
MAX_LEN = 128
batch_size = 64


In [18]:
def _encode_sentences(tokenizer, sentences):
  """Tokenize sentences and convert them to fixed-length id and mask tensors.

  Returns a triple (token_lists, input_ids, attention_masks):
    token_lists:     the raw token list produced for every sentence,
    input_ids:       integer tensor with one row of MAX_LEN ids per sentence,
    attention_masks: float64 tensor of the same shape, 1.0 where the id is > 0.
  """
  token_lists = [tokenizer.tokenize(sent) for sent in sentences]
  id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
  # Cut or right-pad every id sequence to exactly MAX_LEN entries
  # (pad_sequences fills with 0 by default).
  padded_ids = pad_sequences(id_lists, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
  # 1.0 for every positive id, 0.0 otherwise, i.e. id 0 is treated as padding.
  # NOTE(review): this assumes id 0 never denotes a real token in the chosen
  # vocabulary; confirm for the non-BERT-style tokenizers.
  masks = (padded_ids > 0).astype(np.float64)
  return token_lists, torch.tensor(padded_ids), torch.tensor(masks)


def _build_loader(sampler_cls, *tensors):
  """Wrap tensors in a TensorDataset and return a DataLoader over it using
  `sampler_cls` and the notebook-level `batch_size`."""
  dataset = TensorDataset(*tensors)
  return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=batch_size)


def get_dataloader(network, train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids):
  """Create train / validation / test DataLoaders for one pretrained network.

  Args:
    network: key selecting the pretrained vocabulary; one of "XLNET", "ROBERT",
      "XLM-ROBERT", "DeBERTa", "Electra". Any other key raises KeyError.
    train_sentences, val_sentences, test_sentences: iterables of text.
    train_labels, val_labels: label arrays for the train / validation texts.
    train_token_ids, val_token_ids, test_token_ids: per-example ids (read from
      the dataframes' `id` column by the caller); they are emitted as the
      first tensor of every batch.

  Returns:
    (train_dataloader, validation_dataloader, test_dataloader). Train batches
    are drawn with a RandomSampler, the other two with a SequentialSampler.
    Train / validation batches are (ids, input_ids, masks, labels); test
    batches are (ids, input_ids, masks).

  Side effect: prints the tokens of the first training sentence.
  """
  dic = {"XLNET": "xlnet-base-cased", "ROBERT": "roberta-base", "XLM-ROBERT": "xlm-roberta-base", "DeBERTa": "microsoft/deberta-base", "Electra": "google/electra-base-discriminator"}
  # Every key without a dedicated tokenizer class falls back to RobertaTokenizerFast.
  # NOTE(review): that fallback also serves "XLM-ROBERT"; confirm that
  # RobertaTokenizerFast is the right class for the xlm-roberta-base vocabulary.
  tokenizer_classes = {"XLNET": XLNetTokenizer, "DeBERTa": DebertaTokenizer, "Electra": ElectraTokenizer}
  tokenizer_cls = tokenizer_classes.get(network, RobertaTokenizerFast)
  tokenizer = tokenizer_cls.from_pretrained(dic[network], do_lower_case=True)

  tokenized_train_texts, train_inputs, train_masks = _encode_sentences(tokenizer, train_sentences)
  print ("Tokenize the first sentence:")
  print (tokenized_train_texts[0])

  _, validation_inputs, validation_masks = _encode_sentences(tokenizer, val_sentences)
  _, test_inputs, test_masks = _encode_sentences(tokenizer, test_sentences)

  train_label_tensor = torch.tensor(train_labels)
  validation_label_tensor = torch.tensor(val_labels)

  train_dataloader = _build_loader(RandomSampler, torch.tensor(train_token_ids), train_inputs, train_masks, train_label_tensor)
  validation_dataloader = _build_loader(SequentialSampler, torch.tensor(val_token_ids), validation_inputs, validation_masks, validation_label_tensor)
  test_dataloader = _build_loader(SequentialSampler, torch.tensor(test_token_ids), test_inputs, test_masks)

  return train_dataloader, validation_dataloader, test_dataloader


In [19]:
# train_dataloader_XLNET, val_dataloader_XLNET, test_dataloader_XLNET = get_dataloader("XLNET", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


In [20]:
# train_dataloader_ROBERT, val_dataloader_ROBERT, test_dataloader_ROBERT = get_dataloader("ROBERT", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


In [21]:
# train_dataloader_XLM_ROBERT, val_dataloader_XLM_ROBERT, test_dataloader_XLM_ROBERT = get_dataloader("XLM-ROBERT", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


In [22]:
# train_dataloader_DeBERTa, val_dataloader_DeBERTa, test_dataloader_DeBERTa = get_dataloader("DeBERTa", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


In [23]:
train_dataloader_Electra, val_dataloader_Electra, test_dataloader_Electra = get_dataloader("Electra", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Tokenize the first sentence:
['the', 'cdc', 'currently', 'reports', '99', '##0', '##31', 'deaths', '.', 'in', 'general', 'the', 'disc', '##re', '##pan', '##cies', 'in', 'death', 'counts', 'between', 'different', 'sources', 'are', 'small', 'and', 'ex', '##pl', '##ica', '##ble', '.', 'the', 'death', 'toll', 'stands', 'at', 'roughly', '1000', '##00', 'people', 'today', '.', '[SEP]', '[CLS]']


In [24]:
# model1 = RobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
# directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# model1.load_state_dict(torch.load(directory_path+'/XLM-ROBERTa_base_preprocess_link_v1.ckpt'))
# model1.eval()
# model1.cuda()


In [25]:
# model2 = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
# directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# model2.load_state_dict(torch.load(directory_path+'/ROBERTa_base_preprocess_v2.ckpt'))
# model2.eval()
# model2.cuda()


In [26]:
# model3 = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
# directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# model3.load_state_dict(torch.load(directory_path+'/XLNet_base_cased_v2.ckpt'))
# model3.eval()
# model3.cuda()


In [27]:
# model4 = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)
# directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# model4.load_state_dict(torch.load(directory_path+'/DeBERTa_base_preprocess_link_v1.ckpt'))
# model4.eval()
# model4.cuda()



In [28]:
model5 = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=2)
directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# map_location lets the checkpoint be restored on whichever device this runtime
# selected, instead of requiring the device it was saved from.
model5.load_state_dict(torch.load(directory_path+'/Electra_base_0.973414_preprocess_model.ckpt', map_location=device))
# Inference only: switch to evaluation mode before predicting.
model5.eval()
# Move to the selected device rather than hard-coding .cuda(), so the cell also
# runs on a CPU-only runtime.
model5.to(device)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440343552.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [29]:
def get_model_preds_labels(model, dataloader, mode='train'):
  """Run `model` over `dataloader` without gradients and collect class probabilities.

  Args:
    model: callable taking (input_ids, token_type_ids=None, attention_mask=...)
      and returning an object whose `.logits` is a (batch, classes) tensor.
    dataloader: iterable of tensor tuples; (ids, input_ids, mask) when
      mode == 'test', otherwise (ids, input_ids, mask, labels).
    mode: 'test' for unlabelled batches; any other value expects labels.

  Returns:
    mode == 'test': (data_vectors, ids)
    otherwise:      (data_vectors, labels, ids)
    `data_vectors` holds one softmax probability list per example; `ids` and
    `labels` are plain Python lists in the same order.
  """
  data_vectors = []
  labels = []
  ids = []

  # Send every batch to the device holding the model's weights, so the function
  # does not depend on a notebook-level `device` variable. A model without
  # parameters leaves the batches where they are.
  parameters = getattr(model, 'parameters', None)
  first_param = next(parameters(), None) if callable(parameters) else None
  target_device = first_param.device if first_param is not None else None

  with torch.no_grad():
    for batch in dataloader:
      if target_device is not None:
        batch = tuple(t.to(target_device) for t in batch)

      if mode == 'test':
        token_ids, b_input_ids, b_input_mask = batch
        b_labels = None
      else:
        token_ids, b_input_ids, b_input_mask, b_labels = batch

      # Forward pass
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      # Normalise over the class axis explicitly; calling softmax without `dim`
      # relies on a deprecated implicit choice and emits a warning.
      probabilities = F.softmax(outputs.logits, dim=-1)

      data_vectors.extend(probabilities.tolist())
      ids.extend(token_ids.tolist())
      if b_labels is not None:
        labels.extend(b_labels.tolist())

  if mode == 'test':
    return data_vectors, ids
  return data_vectors, labels, ids


In [30]:
# m1, lb, train_ids1 = get_model_preds_labels(model1, train_dataloader_XLM_ROBERT)

In [31]:
# m2, _, train_ids2 = get_model_preds_labels(model2, train_dataloader_ROBERT)

In [32]:
# m3, _, train_ids3 = get_model_preds_labels(model3, train_dataloader_XLNET)

In [33]:
# m4, _, train_ids4 = get_model_preds_labels(model4, train_dataloader_DeBERTa)

In [34]:
m5, _, train_ids5 = get_model_preds_labels(model5, train_dataloader_Electra)



In [35]:
# A probability vector whose largest entry sits at index 0 is reported as
# "real"; every other case is reported as "fake".
pred_labels = ["real" if np.argmax(probabilities) == 0 else "fake" for probabilities in m5]
# Attach the predictions to the gold labels through the example id.
train_pred_df = df[["id", "label"]].merge(
    pd.DataFrame({'id': train_ids5, 'predicted_label': pred_labels}),
    on='id',
    how='left',
)
train_pred_df.head()

Unnamed: 0,id,label,predicted_label
0,1,real,real
1,2,real,real
2,3,fake,fake
3,4,real,real
4,5,real,real


In [36]:
train_pred_df.shape

(6420, 3)

In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score


In [38]:
confusion_matrix(train_pred_df['label'].values, train_pred_df['predicted_label'].values)

array([[2887,  173],
       [   0, 3360]])

In [39]:
accuracy_score(train_pred_df['label'].values, train_pred_df['predicted_label'].values)

0.9730529595015577

In [40]:
# NOTE(review): with average='micro' on this single-label task, precision,
# recall and F1 all equal accuracy (the recorded outputs of these metric cells
# are identical). Per-class figures need average='binary' with an explicit
# pos_label, or average=None.
precision_score(train_pred_df['label'].values, train_pred_df['predicted_label'].values, average='micro')

0.9730529595015577

In [41]:
recall_score(train_pred_df['label'].values, train_pred_df['predicted_label'].values, average='micro')

0.9730529595015577

In [42]:
f1_score(train_pred_df['label'].values, train_pred_df['predicted_label'].values, average='micro')

0.9730529595015577

In [43]:
# v1, v_lb, v_ids1 = get_model_preds_labels(model1, val_dataloader_XLM_ROBERT)

In [44]:
# v2, _, v_ids2 = get_model_preds_labels(model2, val_dataloader_ROBERT)

In [45]:
# v3, _, v_ids3 = get_model_preds_labels(model3, val_dataloader_XLNET)

In [46]:
# v4, _, v_ids4 = get_model_preds_labels(model4, val_dataloader_DeBERTa)

In [47]:
v5, _, v_ids5 = get_model_preds_labels(model5, val_dataloader_Electra)



In [48]:
# A probability vector whose largest entry sits at index 0 is reported as
# "real"; every other case is reported as "fake".
val_pred_labels = ["real" if np.argmax(probabilities) == 0 else "fake" for probabilities in v5]
# Attach the predictions to the gold labels through the example id.
val_pred_df = val_df[["id", "label"]].merge(
    pd.DataFrame({'id': v_ids5, 'predicted_label': val_pred_labels}),
    on='id',
    how='left',
)
val_pred_df.head()



Unnamed: 0,id,label,predicted_label
0,1,fake,fake
1,2,fake,real
2,3,fake,fake
3,4,fake,fake
4,5,real,real


In [49]:
confusion_matrix(val_pred_df['label'].values, val_pred_df['predicted_label'].values)

array([[ 913,  107],
       [   5, 1115]])

In [50]:
accuracy_score(val_pred_df['label'].values, val_pred_df['predicted_label'].values)

0.9476635514018692

In [51]:
# NOTE(review): with average='micro' on this single-label task, precision,
# recall and F1 all equal accuracy (the recorded outputs of these metric cells
# are identical). Per-class figures need average='binary' with an explicit
# pos_label, or average=None.
precision_score(val_pred_df['label'].values, val_pred_df['predicted_label'].values, average='micro')

0.9476635514018692

In [52]:
recall_score(val_pred_df['label'].values, val_pred_df['predicted_label'].values, average='micro')

0.9476635514018692

In [53]:
f1_score(val_pred_df['label'].values, val_pred_df['predicted_label'].values, average='micro')

0.9476635514018692

In [54]:
val_pred_df.shape

(2140, 3)

In [55]:
# t1, t_ids1 = get_model_preds_labels(model1, test_dataloader_XLM_ROBERT, 'test')

In [56]:
# t2, t_ids2 = get_model_preds_labels(model2, test_dataloader_ROBERT, 'test')

In [57]:
# t3, t_ids3 = get_model_preds_labels(model3, test_dataloader_XLNET, 'test')

In [58]:
# t4, t_ids4 = get_model_preds_labels(model4, test_dataloader_DeBERTa, 'test')

In [59]:
t5, t_ids5 = get_model_preds_labels(model5, test_dataloader_Electra, 'test')

  


In [60]:
# A probability vector whose largest entry sits at index 0 is reported as
# "real"; every other case is reported as "fake".
test_pred_labels = ["real" if np.argmax(probabilities) == 0 else "fake" for probabilities in t5]
# Attach the predictions to the gold labels through the example id.
test_pred_df = test_label_df[["id", "label"]].merge(
    pd.DataFrame({'id': t_ids5, 'predicted_label': test_pred_labels}),
    on='id',
    how='left',
)
test_pred_df.head()



Unnamed: 0,id,label,predicted_label
0,1,real,real
1,2,fake,fake
2,3,fake,fake
3,4,real,real
4,5,real,real


In [61]:
test_pred_df.shape


(2140, 3)

In [62]:
confusion_matrix(test_pred_df['label'].values, test_pred_df['predicted_label'].values)

array([[ 924,   96],
       [   5, 1115]])

In [63]:
accuracy_score(test_pred_df['label'].values, test_pred_df['predicted_label'].values)

0.952803738317757

In [64]:
# NOTE(review): with average='micro' on this single-label task, precision,
# recall and F1 all equal accuracy (the recorded outputs of these metric cells
# are identical). Per-class figures need average='binary' with an explicit
# pos_label, or average=None.
precision_score(test_pred_df['label'].values, test_pred_df['predicted_label'].values, average='micro')

0.952803738317757

In [65]:
recall_score(test_pred_df['label'].values, test_pred_df['predicted_label'].values, average='micro')

0.952803738317757

In [66]:
f1_score(test_pred_df['label'].values, test_pred_df['predicted_label'].values, average='micro')

0.952803738317757