# Run models in this notebook

# Google Colab Setup

In [1]:
# Only run this cell if using google 

from google.colab import drive
drive.mount('/content/gdrive')
import os

# Replace the "project" with the name of the project directory in google drive
os.chdir("gdrive/My Drive/Project")

Mounted at /content/gdrive


# Install and Import

In [2]:
!pip3 install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.4MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 23.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 55.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=ee24f3063e67d

In [3]:
# Import modules
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

In [4]:
# Check for either GPU or CPU
# If using colab, then select GPU by doing Runtime -> Change runtime type -> Hardware accelerator -> GPU

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Preprocess Text Data

In [None]:
def remove_useless():
    

In [30]:
# Train data
MAX_LENGTH = 128

train_df = pd.read_json('/content/train.jsonl', lines=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tweets = train_df.response.values
labels = []
for label in train_df.label.values:
    if label == 'SARCASM':
        labels.append(0)
    else:
        labels.append(1)
labels = torch.tensor(labels)

input_ids = []
attention_masks = []

for tweet in tweets:
    encoded = tokenizer.encode_plus(tweet, add_special_tokens=True, max_length=MAX_LENGTH, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
    
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

train_dataset = TensorDataset(input_ids, attention_masks, labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [48]:
# Test data
MAX_LENGTH = 128

test_df = pd.read_json('/content/test.jsonl', lines=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

tweets = test_df.response.values
for tweet in tweets:
    encoded = tokenizer.encode_plus(tweet, add_special_tokens=True, max_length=MAX_LENGTH, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
    
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

test_dataset = TensorDataset(input_ids, attention_masks)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# BERT

In [34]:
# Create BERT Model

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# Train

In [32]:
# Hyperparameters

num_epochs = 10
learning_rate = 1e-4
weight_decay = 0
batch_size = 32

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [33]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

In [35]:
# Train

model.train()

for epoch in range(num_epochs):
    for i, (input_ids, attention_masks, labels) in enumerate(train_loader):
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
        
        output = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)

        optimizer.zero_grad()
        loss = output.loss
        loss.backward()
        optimizer.step()
    print("Epoch: %d" % epoch)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9


# Evaluate

In [57]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1)

In [61]:
# Evaluate
model.eval()

pred = []

with torch.no_grad():
    for i, (input_ids, attention_masks) in enumerate(test_loader):
        input_ids, attention_masks = input_ids.to(device), attention_masks.to(device)
        output = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
        pred.append(torch.argmax(output.logits, 1).tolist())

In [60]:
print(pred)

[tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([1], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0], device='cuda:0'), tensor([0

In [67]:
with open('answer.txt', 'w') as f:
  for i, label in enumerate(pred):
    if label == 1:
      f.write('twitter_' + str(i+1) + ',NOT_SARCASM\n')
    else:
      f.write('twitter_' + str(i+1) + ',SARCASM\n')
