In [1]:
import torch

torch.cuda.is_available()

True

In [2]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, 2)
        self.softmax = nn.Softmax(dim=1)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, x: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        #print(src)
        #print(f"0: {src.shape}")
        x = self.embedding(x) * math.sqrt(self.d_model)
        #print(f"1: {src.shape}")
        x = self.pos_encoder(x)
        #print(f"2: {src.shape}")
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(x)).to(device)
        x = self.transformer_encoder(x, src_mask)
        #print(f"3: {x.shape}")
        x = self.linear(x)[:,-1]
        # output = self.sigmoid(output)
        #print(f"4: {x.shape}")
        # print(output)
        x = self.softmax(x)
        #print(f"5: {x.shape}")
        return x

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 10_000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [4]:
import pandas as pd

df = pd.read_csv('data/train.csv') #.dropna(axis=1).drop('id', axis=1)
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
import numpy as np

train_iter = df.values.tolist()
train_iter

[[1,
  nan,
  nan,
  'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
  1],
 [4, nan, nan, 'Forest fire near La Ronge Sask. Canada', 1],
 [5,
  nan,
  nan,
  "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
  1],
 [6,
  nan,
  nan,
  '13,000 people receive #wildfires evacuation orders in California ',
  1],
 [7,
  nan,
  nan,
  'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
  1],
 [8,
  nan,
  nan,
  '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
  1],
 [10,
  nan,
  nan,
  '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
  1],
 [13,
  nan,
  nan,
  "I'm on top of the hill and I can see a fire in the woods...",
  1],
 [14,
  nan,
  nan,
  "There's an emergency evacuation happening now in the building across the stree

In [6]:
from torchdata.datapipes.iter import IterableWrapper, FileOpener
import os

def get_data(raw_iter):
    return zip(*[(data[3], int(data[4])) for data in raw_iter])
def yield_X(raw_iter):
    for data in raw_iter:
        yield data[3]

def yield_y(raw_iter):
    for data in raw_iter:
        yield data[4]

def get_name(path_and_stream):
    return os.path.basename(path_and_stream[0]), path_and_stream[1]

def get_csv_dataset(csv_path):
    datapipe1 = IterableWrapper([csv_path])
    datapipe2 = FileOpener(datapipe1, mode="b")
    datapipe3 = datapipe2.map(get_name)
    csv_parser_dp = datapipe3.parse_csv(skip_lines=1)
    train_iter = csv_parser_dp
    X, y = get_data(train_iter)
    return X,y

X, y = get_csv_dataset("data/train.csv")
# X_eval, y_eval = get_csv_dataset("data/test.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


def pad_TextSequence(batch):
  return torch.nn.utils.rnn.pad_sequence(batch,batch_first=True, padding_value=0)

# train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, X), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

def x_data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return pad_TextSequence(data) #torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def data_process(X, y):
    X = x_data_process(X)
    y = torch.tensor(y, dtype=torch.int).unsqueeze(1)
    return X.to(device), y.to(device)

X, y = data_process(X, y)
X, X_eval = X[:6000], X[6000:]
y, y_eval = y[:6000], y[6000:]
# ``train_iter`` was "consumed" by the process of building the vocab,
# so we have to create it again
#train_iter, val_iter, test_iter = WikiText2()
#val_data = data_process(val_iter)
#test_data = data_process(test_iter)
X.shape, y.shape

(torch.Size([6000, 74]), torch.Size([6000, 1]))

In [9]:

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
# train_data = batchify(train_data, batch_size)  # shape ``[seq_len, batch_size]``
#val_data = batchify(val_data, eval_batch_size)
#test_data = batchify(test_data, eval_batch_size)

In [10]:
bptt = 100
def get_batch(X: Tensor, y:Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    seq_len = min(bptt, len(X) - 1 - i)
    data = X[i:i+seq_len]
    target = y[i:i+seq_len]
    return data, target

In [11]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 4  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)



In [12]:
"""

import time

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    #for i in range(10):
    output = model(X)
    output_flat = output.view(-1, ntokens)
    loss = criterion(output_flat, y)

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

    total_loss += loss.item()
    if 1:
        lr = scheduler.get_last_lr()[0]
        ms_per_batch = (time.time() - start_time) * 1000 / log_interval
        cur_loss = total_loss / log_interval
        ppl = math.exp(cur_loss)
        print(f'| epoch {i:3d}'
                f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
        total_loss = 0
        start_time = time.time()"""

"\n\nimport time\n\ncriterion = nn.CrossEntropyLoss()\nlr = 5.0  # learning rate\noptimizer = torch.optim.SGD(model.parameters(), lr=lr)\nscheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)\n\ndef train(model: nn.Module) -> None:\n    model.train()  # turn on train mode\n    total_loss = 0.\n    log_interval = 200\n    start_time = time.time()\n\n    #for i in range(10):\n    output = model(X)\n    output_flat = output.view(-1, ntokens)\n    loss = criterion(output_flat, y)\n\n    optimizer.zero_grad()\n    loss.backward()\n    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)\n    optimizer.step()\n\n    total_loss += loss.item()\n    if 1:\n        lr = scheduler.get_last_lr()[0]\n        ms_per_batch = (time.time() - start_time) * 1000 / log_interval\n        cur_loss = total_loss / log_interval\n        ppl = math.exp(cur_loss)\n        print(f'| epoch {i:3d}'\n                f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '\n                f'loss {cur_

In [13]:
y_one_hot = torch.eye(2)[y.cpu()].squeeze(1).to(device)
y_one_hot

tensor([[0., 1.],
        [0., 1.],
        [0., 1.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]], device='cuda:0')

In [14]:
import time

criterion = nn.MSELoss()
lr = 1.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 50
    start_time = time.time()

    num_batches = len(X) // bptt
    for batch, i in enumerate(range(0, X.size(0) - 1, bptt)):
        data, targets = get_batch(X, y_one_hot, i)
        output = model(data)
        #print(output.shape,targets.shape)
        #print("output:", output.shape)
        #print("target:", targets.shape)
        # output_flat = output.view(-1, ntokens)
        # loss = criterion(output_flat, targets)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()
            
def evaluate(model: nn.Module, X_eval: Tensor, y_eval: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, X_eval.size(0) - 1, bptt):
            data, targets = get_batch(X_eval, y_eval, i)
            seq_len = data.size(0)
            output = model(data)
            total_loss += seq_len * criterion(output, targets).item()
    return total_loss / (len(X_eval) - 1)

In [15]:
best_val_loss = float('inf')
epochs = 50

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model)
        val_loss = evaluate(model, X_eval, y_eval)
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

| epoch   1 |    50/   60 batches | lr 2.0 | ms/batch 33.25 | loss  0.42 | ppl     1.53


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  2.05s | valid loss  0.50 | valid ppl     1.65
-----------------------------------------------------------------------------------------
| epoch   2 |    50/   60 batches | lr 1.9 | ms/batch 29.01 | loss  0.43 | ppl     1.54
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  1.89s | valid loss  0.50 | valid ppl     1.65
-----------------------------------------------------------------------------------------
| epoch   3 |    50/   60 batches | lr 1.805 | ms/batch 36.49 | loss  0.43 | ppl     1.54
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  2.29s | valid loss  0.50 | valid ppl     1.65
-----------------------------------------------------------------------------------------
| epoch   4 |    50/   60 batches | lr 1.7147499999999998 | 

In [None]:
test_loss = evaluate(model, X_eval, y_eval)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)

In [None]:
y_pred = model(X)
y_pred_classes = list(map(lambda x: torch.argmax(x), y_pred))
corrects = [y_p == y_e for y_p, y_e in zip(y_pred_classes, y)]
sum(corrects)/len(y_pred)

In [None]:
y_pred = model(X_eval)
y_pred_classes = list(map(lambda x: torch.argmax(x), y_pred))
corrects = [y_p == y_e for y_p, y_e in zip(y_pred_classes, y_eval)]
sum(corrects)/len(y_pred)