<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/hw2/BOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [146]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook as tqdm
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used by later cells
# all live under this mount point.
drive.mount('/content/drive')

# Pick CUDA when PyTorch reports it as available, otherwise fall back to CPU.
# NOTE(review): none of the visible cells move the model or batches to
# `device`, so as written everything runs on the CPU - confirm intent.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [147]:
class SSTDataset(Dataset):
  """Map-style dataset over a tab-separated file of labelled sentences.

  Only two columns of the file are used: ``sentence`` (the raw text) and
  ``label`` (its class).
  """

  def __init__(self, file):
    """Load ``file`` as TSV and keep references to the two columns used.

    Args:
      file: Path (or any input ``pandas.read_csv`` accepts) of the TSV file.
    """
    frame = pd.read_csv(file, sep='\t')
    self.df = frame
    self.texts = frame['sentence']
    self.labels = frame['label']

  def __len__(self):
    """Return the number of rows that were read from the file."""
    return len(self.df.index)

  def __getitem__(self, index):
    """Return the ``(sentence, label)`` pair stored under ``index``.

    The lookup goes through each column's pandas index, so ``index`` is
    interpreted as a row label of the loaded frame.
    """
    return (self.texts[index], self.labels[index])

In [148]:
# Tokenizer shared by the vocabulary construction here and by the text
# encoding pipeline defined in a later cell.
tokenizer = get_tokenizer('basic_english')

# Tally how often each token occurs across the training sentences only.
train_iter = SSTDataset('/content/drive/MyDrive/train.tsv')
counter = Counter()
for i in range(len(train_iter)):
  line, label = train_iter[i]
  tokens = tokenizer(line)
  counter.update(tokens)

# min_freq=1 keeps every token that was counted at least once.
# NOTE(review): building a Vocab directly from a Counter looks like the legacy
# torchtext interface - confirm the installed torchtext version supports it.
vocab = Vocab(counter, min_freq=1)

In [149]:
def text_pipeline(x):
    """Tokenize the string ``x`` and map each token to its vocabulary id.

    Uses the module-level ``tokenizer`` and ``vocab`` built in an earlier cell.

    Returns:
        A list with one ``vocab[token]`` entry per token, in token order.
    """
    return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
    """Coerce a raw label value to a built-in ``int``.

    Raises:
        ValueError / TypeError: if ``int(x)`` cannot convert the value.
    """
    return int(x)

In [150]:
def collate_batch(batch):
    """Collate ``(text, label)`` samples into the flat layout EmbeddingBag takes.

    Every sample's text is encoded with ``text_pipeline`` and all encodings are
    concatenated into one 1-D tensor; ``offsets`` records where each sample
    starts inside that tensor.

    Args:
        batch: Iterable of ``(text, label)`` pairs.

    Returns:
        A tuple ``(labels, tokens, offsets)`` where ``labels`` is an int64
        tensor with one entry per sample, ``tokens`` is the int64
        concatenation of all encoded texts, and ``offsets`` holds the start
        position of each sample in ``tokens``.
    """
    encoded = []
    for _text, _label in batch:
        label_id = label_pipeline(_label)
        token_ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        encoded.append((label_id, token_ids))

    labels = torch.tensor([label_id for label_id, _ in encoded], dtype=torch.int64)

    # Start positions are the running sum of the preceding lengths: prepend a
    # zero and drop the final length, which would mark the end of the batch.
    boundaries = [0, *(token_ids.size(0) for _, token_ids in encoded)]
    offsets = torch.tensor(boundaries[:-1]).cumsum(dim=0)

    tokens = torch.cat([token_ids for _, token_ids in encoded])
    return labels, tokens, offsets


In [151]:
class TextClassificationModel(torch.nn.Module):
    """Bag-of-words classifier: an ``EmbeddingBag`` followed by one linear layer.

    Layers are referenced through ``torch.nn`` because the notebook imports
    ``torch`` but never binds a bare ``nn`` name.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Build the layers and initialise their weights.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Width of each embedding vector.
            num_class: Number of output logits.
        """
        super().__init__()
        # sparse=True makes the embedding table produce sparse gradients, which
        # only some optimizers accept (the notebook pairs it with SGD).
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = torch.nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        initrange = 0.5
        torch.nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        torch.nn.init.uniform_(self.fc.weight, -initrange, initrange)
        torch.nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return one row of ``num_class`` logits per bag.

        Args:
            text: 1-D tensor of token ids for the whole batch, concatenated.
            offsets: 1-D tensor with the start position of each bag in ``text``.
        """
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [152]:
# Model shape: binary classification over the vocabulary built earlier.
num_class = 2
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class)

# Optimisation hyperparameters.
epochs = 10
LR = 5
# NOTE(review): the saved progress bars show 1053 training and 14 validation
# batches per pass - confirm that matches this batch size on the current data,
# otherwise the recorded outputs come from a different setting.
BATCH_SIZE = 6

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# NOTE(review): the training cell never calls scheduler.step(), so the learning
# rate stays at LR for every epoch; stepping it would decay LR 10x per step.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

train_dataloader = DataLoader(SSTDataset('/content/drive/MyDrive/train.tsv'), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# Validation is not shuffled: the reported loss is a mean of per-batch means,
# so a fixed batch composition keeps the number reproducible between runs.
valid_dataloader = DataLoader(SSTDataset('/content/drive/MyDrive/dev.tsv'), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


In [153]:
# Train for `epochs` passes over the training loader and report the mean
# per-batch loss after each pass.
# NOTE: `scheduler` is deliberately not stepped here, so every epoch runs at
# the initial learning rate (this matches the recorded outputs).
for epoch in range(epochs):
  # Set training mode explicitly so the cell can be re-run safely after an
  # evaluation pass has switched the model to eval mode.
  model.train()
  running_loss = 0.0
  for label, text, offsets in tqdm(train_dataloader):
    optimizer.zero_grad()
    outputs = model(text, offsets)
    loss = criterion(outputs, label)
    loss.backward()
    optimizer.step()
    # .item() detaches the scalar so the graph is not kept alive across batches.
    running_loss += loss.item()
  print('[Epoch %d]\tTrain Loss: \t\t%.3f' % (epoch+1, running_loss / len(train_dataloader)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 1]	Train Loss: 		0.594


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 2]	Train Loss: 		0.330


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 3]	Train Loss: 		0.272


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 4]	Train Loss: 		0.249


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 5]	Train Loss: 		0.230


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 6]	Train Loss: 		0.218


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 7]	Train Loss: 		0.211


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 8]	Train Loss: 		0.202


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 9]	Train Loss: 		0.196


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


[Epoch 10]	Train Loss: 		0.190


In [155]:
# Evaluate on the validation loader: mean per-batch loss and overall accuracy.
correct = 0
total = 0
total_loss = 0

# Run in eval mode for the duration of the pass, then restore whatever mode the
# model was in so a later training cell is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
  for label, text, offsets in tqdm(valid_dataloader):
    outputs = model(text, offsets)
    # Predicted class = index of the largest logit in each row.
    predicted = outputs.argmax(dim=1)
    total += label.size(0)
    correct += (predicted == label).sum().item()
    total_loss += criterion(outputs, label).item()
model.train(was_training)

print('Validation Loss: \t%.3f' % (total_loss / len(valid_dataloader)))
print('Validation Accuracy: \t%.2f %%' % (100 * correct / total))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))


Validation Loss: 	0.477
Validation Accuracy: 	79.93 %
