# Build up the dataset

## Create the Monolingual Data

In [0]:
# Mount Google Drive at /content/drive; later cells read the tweets CSV and
# the embedding files through 'drive/My Drive/...' paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

# Preview the monolingual tweets CSV stored on the mounted Drive.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (the parent of the Drive mount point) — confirm.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns,
# which look like index columns written out by earlier saves — confirm.
df = pd.read_csv('drive/My Drive/tweets_monolingual_5p.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id_str,screen_name,raw_text,chn_text,eng_text,monolingual_text
0,0,1193152895337689088,YXSzzzz,@gww067 Haha there is no way Ryza is not cute 😂,,Haha there is no way Ryza is not cute,Haha there is no way Ryza is not cute
1,1,1193152895337689088,YXSzzzz,乳不平何以平天下!\nLess busty testing. https://t.co/Iy...,乳不平何以平天下!,Less busty testing.,乳不平何以平天下!
2,3,1193152895337689088,YXSzzzz,@HrJasn 乳不巨何以聚人心 (誤,乳不巨何以聚人心誤,,乳不巨何以聚人心誤
3,4,1193152895337689088,YXSzzzz,因為蠻多人問。。。我就統一發個 😂\nBecuz too many people ask ....,因為蠻多人問。。。我就統一發個這個是我自己無聊畫的，不是…,Becuz too many people ask ... I will post thi...,因為蠻多人問。。。我就統一發個這個是我自己無聊畫的，不是…
4,5,1193152895337689088,YXSzzzz,因為蠻多人問。。。我就統一發個 😂\nBecuz too many people ask ....,因為蠻多人問。。。我就統一發個這個是我自己無聊畫的，不是…,Becuz too many people ask ... I will post thi...,Becuz too many people ask ... I will post thi...


## Preprocess and Split the Dataset

In [0]:
!pip3 install polyglot PyICU pycld2 morfessor

Collecting polyglot
[?25l  Downloading https://files.pythonhosted.org/packages/e7/98/e24e2489114c5112b083714277204d92d372f5bbe00d5507acf40370edb9/polyglot-16.7.4.tar.gz (126kB)
[K     |████████████████████████████████| 133kB 2.7MB/s 
[?25hCollecting PyICU
[?25l  Downloading https://files.pythonhosted.org/packages/95/0c/0fb09019efb65a29789ec5538f8e521b8f548da6935a3a474e19fbf2ea4d/PyICU-2.4.2.tar.gz (219kB)
[K     |████████████████████████████████| 225kB 7.9MB/s 
[?25hCollecting pycld2
[?25l  Downloading https://files.pythonhosted.org/packages/21/d2/8b0def84a53c88d0eb27c67b05269fbd16ad68df8c78849e7b5d65e6aec3/pycld2-0.41.tar.gz (41.4MB)
[K     |████████████████████████████████| 41.4MB 211kB/s 
[?25hCollecting morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Building wheels for collected packages: polyglot, PyICU, pycld2
  Building wheel for polyglot (setup.py) ... 

In [0]:
from polyglot.text import Text
import logging

# Raise the root logger's threshold so only ERROR and above are emitted.
# NOTE(review): presumably this is meant to silence polyglot warnings during
# tokenization — confirm.
logging.getLogger().setLevel(logging.ERROR)

def poly_tokenizer(raw_text):
    """Tokenize ``raw_text`` with polyglot.

    Wraps the string in a polyglot ``Text`` object and returns that object's
    ``words`` attribute.
    """
    parsed = Text(raw_text)
    return parsed.words

In [0]:
%%time
from torchtext.data import TabularDataset, Field, LabelField

# Field definitions: tweet text is tokenized with the polyglot-based
# `poly_tokenizer` helper and lower-cased; LABEL holds the class label.
# (Fixed: the tokenizer was referenced as `ploy_tokenizer`, a misspelling that
# raised NameError because no such name is defined.)
TEXT = Field(sequential=True, tokenize=poly_tokenizer, lower=True)
LABEL = LabelField()

# CSV column name -> (attribute name on each example, Field that processes it).
# The author's screen name is the label; the monolingual text is the input.
datafields = {'screen_name': ('label', LABEL), 'monolingual_text': ('text', TEXT)}
tweets_dataset = TabularDataset(path='drive/My Drive/tweets_monolingual_5p.csv', format='csv', fields=datafields)

# Show one parsed example as a sanity check.
vars(tweets_dataset[1])

CPU times: user 10.5 s, sys: 74.6 ms, total: 10.5 s
Wall time: 10.6 s


In [0]:
# Stratified three-way split of the tweets dataset.
# NOTE(review): the recorded output (48420 / 13830 / 6920) corresponds to
# roughly 70% train, 20% valid and 10% test, i.e. the 0.1 and 0.2 entries of
# split_ratio are NOT applied in the order the results are unpacked. torchtext
# appears to read the list as [train, test, valid]; confirm that a 20%
# validation / 10% test allocation is what was intended.
# NOTE(review): no random_state is passed, so the split is presumably not
# reproducible across runs — confirm.
train_data, valid_data, test_data = tweets_dataset.split(split_ratio=[0.7, 0.1, 0.2], stratified=True)

print('Size of train, valid & test=', len(train_data), len(valid_data), len(test_data))

Size of train, valid & test= 48420 13830 6920


## Load Custom Aligned Vectors

In [0]:
from torchtext.vocab import Vocab, Vectors

# Load the two aligned word-vector files (wiki.zh.align.vec and
# wiki.en.align.vec) from the mounted Drive.
chn_vector = Vectors(name='drive/My Drive/embeddings/wiki.zh.align.vec')
eng_vector = Vectors(name='drive/My Drive/embeddings/wiki.en.align.vec')

# Build both vocabularies from the training split only. Both vector sets are
# attached to the text vocabulary.
# NOTE(review): EMBEDDING_DIM is set to 600 further down, which suggests the
# two vector sets are concatenated per token (presumably 300 + 300) — confirm.
TEXT.build_vocab(train_data, vectors=[chn_vector, eng_vector])
LABEL.build_vocab(train_data)

In [0]:
# `torch` and `data` are used below but were not imported by any visible cell
# (`import torch.nn as nn` binds only `nn`), so a fresh kernel stopped here
# with NameError. Import them at the point of first use.
import torch
from torchtext import data

BATCH_SIZE = 32

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split. sort_key is the token count of each example, which
# the iterator uses when ordering/grouping examples into batches.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key=lambda x:len(x.text),
    device = device)

# CNN Definition

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional text classifier over token embeddings.

    Each filter size gets its own ``Conv2d`` spanning the full embedding
    width; the max-over-time pooled outputs of all convolutions are
    concatenated, passed through dropout and projected to ``output_dim``
    logits by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: iterable of kernel heights (tokens covered per filter).
        output_dim: number of output classes.
        dropout: dropout probability applied to the pooled features.
        pad_idx: vocabulary index of the padding token. It is registered as
            the embedding's ``padding_idx`` (so that row receives no gradient)
            and used to right-pad inputs shorter than the largest filter.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        super().__init__()
        self.pad_idx = pad_idx
        # A sequence shorter than the tallest kernel cannot be convolved, so
        # remember the minimum length forward() has to guarantee.
        self.min_len = max(filter_sizes, default=0)
        # Previously pad_idx was accepted but ignored, which let the padding
        # embedding be updated during training.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-index sequences.

        Args:
            text: 2-D integer tensor; assumed to be [seq_len, batch] (it is
                transposed to batch-first before the embedding lookup).

        Returns:
            Tensor of shape [batch, output_dim].
        """
        # -> [batch, seq_len]
        text = text.permute(1, 0)

        # Right-pad batches that are shorter than the tallest kernel; without
        # this the widest convolution raises on short inputs.
        shortfall = self.min_len - text.shape[1]
        if shortfall > 0:
            fill = 0 if self.pad_idx is None else self.pad_idx
            text = F.pad(text, (0, shortfall), value=fill)

        # -> [batch, seq_len, emb_dim] -> [batch, 1, seq_len, emb_dim]
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        # Each conv collapses the embedding axis: [batch, n_filters, seq_len - fs + 1]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole time axis: [batch, n_filters] per filter size
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))

        return self.fc(cat)

In [0]:
# Sizes derived from the vocabularies built earlier.
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Fixed hyperparameters of the convolutional classifier.
EMBEDDING_DIM = 600
N_FILTERS = 100
FILTER_SIZES = [2,3,4,5]
DROPOUT = 0.5

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [0]:
# Seed the embedding layer with the vectors attached to the text vocabulary,
# then blank out the rows of the two special tokens.
pretrained_embeddings = TEXT.vocab.vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data.copy_(pretrained_embeddings)
for _special_idx in (UNK_IDX, PAD_IDX):
    # <unk> first, then <pad>: both start from an all-zero vector.
    model.embedding.weight.data[_special_idx] = torch.zeros(EMBEDDING_DIM)

In [0]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 34,606,250 trainable parameters


# Training

In [0]:
import torch.optim as optim

# Move the model to the target device first, then build the optimizer from
# its parameters: the torch.optim documentation asks for the model to be on
# its final device before the optimizer is constructed.
model = model.to(device)
optimizer = optim.Adam(model.parameters())

# Multi-class cross-entropy over the classifier's logits, on the same device.
criterion = nn.CrossEntropyLoss().to(device)

In [0]:
import time

def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update the model.

    For every batch the gradients are reset, the loss on ``batch.text`` /
    ``batch.label`` is back-propagated and the optimizer takes one step.

    Returns:
        (mean loss per batch, mean accuracy per batch) for the pass.
    """
    model.train()
    total_loss, total_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()
        logits = model(batch.text)
        batch_loss = criterion(logits, batch.label)
        batch_acc = categorical_accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: [batch, n_classes] tensor of class scores.
        y: [batch] tensor of target class indices.

    Returns:
        One-element float tensor holding the fraction of correct predictions,
        placed on the same device as ``preds``.
    """
    # Index of the highest score per row is the predicted class.
    correct = preds.argmax(dim = 1).eq(y)
    # Build the denominator on the predictions' device: the previous
    # torch.FloatTensor([...]) always lived on the CPU, which mismatches the
    # CUDA-resident count when the model runs on a GPU.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=preds.device)
    return correct.sum().float() / batch_size

def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating it.

    The model is switched to eval mode and gradient tracking is disabled
    while the batches are processed.

    Returns:
        (mean loss per batch, mean accuracy per batch).
    """
    model.eval()
    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text)
            running_loss += criterion(logits, batch.label).item()
            running_acc += categorical_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds.

    Returns:
        (minutes, seconds) as integers, both truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [0]:
# Number of full passes over the training iterator.
# NOTE(review): the recorded log below lists epochs 09 and 10 although
# N_EPOCHS is 8, so that output appears to come from an earlier run with a
# different epoch count — re-run to refresh it.
N_EPOCHS = 8

# Lowest validation loss seen so far; starts at +inf so the first epoch always
# produces a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one train pass plus one validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 7s
	Train Loss: 0.452 | Train Acc: 87.52%
	 Val. Loss: 1.286 |  Val. Acc: 70.92%
Epoch: 02 | Epoch Time: 1m 6s
	Train Loss: 0.239 | Train Acc: 93.16%
	 Val. Loss: 1.437 |  Val. Acc: 70.75%
Epoch: 03 | Epoch Time: 1m 6s
	Train Loss: 0.156 | Train Acc: 95.56%
	 Val. Loss: 1.624 |  Val. Acc: 70.44%
Epoch: 04 | Epoch Time: 1m 7s
	Train Loss: 0.116 | Train Acc: 96.60%
	 Val. Loss: 1.790 |  Val. Acc: 69.69%
Epoch: 05 | Epoch Time: 1m 7s
	Train Loss: 0.101 | Train Acc: 97.01%
	 Val. Loss: 1.974 |  Val. Acc: 69.56%
Epoch: 06 | Epoch Time: 1m 6s
	Train Loss: 0.089 | Train Acc: 97.43%
	 Val. Loss: 2.128 |  Val. Acc: 69.44%
Epoch: 07 | Epoch Time: 1m 6s
	Train Loss: 0.079 | Train Acc: 97.74%
	 Val. Loss: 2.277 |  Val. Acc: 69.23%
Epoch: 08 | Epoch Time: 1m 6s
	Train Loss: 0.072 | Train Acc: 97.97%
	 Val. Loss: 2.538 |  Val. Acc: 68.34%
Epoch: 09 | Epoch Time: 1m 7s
	Train Loss: 0.072 | Train Acc: 97.96%
	 Val. Loss: 2.679 |  Val. Acc: 68.95%
Epoch: 10 | Epoch Time: 1m 6

In [0]:
# Restore the checkpoint written by the training loop (the epoch with the
# lowest validation loss) and score it on the held-out test iterator.
model.load_state_dict(torch.load('model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.304 | Test Acc: 70.68%
