In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/

/content/drive/MyDrive


In [3]:
!pip install fastText

Collecting fastText
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l[K     |████▊                           | 10 kB 23.3 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 25.4 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 12.9 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 9.8 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 3.5 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.8.1-py2.py3-none-any.whl (208 kB)
Building wheels for collected packages: fastText
  Building wheel for fastText (setup.py) ... [?25l[?25hdone
  Created wheel for fastText: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3121271 sha256=34dee338d0509444b432302480baf47d02c49e8b84d5110ceb7aefdf42626246
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a659

In [4]:
import fasttext

# Load Data

In [5]:
import json
import pandas as pd

def load_jsonl(fname):
    fin = open(fname, encoding="utf-8")
    data = []
    for line in fin:
        d = json.loads(line.strip())
        data.append(d)

    return data

def save_jsonl(data, filename):
    with open(filename, "w", encoding="utf-8") as fo:
        for idx, d in enumerate(data):
            fo.write(json.dumps(d, ensure_ascii=False))
            fo.write("\n")

In [6]:
DIR = "Mispelling/misspelling-semantics/Datasets/"
traindata = load_jsonl(f"{DIR}/WisesightSentiment/tokenized_train.jsonl")
validdata = load_jsonl(f"{DIR}/WisesightSentiment/tokenized_valid.jsonl")
testdata = load_jsonl(f"{DIR}/WisesightSentiment/tokenized_test-misp.jsonl")


In [7]:
import itertools
def filterByMode(data, mode=None):
  output = []
  for sent in data:
    if mode is None:
      tokenized = [seg[0] for seg in sent["segments"]]
    elif mode=="corr":
      tokenized = [seg[1] for seg in sent["segments"]]
      if len(sent["misp_tokens"])==0:
        continue
    else:
      tokenized = [seg[0] for seg in sent["segments"]]
      if len(sent["misp_tokens"])==0:
        continue
    
    tokenized = list(itertools.chain(*tokenized))
  
    output.append({
        "category": sent["category"],
        "text": sent["text"],
        "tokenized": tokenized,
        "segments": sent["segments"]
    })

  return output

traindata
validdata
allTestdata = filterByMode(testdata)
corrTestdata = filterByMode(testdata, "corr")
mispTestdata = filterByMode(testdata, "misp")
len(allTestdata), len(corrTestdata), len(mispTestdata)

(2671, 880, 880)

# Create LSTM

In [8]:
import numpy as np
import random
import torch
import os

seed = 0
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)    
np.random.seed(seed)
np.random.RandomState(seed)

torch.manual_seed(seed) 
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed) #seed all gpus    
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = False  
torch.backends.cudnn.benchmark = False


In [9]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [10]:
!pip install -q pythainlp

[K     |████████████████████████████████| 11.0 MB 4.4 MB/s 
[K     |████████████████████████████████| 743 kB 50.6 MB/s 
[?25h

In [11]:
import pythainlp

In [12]:
import os
import time
import torch
import torch.optim as optim
import torch.nn as nn

import os
import sys
from argparse import ArgumentParser

criterion = nn.CrossEntropyLoss()


def get_args():
    parser = ArgumentParser(description='LSTM')
#     parser.add_argument('mode', type=str, help = 'tokenizing mode ')
    parser.add_argument('--epochs', type=int, default=50, help = 'epochs')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--d_embed', type=int, default=100)
    parser.add_argument('--lr', type=float, default=.001)
    parser.add_argument('--dev_every', type=int, default=100)
    parser.add_argument('--dp_ratio', type=int, default=0.2)
    parser.add_argument('--save_path', type=str, default='results', help='path to save the model')
    
    try:
        args = parser.parse_args([])
    except:
        parser.print_help()
        sys.exit(1)

    return args

In [13]:
               
args = get_args()
args.epochs = 50
args.batch_size = 64
args.dev_every = 50

In [14]:
wv = fasttext.load_model(f"{DIR}/../Models/cc.th.300.bin")
# wv = fasttext.load_model(f"Mispelling/Models/fasttext_orcl.bin")



In [15]:
"DONE"

'DONE'

### Build Datasets

In [16]:
from torchtext.legacy import data
import torchtext.vocab as vocab

from tqdm.notebook import tqdm

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Baseline Datasets

In [18]:
LABELS = {
    "neg": 2,
    "neu": 1,
    "pos": 0,
    "q": 1 # used to be 3
}

def removeQuestion(label):    
  return LABELS[label]

# TEXT = data.Field(sequential=True, lower=False)
CATEGORY = data.Field(sequential=False, use_vocab=False, preprocessing=removeQuestion)
TOKEN = data.Field(sequential=True, lower=False)

raw_datasets = {
    "train": traindata,
    "validation": validdata,
    "test": allTestdata,
    "test-corr": corrTestdata,
    "test-misp": mispTestdata
}

raw_fields = [
    # ('text', TEXT), 
    ('category', CATEGORY),
    ('tokenized', TOKEN)
]

fields = {}
for f in raw_fields:
  fields[f[0]] = f

datasets = {}
for k in raw_datasets:  
  examples = [data.Example.fromdict(d, fields=fields) for d in raw_datasets[k]]
  d = data.Dataset(examples, fields=raw_fields)
  datasets[k] = d


#### MAE Datasets

In [19]:
# 

In [20]:
MC = load_jsonl(f"{DIR}/../test_mispelling_correction.jsonl")[0]
def createMAEDataset(data, pre_segmented=False, mode=None, mst=False):
  output = []
  cnt, mstcnt = 0, 0

  segIdx = 0
  if mode=="corr":
    segIdx = 1  # ignore misspelling with MC

  for sent in data:
    newtokens = []
    misptokens = []
    if (mode=="misp" or mode=="corr") and len(sent["misp_tokens"])==0:
        continue

    if pre_segmented:
      for seg in sent["segments"]:
        for token in zip(seg[0], seg[1]):
          newtokens.append(token[segIdx])
          misptokens.append(token[1])
          if mst:
            msttokens = additionalToken(token[segIdx])
            newtokens += msttokens
            misptokens += msttokens

          if token[0]!=token[1]:
            mstcnt += 1
          
    else:
      
      for token in sent["tokenized"]:
        w = norm_word(token)
        detectedMsp = (len(w) >= 4) and (w in MC)
        if detectedMsp:
            corr, mint = MC[w]
            misptokens.append(corr)
            if mode=="corr":
              token = corr
            mstcnt += 1
        else:
          misptokens.append(token)
        newtokens.append(token)

        if detectedMsp and mst:
          msttokens = additionalToken(token)
          newtokens += msttokens
          misptokens += msttokens


    cnt += len(newtokens)

    output.append({
        "category": sent["category"],
        "text": sent["text"],
        "tokenized": newtokens,
        "misp": misptokens
    })

  print(f"#Misp Tokens: {mstcnt} tokens; {(mstcnt)*100/cnt:.2f}%")
  return output

mae_raw_datasets = {
    "test": createMAEDataset(testdata, pre_segmented=True),
    "test-corr": createMAEDataset(testdata, pre_segmented=True, mode="corr"),
    "test-misp": createMAEDataset(testdata, pre_segmented=True, mode="misp"),
}

mae_raw_fields = [
    ('category', CATEGORY),
    ('tokenized', TOKEN),
    ('misp', TOKEN),
]

mae_fields = {}
for f in mae_raw_fields:
  mae_fields[f[0]] = f

print()
MAEdatasets = {}
for k in mae_raw_datasets:  
  print(f"Processed: {k}")
  examples = [data.Example.fromdict(d, fields=mae_fields) for d in mae_raw_datasets[k]]
  d = data.Dataset(examples, fields=mae_raw_fields)
  MAEdatasets[k] = d


#Misp Tokens: 1213 tokens; 1.67%
#Misp Tokens: 1213 tokens; 5.60%
#Misp Tokens: 1213 tokens; 5.60%

Processed: test
Processed: test-corr
Processed: test-misp


#### MST Datasets

In [21]:
from itertools import groupby
MD = load_jsonl(f"{DIR}/../train_mispelling_dection.jsonl")[0]

def norm_word(word):
    groups = [list(s) for _, s in groupby(word)]
    ch = []
    extraToken = ""
    for g in groups:
        if len(g)>=3:
            if g[0]=="5":
              extraToken = "<lol>"
            else:
              extraToken = "<rep>"
            ch.append(g[0])  
        else:
            ch += g
    word = "".join(ch)+extraToken
    return word

def additionalToken(word):
  tokens = []
  w = norm_word(word)
  if "<lol>" in w:
    tokens.append("<lol>")
  elif "<rep>" in w:
    tokens.append("<rep>")
  elif w in MD:
      corr, mint = MD[w]
      if mint:
        tokens.append("<int>")
      else:
        tokens.append("<msp>")
  return tokens

def addMSTTokens(data, pre_segmented=False):
  output = []
  cnt, mstcnt = 0, 0
  for sent in data:
    newtokens = []
    if pre_segmented:
      for seg in sent["segments"]:
        for token in zip(seg[0], seg[1]):
          newtokens.append(token[0])
          if token[0]==token[1]:
            continue
          
          newtokens += additionalToken(token[0])
    else:
      for token in sent["tokenized"]:
        newtokens.append(token)
        # if len(w) < 4:
        #   continue
        newtokens += additionalToken(token[0])

    cnt += len(sent["tokenized"])
    mstcnt += len(newtokens)

    output.append({
        "category": sent["category"],
        "text": sent["text"],
        "tokenized": newtokens,
    })

  print(f"#New MST Tokens: {mstcnt - cnt} tokens; {(mstcnt - cnt)*100/cnt:.2f}%")
  return output

In [22]:
MSTdatasets = {}
for k in raw_datasets:
  print(f"Processed: {k}")
  raw = addMSTTokens(raw_datasets[k], pre_segmented=k.startswith("test"))  #only pre-segmented in test set
  examples = [data.Example.fromdict(d, fields=fields) for d in raw]
  d = data.Dataset(examples, fields=raw_fields)
  MSTdatasets[k] = d
  print("")

Processed: train
#New MST Tokens: 19356 tokens; 3.18%

Processed: validation
#New MST Tokens: 2147 tokens; 3.19%

Processed: test
#New MST Tokens: 616 tokens; 0.85%

Processed: test-corr
#New MST Tokens: 616 tokens; 2.84%

Processed: test-misp
#New MST Tokens: 616 tokens; 2.84%



In [23]:
# Both

#### MAE+MST

In [24]:
both_raw_datasets = {
    "test": createMAEDataset(testdata, pre_segmented=True, mst=True),
    "test-corr": createMAEDataset(testdata, pre_segmented=True, mode="corr", mst=True),
    "test-misp": createMAEDataset(testdata, pre_segmented=True, mode="misp", mst=True),
}
print()
BOTHdataset = {}
for k in both_raw_datasets:  
  print(f"Processed: {k}")
  examples = [data.Example.fromdict(d, fields=mae_fields) for d in both_raw_datasets[k]]
  d = data.Dataset(examples, fields=mae_raw_fields)
  BOTHdataset[k] = d


#Misp Tokens: 1213 tokens; 1.63%
#Misp Tokens: 1213 tokens; 5.50%
#Misp Tokens: 1213 tokens; 5.37%

Processed: test
Processed: test-corr
Processed: test-misp


In [25]:
W2V_WINDOW = 5 
W2V_MIN_COUNT = 0

# TEXT.build_vocab(datasets["train"], min_freq=W2V_MIN_COUNT, )
TOKEN.build_vocab(datasets["train"], datasets["validation"], datasets["test"], datasets["test-corr"], MSTdatasets["test"], min_freq=W2V_MIN_COUNT, )
CATEGORY.build_vocab(datasets["train"])

In [26]:
"#Token",len(TOKEN.vocab)

('#Token', 44697)

In [27]:
CATEGORY.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7f58c88912d0>>,
            {0: 3, 1: 1, 2: 2, '<unk>': 0})

In [28]:
import fasttext

# ref: https://medium.com/@rohit_agrawal/using-fine-tuned-gensim-word2vec-embeddings-with-torchtext-and-pytorch-17eea2883cd
def set_wv_vectors(field, vectors, debug=False):
    W2V_SIZE = vectors.get_dimension()
    
    words = vectors.get_words()
    vocab_size = len(words)
    word2vec_vectors = []
    for token, idx in tqdm(field.vocab.stoi.items()):
        if idx==0:
            word2vec_vectors.append(torch.zeros(W2V_SIZE))
            continue
            
        word2vec_vectors.append(torch.FloatTensor(vectors[token]))

    field.vocab.set_vectors(field.vocab.stoi, word2vec_vectors, W2V_SIZE)

In [29]:
set_wv_vectors(TOKEN, wv)

  0%|          | 0/44697 [00:00<?, ?it/s]

In [30]:
del wv

In [31]:
import gc
gc.collect()

119

In [32]:
train_iter, validation_iter, test_iter = data.BucketIterator.splits(
    (datasets["train"], datasets["validation"], datasets["test"]), 

    batch_size=args.batch_size, 
    
    # Sort all examples in data using `sort_key`.
    sort=True,
    sort_key=lambda x: len(x.tokenized),
    sort_within_batch=False,
    shuffle=True,
    
    device=device)

In [33]:
def evaluate(loader, model, return_pred=False):
    model.eval()
    loader.sort = False
    loader.sort_within_batch = False
    loader.init_epoch()

    # calculate accuracy on validation set
    n_correct, n = 0, 0
    losses = []
    answers = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            answer = model(batch)
            answers.append((answer, batch.category))
            n_correct += (torch.max(answer, 1)[1].view(batch.category.size()) == batch.category).sum().item()
            n += answer.shape[0]
            loss = criterion(answer, batch.category)
            losses.append(loss.data.cpu().numpy())
    acc = 100. * n_correct / n
    loss = np.mean(losses)
    
    if not return_pred:
        return acc, loss
    
    
    predict = torch.cat([a for a,_ in answers])
    labels = torch.cat([a for _,a in answers])
    return acc, loss, predict, labels

In [34]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_out, d_ff=256, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 256
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_out)
        
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

class Classifier(nn.Module):

    def __init__(self,
                 n_embed=10000,
                 d_embed=300,
                 d_hidden=256,
                 d_out=2,
                 dp=0.2,
                 embed_weight=None,
                 eow_idx=2):
        super(Classifier, self).__init__()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embed = nn.Embedding(n_embed, d_embed)
        
        if embed_weight is not None:
            # embed_weight = inputs.vocab.vectors
            self.embed.weight.data.copy_(embed_weight)
            self.embed.weight.requires_grad = False
       
        # self.norm = Norm(d_embed)
        self.bilstm = torch.nn.LSTM(input_size=d_embed, hidden_size=d_hidden, num_layers=1, bidirectional=True, dropout=dp)
        self.ff = FeedForward(2*d_hidden, d_out, d_hidden)
        
        self.dropout =  nn.Dropout(dp)

    def forward(self, batch):
        tokens = batch.tokenized  
        # misp = batch.misp  
        label = batch.category

        w = self.embed(tokens)
        # m = self.embed(misp)
        # w = (w + m)/2
        o, (h, c) = self.bilstm(w)
        
        x = torch.cat((h[0,:,:], h[1,:,:]), dim=1)
        # x = self.norm(x)
        x = self.ff(self.dropout(x))
        
        return x


### Train Model

In [35]:
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import pandas as pd

def train_model(train_iter, validation_iter):
    n_embed = len(TOKEN.vocab)
    d_out = len(CATEGORY.vocab)

    model = Classifier(d_embed=args.d_embed, d_hidden=args.d_embed, d_out=d_out, dp=0.2, embed_weight=TOKEN.vocab.vectors, n_embed=n_embed)
    model.to(device)

    opt = optim.Adam(model.parameters(), lr=args.lr)

    acc, val_loss = evaluate(validation_iter, model)
    best_acc = acc

#     print('epoch |   %        |  loss  |  avg   |val loss|   acc   |  best  | time | save |')
#     print('val   |            |        |        | {:.4f} | {:.4f} | {:.4f} |      |      |'.format(val_loss, acc, best_acc))

    iterations = 0
    last_val_iter = 0
    train_loss = 0
    start = time.time()
    
    train_stat = []
    with tqdm(total=args.epochs*len(train_iter)) as pbar:
    
      for epoch in range(args.epochs):
          train_iter.init_epoch()
          n_correct, n_total, train_loss = 0, 0, 0
          last_val_iter = 0
  #         print(epoch, end=' ')
          for batch_idx, batch in enumerate(train_iter):
              # switch model to training mode, clear gradient accumulators
              model.train();
              opt.zero_grad()

              iterations += 1

              # forward pass
              answer = model(batch)
              loss = criterion(answer, batch.category)

              loss.backward();
              opt.step()

              train_loss += loss.item()
  #             print('\r {:4d} | {:4d}/{} | {:.4f} | {:.4f} |'.format(
  #                 epoch, args.batch_size * (batch_idx + 1), len(train), loss.item(),
  #                         train_loss / (iterations - last_val_iter)), end='')
              
              stat = {
                  "epoch": epoch,
                  "step": iterations,
                  "train_loss": loss.item(),
                  "avg_loss": train_loss / (iterations - last_val_iter)
              }

              if iterations > 0 and iterations % args.dev_every == 0:
                  acc, val_loss = evaluate(validation_iter, model)
                  _save_ckp = '*'
                  if acc > best_acc:
                      best_acc = acc
                      # torch.save(model.state_dict(), args.save_path)


  #                 print(
  #                     ' {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format(
  #                         val_loss, acc, best_acc, (time.time() - start) / 60,
  #                         _save_ckp))

                  train_loss = 0
                  last_val_iter = iterations
                  stat["val_loss"] = val_loss
                  stat["acc"] = acc
                  stat["best_acc"] = best_acc
                  stat["time"] = (time.time() - start)
          
              
              train_stat.append(stat)
              pbar.update(1)
    
    acc, test_loss, predict, labels = evaluate(test_iter, model, return_pred=True)
    print(acc, test_loss)

    output = []
    # _predict = predict.cpu().numpy()
    # _labels = labels.cpu().numpy()
    # for idx, t in enumerate(test):
    #     output.append({
    #         "text": t.text,
    #         "label": t.category,
    #         # "tokens": json.dumps(t.tokens, ensure_ascii=False),
    #         "predict": json.dumps(_predict[idx].tolist(), ensure_ascii=False),
    # #         "_label": _labels[idx]
    #     })

    output = pd.DataFrame(output)
    # output.to_csv(f"Mispelling/Outputs/{expname}_{tokenType}.csv", index=False)
    
    train_stat = pd.DataFrame(train_stat)
    # train_stat.to_csv(f"Mispelling/Outputs/{expname}_{tokenType}_train_stat.csv", index=False)
    return model, output, train_stat

In [36]:
args

Namespace(batch_size=64, d_embed=100, dev_every=50, dp_ratio=0.2, epochs=50, lr=0.001, save_path='results')

In [37]:
n_embed = len(TOKEN.vocab)
d_out = len(CATEGORY.vocab)
print(n_embed, d_out)

args.epochs = 50
args.d_embed = 300

44697 4


In [38]:
model, output, train_stat = train_model(train_iter, validation_iter)

  "num_layers={}".format(dropout, num_layers))


  0%|          | 0/16900 [00:00<?, ?it/s]

63.534256832646946 3.580515


In [39]:
test_iter, corr_iter, misp_iter = data.BucketIterator.splits(
    (datasets["test"], datasets["test-corr"], datasets["test-misp"]), 

    batch_size=args.batch_size, 

    # Sort all examples in data using `sort_key`.
    sort=True,
    sort_key=lambda x: len(x.tokenized),
    sort_within_batch=False,
    shuffle=True,
    
    device=device)

In [40]:
acc, test_loss, predict, labels = evaluate(test_iter, model, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  63.534256832646946


In [41]:
acc, test_loss, predict, labels = evaluate(corr_iter, model, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  58.63636363636363


In [65]:
for sents in zip(datasets["test-corr"], datasets["test-misp"]):
  print(sents[0].tokenized)
  print(sents[1].tokenized)
  break

['การ', 'ด่า', 'ไป', 'เหมือน', 'ได้', 'บรรเทา', 'ความ', 'เครียด', 'เฉย', 'ๆ', ' ', 'แต่', 'บีทีเอส', ' ', '(', 'รถ', 'ไฟฟ้า', ')', ' ', 'มัน', 'สำนึก', 'ไหม', ' ', 'ก็', 'ไม่', 'อ่ะ', ' ', '😕']
['การ', 'ด่า', 'ไป', 'เหมือน', 'ได้', 'บรรเทา', 'ความ', 'เครียด', 'เฉย', 'ๆ', ' ', 'แต่', 'บีทีเอส', ' ', '(', 'รถ', 'ไฟฟ้า', ')', ' ', 'มัน', 'สำนึก', 'มั้ย', ' ', 'ก็', 'ไม่', 'อ่ะ', ' ', '😕']


In [42]:
acc, test_loss, predict, labels = evaluate(misp_iter, model, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  57.84090909090909


# Misspelling Average Embedding [MAE]

In [43]:
class MAEClassifier(nn.Module):

    def __init__(self, refModel):
        super(MAEClassifier, self).__init__()

        self.ref = refModel

    def forward(self, batch):
        tokens = batch.tokenized  
        misp = batch.misp  
        label = batch.category

        w = self.ref.embed(tokens)
        m = self.ref.embed(misp)
        w = (w + m)/2
        o, (h, c) = self.ref.bilstm(w)
        
        x = torch.cat((h[0,:,:], h[1,:,:]), dim=1)
        x = self.ref.ff(self.ref.dropout(x))
        return x


In [44]:
maeModel = MAEClassifier(model)

In [45]:
test_iter, corr_iter, misp_iter = data.BucketIterator.splits(
    (MAEdatasets["test"], MAEdatasets["test-corr"], MAEdatasets["test-misp"]), 

    batch_size=args.batch_size, 

    # Sort all examples in data using `sort_key`.
    sort=True,
    sort_key=lambda x: len(x.tokenized),
    sort_within_batch=False,
    shuffle=True,
    
    device=device)

In [46]:
acc, test_loss, predict, labels = evaluate(test_iter, maeModel, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  64.05840509172594


In [47]:
acc, test_loss, predict, labels = evaluate(corr_iter, maeModel, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  58.63636363636363


In [48]:
acc, test_loss, predict, labels = evaluate(misp_iter, maeModel, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  59.31818181818182


# Misspelling Semantics Tokens

In [49]:
train_iter, validation_iter, test_iter = data.BucketIterator.splits(
    (MSTdatasets["train"], MSTdatasets["validation"], MSTdatasets["test"]), 

    batch_size=args.batch_size, 
    
    # Sort all examples in data using `sort_key`.
    sort=True,
    sort_key=lambda x: len(x.tokenized),
    sort_within_batch=False,
    shuffle=True,
    
    device=device)

In [50]:
model, output, train_stat = train_model(train_iter, validation_iter)

  "num_layers={}".format(dropout, num_layers))


  0%|          | 0/16900 [00:00<?, ?it/s]

62.748034444028455 3.6352968


In [51]:
test_iter, corr_iter, misp_iter = data.BucketIterator.splits(
    (MSTdatasets["test"], MSTdatasets["test-corr"], MSTdatasets["test-misp"]), 

    batch_size=args.batch_size, 

    # Sort all examples in data using `sort_key`.
    sort=True,
    sort_key=lambda x: len(x.tokenized),
    sort_within_batch=False,
    shuffle=True,
    
    device=device)

In [52]:
acc, test_loss, predict, labels = evaluate(test_iter, model, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  62.748034444028455


In [53]:
acc, test_loss, predict, labels = evaluate(corr_iter, model, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  57.27272727272727


In [54]:
acc, test_loss, predict, labels = evaluate(misp_iter, model, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  57.27272727272727


# Both 

In [55]:
maeModel = MAEClassifier(model)

In [56]:
test_iter, corr_iter, misp_iter = data.BucketIterator.splits(
    (BOTHdataset["test"], BOTHdataset["test-corr"], BOTHdataset["test-misp"]), 

    batch_size=args.batch_size, 

    # Sort all examples in data using `sort_key`.
    sort=True,
    sort_key=lambda x: len(x.tokenized),
    sort_within_batch=False,
    shuffle=True,
    
    device=device)

In [57]:
acc, test_loss, predict, labels = evaluate(test_iter, maeModel, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  63.23474354174466


In [58]:
acc, test_loss, predict, labels = evaluate(corr_iter, maeModel, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  57.95454545454545


In [59]:
acc, test_loss, predict, labels = evaluate(misp_iter, maeModel, return_pred=True)
print("Accuracy: ", acc)

Accuracy:  57.84090909090909
