<a href="https://colab.research.google.com/github/chan-98/nlp-scripts/blob/main/language_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup: install the AllenNLP stack used by the cells below.
# NOTE(review): the installs are unpinned, so results depend on whatever versions
# pip resolves today — consider pinning known-good versions.
!pip install overrides
!pip install allennlp
!pip install allennlp-models
# Fetch the example repository; a later cell imports
# `examples.sentiment.sst_classifier` from it.
# NOTE(review): re-running this cell repeats the clone and the `%cd`.
!git clone https://github.com/mhagiwara/realworldnlp.git
# Presumably needed so that the `examples` package resolves from the working directory.
%cd realworldnlp

Cloning into 'realworldnlp'...
remote: Enumerating objects: 668, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 668 (delta 126), reused 110 (delta 110), pack-reused 502[K
Receiving objects: 100% (668/668), 4.94 MiB | 16.74 MiB/s, done.
Resolving deltas: 100% (390/390), done.
/content/realworldnlp


In [None]:
import random
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from overrides import overrides

from examples.sentiment.sst_classifier import LstmClassifier

In [None]:
# Model hyperparameters.
EMBEDDING_DIM = 16  # size of each token embedding; also the LSTM input size
HIDDEN_DIM = 16     # hidden size of the LSTM encoder

# Seed the Python, NumPy and PyTorch random number generators before any model
# or data loader is created, so that a Restart & Run All is repeatable
# (to the extent the libraries draw their randomness from these generators).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
class TatoebaSentenceReader(DatasetReader):
    """Dataset reader for files holding one ``<language id> TAB <sentence>`` pair per line.

    Every sentence is tokenized with ``CharacterTokenizer`` and turned into an
    ``Instance`` with a ``'tokens'`` text field and, when a label is given, a
    ``'label'`` field holding the language id.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer]=None):
        """Create the reader.

        Args:
            token_indexers: indexers attached to every text field. Falls back to
                ``{'tokens': SingleIdTokenIndexer()}`` when omitted or empty.
        """
        super().__init__()
        self.tokenizer = CharacterTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def text_to_instance(self, tokens, label=None):
        """Wrap already-tokenized text (and an optional label) in an ``Instance``.

        Args:
            tokens: tokens of one sentence, as produced by ``self.tokenizer``.
            label: language id; the ``'label'`` field is omitted when this is ``None``.
        """
        fields = {'tokens': TextField(tokens, self.token_indexers)}

        # Test for absence explicitly instead of truthiness, so that only a
        # missing label (None) leaves the instance unlabeled.
        if label is not None:
            fields['label'] = LabelField(label)

        return Instance(fields)

    @overrides
    def _read(self, file_path: str):
        """Yield one instance per non-blank line of ``file_path``.

        ``file_path`` is first resolved through ``cached_path``.

        Raises:
            ValueError: if a non-blank line contains no tab separator.
        """
        file_path = cached_path(file_path)
        # The data is multilingual text: decode it as UTF-8 explicitly instead of
        # relying on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as text_file:
            for line in text_file:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue

                # Split on the first tab only, so a tab inside the sentence
                # does not break the two-way unpacking.
                lang_id, sent = line.split('\t', 1)

                tokens = self.tokenizer.tokenize(sent)

                yield self.text_to_instance(tokens, lang_id)

In [None]:
def classify(text: str, model: LstmClassifier):
    """Print the label that ``model`` predicts for ``text``.

    The text is tokenized with ``CharacterTokenizer``, wrapped in an instance
    with a single ``'tokens'`` field and passed to ``model.forward_on_instance``.
    The index of the largest ``'logits'`` entry is then looked up in the
    ``'labels'`` namespace of the model's vocabulary.

    Args:
        text: sentence to classify.
        model: classifier to run the sentence through.
    """
    characters = CharacterTokenizer().tokenize(text)
    text_field = TextField(characters, {'tokens': SingleIdTokenIndexer()})

    output = model.forward_on_instance(Instance({'tokens': text_field}))
    best_index = np.argmax(output['logits'])
    predicted = model.vocab.get_token_from_index(best_index, 'labels')

    print('text: {}, label: {}'.format(text, predicted))

In [None]:
# Locations of the training and development splits (tab-separated files).
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.top10langs.train.tsv'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/tatoeba/sentences.top10langs.dev.tsv'

# Reader with its default token indexers; it is used for both splits.
reader = TatoebaSentenceReader()

In [None]:
# A single batch sampler (batches of 32, sorted on the 'tokens' field) is
# handed to both the training and the development loader.
sampler = BucketBatchSampler(sorting_keys=["tokens"], batch_size=32)
train_data_loader, dev_data_loader = (
    MultiProcessDataLoader(reader, split_path, batch_sampler=sampler)
    for split_path in (train_path, dev_path)
)

loading instances: 100000it [00:11, 8525.00it/s]
loading instances: 10000it [00:00, 13912.28it/s]


In [None]:
# Build the vocabulary from the training instances only.
# min_count presumably drops 'tokens' entries seen fewer than 3 times — TODO confirm.
vocab = Vocabulary.from_instances(
    instances=train_data_loader.iter_instances(),
    min_count={'tokens': 3},
)

# Both loaders index their instances with this same vocabulary.
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)

building vocab: 100000it [00:03, 26051.99it/s]


In [None]:
# One embedding row per entry of the 'tokens' vocabulary namespace.
token_embedding = Embedding(
    embedding_dim=EMBEDDING_DIM,
    num_embeddings=vocab.get_vocab_size('tokens'),
)
# Register the embedding under the "tokens" key.
word_embeddings = BasicTextFieldEmbedder(token_embedders={"tokens": token_embedding})

In [None]:
# LSTM that consumes EMBEDDING_DIM-sized inputs and keeps a HIDDEN_DIM-sized
# state, wrapped in PytorchSeq2VecWrapper for use as the model's encoder.
encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(
        input_size=EMBEDDING_DIM,
        hidden_size=HIDDEN_DIM,
        batch_first=True,
    )
)

In [None]:
# Assemble the classifier from the embedder, the encoder and the vocabulary.
# NOTE(review): positive_label='eng' presumably selects the class that the
# precision/recall/F1 metrics are reported for — confirm against LstmClassifier.
model = LstmClassifier(word_embeddings, encoder, vocab, positive_label='eng')

In [None]:
# Adam with its default settings over all parameters of the model.
optimizer = optim.Adam(params=model.parameters())

In [None]:
# Train for up to 20 epochs, validating on the development loader.
# patience=10 presumably stops training early after 10 epochs without
# validation improvement; cuda_device=-1 presumably means "run on CPU" —
# TODO confirm both against the trainer documentation.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=20,
    patience=10,
    cuda_device=-1,
)

# The returned metrics dictionary is displayed as the cell's output.
trainer.train()

accuracy: 0.5994, precision: 0.6914, recall: 0.6018, f1: 0.6435, batch_loss: 0.7900, loss: 1.0935 ||: 100%|##########| 3125/3125 [01:38<00:00, 31.61it/s]
accuracy: 0.7846, precision: 0.8372, recall: 0.7510, f1: 0.7918, batch_loss: 0.5400, loss: 0.6200 ||: 100%|##########| 313/313 [00:01<00:00, 163.26it/s]
accuracy: 0.8139, precision: 0.8424, recall: 0.8514, f1: 0.8469, batch_loss: 0.7892, loss: 0.5251 ||: 100%|##########| 3125/3125 [01:27<00:00, 35.83it/s]
accuracy: 0.8487, precision: 0.8924, recall: 0.8460, f1: 0.8686, batch_loss: 0.2727, loss: 0.4271 ||: 100%|##########| 313/313 [00:02<00:00, 148.20it/s]
accuracy: 0.8601, precision: 0.8807, recall: 0.8791, f1: 0.8799, batch_loss: 0.2914, loss: 0.3953 ||: 100%|##########| 3125/3125 [01:37<00:00, 32.18it/s]
accuracy: 0.8734, precision: 0.8420, recall: 0.9110, f1: 0.8751, batch_loss: 0.3157, loss: 0.3562 ||: 100%|##########| 313/313 [00:02<00:00, 131.26it/s]
accuracy: 0.8834, precision: 0.8997, recall: 0.8929, f1: 0.8963, batch_loss: 0.

{'best_epoch': 16,
 'peak_worker_0_memory_MB': 1627.65625,
 'training_duration': '0:31:53.926124',
 'epoch': 19,
 'training_accuracy': 0.97179,
 'training_precision': 0.9806321859359741,
 'training_recall': 0.9771999716758728,
 'training_f1': 0.9789131283760071,
 'training_loss': 0.08398211163324304,
 'training_worker_0_memory_MB': 1627.65625,
 'validation_accuracy': 0.9695,
 'validation_precision': 0.9806910753250122,
 'validation_recall': 0.9649999737739563,
 'validation_f1': 0.9727821946144104,
 'validation_loss': 0.09664935189111087,
 'best_validation_accuracy': 0.9699,
 'best_validation_precision': 0.9712586998939514,
 'best_validation_recall': 0.9800000190734863,
 'best_validation_f1': 0.9756098389625549,
 'best_validation_loss': 0.09551395257213748}

In [None]:
# English sample sentence.
classify(text='Take your raincoat in case it rains.', model=model)

text: Take your raincoat in case it rains., label: eng


In [None]:
# Spanish sample sentence.
classify(text='Tu me recuerdas a mi padre.', model=model)

text: Tu me recuerdas a mi padre., label: spa


In [None]:
# German sample sentence.
classify(text='Wie organisierst du das Essen am Mittag?', model=model)

text: Wie organisierst du das Essen am Mittag?, label: deu


In [None]:
# French sample sentence.
classify(text="Il est des cas où cette règle ne s'applique pas.", model=model)

text: Il est des cas où cette règle ne s'applique pas., label: fra


In [None]:
# Portuguese sample sentence.
classify(text='Estou fazendo um passeio em um parque.', model=model)

text: Estou fazendo um passeio em um parque., label: por


In [None]:
# Esperanto sample sentence.
classify(text='Ve, postmorgaŭ jam estas la limdato.', model=model)

text: Ve, postmorgaŭ jam estas la limdato., label: epo


In [None]:
# Italian sample sentence.
classify(text='Credevo che sarebbe venuto.', model=model)

text: Credevo che sarebbe venuto., label: ita


In [None]:
# Hungarian sample sentence.
classify(text='Nem tudja, hogy én egy macska vagyok.', model=model)

text: Nem tudja, hogy én egy macska vagyok., label: hun


In [None]:
# Berber sample sentence.
classify(text='Nella ur nli qrib acemma deg tenwalt.', model=model)

text: Nella ur nli qrib acemma deg tenwalt., label: ber


In [None]:
# Turkish sample sentence.
classify(text='Kurşun kalemin yok, değil mi?', model=model)

text: Kurşun kalemin yok, değil mi?, label: tur
