In [None]:
#http://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/
#%load_ext autoreload
#%autoreload 2
from pathlib import Path
from typing import *
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

from functools import partial
from overrides import overrides
from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util
from allennlp.data.fields import TextField, MetadataField, ArrayField
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers import SingleIdTokenIndexer


from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader

In [None]:
class Config(dict):
    """A ``dict`` whose entries can also be read as instance attributes.

    Every keyword given to the constructor, and every pair stored through
    :meth:`set`, is written both as a dictionary item and as an attribute,
    so ``cfg["lr"]`` and ``cfg.lr`` return the same value. Plain item
    assignment (``cfg["lr"] = ...``) does not touch the attribute.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each keyword onto the instance so attribute access works.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as a dict item, then as an attribute."""
        self.__setitem__(key, val)
        setattr(self, key, val)
        
# Hyperparameters and run switches shared by the rest of the notebook.
config = Config(
    testing=True,  # when True the dataset reader keeps only the first 1000 rows of each CSV
    seed=1,
    batch_size=64,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)
from allennlp.common.checks import ConfigurationError
import random

# True when torch can see a CUDA device.
USE_GPU = torch.cuda.is_available()
# Directory holding the Jigsaw train/test CSV files.
DATA_ROOT = Path("../data") / "jigsaw"

# Seed every global RNG, not just torch: code that draws from Python's `random`
# or from NumPy would otherwise differ from run to run despite `config.seed`.
# torch.manual_seed stays last so the cell still displays its return value.
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)

In [None]:
# Names of the target columns. The dataset reader pulls these from every CSV row
# to build the label array, and the model's default output size is len(label_cols).
label_cols = ["toxic", "severe_toxic", "obscene",
              "threat", "insult", "identity_hate"]


class JigsawDatasetReader(DatasetReader):
    """Reads a Jigsaw toxic-comment CSV into AllenNLP ``Instance`` objects.

    Every row must provide ``id`` and ``comment_text`` columns. The columns in
    ``label_cols`` are optional: a file that has none of them (for example an
    unlabeled test split) is still readable and receives an all-zero
    placeholder label for each row.

    Parameters
    ----------
    tokenizer:
        Callable turning a raw comment string into a list of token strings.
        Defaults to whitespace splitting.
    token_indexers:
        Indexers attached to every ``TextField``. Defaults to a single
        ``SingleIdTokenIndexer`` registered under ``"tokens"``.
    max_seq_len:
        Maximum number of tokens kept per comment; ``None`` disables
        truncation.
    """

    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], id: str = None,
                         labels: np.ndarray = None) -> Instance:
        """Build one ``Instance`` with ``tokens``, ``id`` and ``label`` fields.

        ``labels`` may be omitted for unlabeled data; a zero vector of length
        ``len(label_cols)`` is used instead so every instance carries the same
        set of fields.
        """
        if labels is None:
            labels = np.zeros(len(label_cols))

        fields = {
            "tokens": TextField(tokens, self.token_indexers),
            "id": MetadataField(id),
            "label": ArrayField(array=labels),
        }
        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per CSV row (first 1000 rows if ``config.testing``)."""
        df = pd.read_csv(file_path)
        if config.testing:
            df = df.head(1000)

        # A file with none of the label columns is treated as unlabeled. If only
        # some are present, the selection below raises KeyError as before.
        has_labels = any(col in df.columns for col in label_cols)
        # Taking the label block once yields a numeric array; per-row access on a
        # mixed-type row would give an object-dtype array instead.
        label_matrix = df[label_cols].values if has_labels else None

        for pos, (comment_id, text) in enumerate(zip(df["id"], df["comment_text"])):
            tokens = [Token(x) for x in self.tokenizer(text)]
            # Honor the configured cap even when the tokenizer does not truncate.
            if self.max_seq_len is not None:
                tokens = tokens[:self.max_seq_len]
            labels = label_matrix[pos] if has_labels else None
            yield self.text_to_instance(tokens, comment_id, labels)


In [None]:
# Load the raw train/test CSVs into DataFrames for inspection.
# Paths are built from DATA_ROOT so the data location is configured in one place.
train = pd.read_csv(DATA_ROOT / "train.csv")
test = pd.read_csv(DATA_ROOT / "test.csv")


In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
import spacy
# Load the spaCy "en_core_web_sm" pipeline.
# NOTE(review): `nlp` is not referenced again in the visible cells — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")


In [None]:
import allennlp.data.tokenizers
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

#!python -m spacy download en_core_web_sm
#!python -m spacy download en


# Sanity check: split a sample sentence with SimpleWordSplitter and show the
# type and contents of what split_words returns.
a = SimpleWordSplitter().split_words("this is a . test sentence")
print(type(a))
print(a)

In [None]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
# Same sanity check with SpacyWordSplitter, constructed with its default arguments.
# The `tokenizer` function defined later in this notebook passes
# language='en_core_web_sm', pos_tags=False explicitly.
a = SpacyWordSplitter().split_words("this is a . test sentence")
print(type(a))
print(a)

In [None]:
# The token indexer is responsible for mapping tokens to integers.
# This single instance is passed to the dataset reader under the "tokens" key.
token_indexer = SingleIdTokenIndexer()

def tokenizer(x: str):
    """Split ``x`` with spaCy and return at most ``config.max_seq_len`` token strings."""
    splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)
    # Truncate first, then pull the raw text out of each remaining token.
    kept = splitter.split_words(x)[:config.max_seq_len]
    return [tok.text for tok in kept]
# Build the reader around the spaCy tokenizer and the shared token indexer.
reader = JigsawDatasetReader(tokenizer=tokenizer,
                             token_indexers={"tokens": token_indexer})

# Read both splits with the same reader. Both reads complete before either
# name is bound. No validation split is prepared here.
train_ds, test_ds = map(reader.read, (DATA_ROOT / "train.csv", DATA_ROOT / "test.csv"))
val_ds = None



In [3]:
import pandas as pd
#import spacy
#nlp = spacy.load("en_core_web_sm")
# Load the raw CSVs and print a preview of each.
train = pd.read_csv("../data/jigsaw/train.csv")
test = pd.read_csv("../data/jigsaw/test.csv")
print(train.head())
print(test.head())

# Comment text plus the six target columns.
small_cols = ['comment_text','toxic','severe_toxic','obscene','threat','insult','identity_hate']
small_train = train[small_cols]
# The test preview shows only `id` and `comment_text`, so selecting the label
# columns from it unconditionally raises KeyError. Keep whichever of the
# requested columns the frame actually provides.
small_test = test[[col for col in small_cols if col in test.columns]]


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
                 id                                       comment_text
0  00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1  0000247867823ef7  ==

In [None]:
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder

class BaselineModel(Model):
    """Multi-label classifier: embed tokens, encode to one vector, project to logits.

    Parameters
    ----------
    word_embeddings:
        Embedder applied to the ``tokens`` field.
    encoder:
        Seq2Vec encoder applied to the embeddings and mask; its
        ``get_output_dim()`` sizes the input of the linear projection.
    out_sz:
        Number of output logits. Defaults to ``len(label_cols)``.
    vocab:
        Vocabulary handed to ``Model.__init__``. When omitted, the
        notebook-level ``vocab`` variable is used, which is what this class
        previously relied on implicitly.

    Raises
    ------
    NameError
        If ``vocab`` is not passed and no notebook-level ``vocab`` exists.
    """

    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols),
                 vocab: Optional[Vocabulary]=None):
        if vocab is None:
            # Backward-compatible fallback for callers that depend on the global.
            try:
                vocab = globals()["vocab"]
            except KeyError:
                raise NameError(
                    "name 'vocab' is not defined; pass vocab=... to BaselineModel "
                    "or define a notebook-level `vocab` first"
                ) from None
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, tokens: Dict[str, torch.Tensor],
                id: Any = None, label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute class logits and, when ``label`` is given, the BCE-with-logits loss.

        Returns a dict with ``"class_logits"`` and, only if ``label`` was
        supplied, ``"loss"``. ``id`` is accepted but not used.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)

        output = {"class_logits": class_logits}
        # Labels are absent at prediction time; skip the loss instead of failing.
        if label is not None:
            output["loss"] = self.loss(class_logits, label)

        return output