In [63]:
import re
import json
import tqdm
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer for the "roberta-base" checkpoint; the same checkpoint name is used
# when the model is created further down, so the two must be kept in sync.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [3]:
# Load the labelled training set and keep a single row per distinct "text"
# value. Written as one expression (no in-place mutation) so the cell always
# rebuilds `df` from the file.
df = pd.read_csv("../data/sentiment/train.csv").drop_duplicates("text")

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 10% of the de-duplicated rows for validation; the fixed random_state
# makes the split reproducible across runs.
train, valid = train_test_split(df, test_size=0.1, random_state=42)

In [6]:
# Build one encoded training example per row, with a per-token label list:
#   0 = token outside the important span (also used for the first/last ids),
#   1 = token inside the span of a "positive" row,
#   2 = token inside the span of a row with any other sentiment value.
# NOTE: the validation cell repeats this logic for `valid`; keep them in sync.
tdata = []

for row in train.itertuples():
    encoded = tokenizer(row.text)

    # Check membership first so a missing span fails with a readable message
    # instead of the bare ValueError that str.index() would raise.
    assert row.important_span_text in row.text, (
        f"span {row.important_span_text!r} not found in text {row.text!r}"
    )
    span_start = row.text.index(row.important_span_text)
    span_end = span_start + len(row.important_span_text)
    span_label = 1 if row.sentiment == "positive" else 2

    labels = [0]   # label for the first id, which the token loop below skips
    pieces = []    # decoded text of every inner token, kept for the assert message
    covered = ""   # concatenation of the pieces that were labelled as in-span
    offset = 0     # number of characters of row.text consumed so far

    # Skip the first and last ids (presumably the tokenizer's special tokens).
    for token_id in encoded["input_ids"][1:-1]:
        piece = tokenizer.decode(token_id)
        pieces.append(piece)

        # `span_start - 1` admits a token that begins one character before the
        # span, i.e. one whose decoded text starts with the preceding space.
        if span_start - 1 <= offset < span_end:
            covered += piece
            labels.append(span_label)
        else:
            labels.append(0)

        offset += len(piece)

    labels.append(0)  # label for the last id

    # Drop the single leading space that the first in-span piece may carry.
    if covered.startswith(" "):
        covered = covered[1:]

    # Sanity checks: the decoded pieces must cover the text exactly, and the
    # labelled pieces must reproduce the annotated span exactly.
    assert offset == len(row.text), pieces
    assert covered == row.important_span_text

    encoded["labels"] = labels
    tdata.append(encoded)

2022-04-29 23:58:53.847890: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-29 23:58:53.847910: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [7]:
# Build one encoded validation example per row, with a per-token label list:
#   0 = token outside the important span (also used for the first/last ids),
#   1 = token inside the span of a "positive" row,
#   2 = token inside the span of a row with any other sentiment value.
# NOTE: the training cell contains the same logic for `train`; keep them in sync.
vdata = []

for row in valid.itertuples():
    encoded = tokenizer(row.text)

    # Check membership first so a missing span fails with a readable message
    # instead of the bare ValueError that str.index() would raise.
    assert row.important_span_text in row.text, (
        f"span {row.important_span_text!r} not found in text {row.text!r}"
    )
    span_start = row.text.index(row.important_span_text)
    span_end = span_start + len(row.important_span_text)
    span_label = 1 if row.sentiment == "positive" else 2

    labels = [0]   # label for the first id, which the token loop below skips
    pieces = []    # decoded text of every inner token, kept for the assert message
    covered = ""   # concatenation of the pieces that were labelled as in-span
    offset = 0     # number of characters of row.text consumed so far

    # Skip the first and last ids (presumably the tokenizer's special tokens).
    for token_id in encoded["input_ids"][1:-1]:
        piece = tokenizer.decode(token_id)
        pieces.append(piece)

        # `span_start - 1` admits a token that begins one character before the
        # span, i.e. one whose decoded text starts with the preceding space.
        if span_start - 1 <= offset < span_end:
            covered += piece
            labels.append(span_label)
        else:
            labels.append(0)

        offset += len(piece)

    labels.append(0)  # label for the last id

    # Drop the single leading space that the first in-span piece may carry.
    if covered.startswith(" "):
        covered = covered[1:]

    # Sanity checks: the decoded pieces must cover the text exactly, and the
    # labelled pieces must reproduce the annotated span exactly.
    assert offset == len(row.text), pieces
    assert covered == row.important_span_text

    encoded["labels"] = labels
    vdata.append(encoded)

In [8]:
# Quick sanity check: number of encoded training and validation examples.
print(len(tdata), len(vdata))

7660 852


In [9]:
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from datasets import Dataset

In [10]:
# Token-classification head with three classes, matching the label ids 0, 1 and
# 2 written into "labels" by the labelling cells above.
model = TFAutoModelForTokenClassification.from_pretrained("roberta-base", num_labels=3)

2022-04-29 23:58:57.368469: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-29 23:58:57.368488: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-29 23:58:57.368502: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (eka-thinkpad): /proc/driver/nvidia/version does not exist
2022-04-29 23:58:57.368656: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFRobertaForTokenClassification.

Some layers 

In [11]:
# NOTE(review): no visible cell reads `training_args` (training goes through
# model.compile / model.fit below) — confirm whether this is still needed.
training_args = TrainingArguments(output_dir="../models")

In [12]:
# Distinct sentiment values present in the training split.
# NOTE(review): not referenced by any visible cell, and unrelated to the numeric
# label ids (0/1/2) assigned in the labelling loops — confirm before relying on it.
label_names = train['sentiment'].unique()

In [13]:
# Wrap each list of encoded examples in a DataFrame and convert it to a
# datasets.Dataset (training list first, then validation).
ds_train, ds_val = (
    Dataset.from_pandas(pd.DataFrame(examples)) for examples in (tdata, vdata)
)

In [14]:
# Collator used as `collate_fn` when the datasets are converted to tf.data
# below; configured to return TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [15]:
# Columns handed to the model; shared by both splits.
tf_columns = ["attention_mask", "input_ids", "labels"]

# Training batches are shuffled.
tf_train = ds_train.to_tf_dataset(
    columns=tf_columns,
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

# Validation is NOT shuffled: order is irrelevant for evaluation, and per the
# `datasets` documentation `drop_remainder` defaults to the value of `shuffle`,
# so shuffling here would also discard the final incomplete batch of
# validation examples.
tf_valid = ds_val.to_tf_dataset(
    columns=tf_columns,
    shuffle=False,
    batch_size=32,
    collate_fn=data_collator,
)

  return array(a, dtype, copy=False, order=order)


In [16]:
batch_size = 32
num_train_epochs = 5

# Total optimizer steps = full batches per epoch * epochs. Floor division counts
# only complete batches — this matches the training dataset only if its final
# incomplete batch is dropped (TODO confirm against how tf_train is built).
steps_per_epoch = len(tdata) // batch_size
num_train_steps = steps_per_epoch * num_train_epochs

# Optimizer plus learning-rate schedule: starts at 2e-5 with no warm-up steps,
# scheduled over `num_train_steps`, with a weight-decay rate of 0.01.
optimizer, lr_schedule = create_optimizer(
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    init_lr=2e-5,
    weight_decay_rate=0.01,
)

In [17]:
# No `loss` argument is passed; as the logged message for this cell states, the
# model's internal loss computation is then used.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [18]:
# Train for `num_train_epochs` epochs — the same value the learning-rate
# schedule length was computed from, so the two cannot drift apart.
# The History object is kept in `history` instead of being echoed as a bare repr.
history = model.fit(x=tf_train, validation_data=tf_valid, epochs=num_train_epochs)

2022-04-29 23:58:59.064111: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2030303a90>

In [19]:
from transformers import pipeline

In [20]:
# Inference pipeline around the fine-tuned in-memory model. The prediction loop
# below reads each returned item's "entity" (e.g. "LABEL_0") and "word" fields.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [21]:
# Read the test set. The encoding is given explicitly: JSON text is UTF-8, and
# relying on the platform default could mis-decode the non-ASCII characters
# (curly quotes, emoji) that the prediction loop later normalises.
with open("../data/sentiment/testing.json", encoding="utf-8") as infile:
    data = json.load(infile)

In [71]:
# Character class of code points to mask. Compiled once rather than on every
# call. There is deliberately no `+` quantifier: each matched code point is
# replaced individually so the string length never changes.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing .. miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very wide: everything from U+24C2 up to U+1F251,
                             # which also covers CJK and most of the upper BMP
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]"
)


def remove_emojis(data):
    """Mask emoji and other symbol code points in ``data`` with ``"x"``.

    Every matching code point is replaced by exactly one ``"x"``, so the
    result has the same length as the input and character offsets computed
    on the result are also valid for the input.

    Note that the character class is broad: besides emoji it matches every
    code point from U+24C2 upward that falls in the listed ranges, including
    CJK text.

    Args:
        data: The input string.

    Returns:
        A string of the same length with matched code points replaced by "x".
    """
    return _EMOJI_PATTERN.sub("x", data)


In [90]:
results = []

# Typographic punctuation -> ASCII look-alikes. Every entry maps one character
# to one character, so this step never changes the text length.
ASCII_PUNCTUATION = str.maketrans({
    "’": "'",
    "‘": "'",
    "“": "\"",
    "”": "\"",
    "–": "-",
    "—": "-",
    "…": ".",
})

# Strips leading colons/commas/spaces and trailing dots/spaces from a candidate.
# Written as a raw string so the pattern contains no invalid string escapes.
EDGE_NOISE = re.compile(r"^[:, ]+|[. ]+$")

for row in tqdm.tqdm(data, total=len(data)):
    text = remove_emojis(row["text"].translate(ASCII_PUNCTUATION))

    # Concatenate consecutive tokens whose entity is not "LABEL_0" into
    # candidate spans; a "LABEL_0" token closes the current candidate.
    candidates = []
    current = ""
    for word in nlp(text):
        if word["entity"] != "LABEL_0":
            # "Ġ" is replaced by a space to rebuild readable text.
            current = current + word["word"].replace("Ġ", " ")
        else:
            candidates.append(current)
            current = ""
    candidates.append(current)

    # Keep the longest candidate (the first one on ties) and trim its edges.
    longest = max(candidates, key=len)
    cleaned = EDGE_NOISE.sub("", longest)

    # (0, 0) is emitted when the cleaned candidate is shorter than two
    # characters or cannot be located verbatim in the normalised text.
    # NOTE(review): offsets index into the normalised `text`; they equal
    # offsets into row["text"] only if remove_emojis keeps the length
    # unchanged — confirm.
    start = 0
    end = 0
    if len(cleaned) > 1:
        position = text.find(cleaned)
        if position != -1:
            start = position
            end = start + len(cleaned)
    results.append((start, end))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [22:21<00:00,  7.46it/s]


In [91]:
# One row per test example, in the order the examples were processed; each row
# holds the predicted (start, end) pair collected in `results`.
submission = pd.DataFrame(data=results, columns=["start", "end"])

In [95]:
# Write the predictions with the positional row index as the leading "id" column.
submission.to_csv("../predictions/t2_roberta_tokenizer.csv", index_label="id")