In [1]:
import re
import json
import tqdm
import numpy as np
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

In [162]:
df = pd.read_csv("../data/sentiment/train.csv")
df.drop_duplicates("text", inplace=True)

In [139]:
from sklearn.model_selection import train_test_split

In [163]:
train, valid = train_test_split(df, test_size=0.1, random_state=42)

In [175]:
tdata = []

for row in train.itertuples():
    text = tokenizer.decode(tokenizer(row.text)["input_ids"][1:-1])
    encoded = tokenizer(text)
    tokens = encoded["input_ids"]

    l = 0
    res = ""
    part = tokenizer.decode(tokenizer(row.important_span_text)["input_ids"][1:-1])
    if part not in text:
        print(part, text)
        continue
    
    start = text.index(part)
    end = start + len(part)
    decoded = []
    
    y = [0]
    p = 1 if row.sentiment == "positive" else 2
    
    for i, tok in enumerate(tokens[1:-1]):
        dec = tokenizer.decode(tok, clean_up_tokenization_spaces=False)
        if dec.startswith("##"):
            dec = dec.replace("##", "")
        elif i != 0:
            dec = " " + dec
        
        decoded.append(dec)

        if start - 1 <= l < end:
            res = res + dec
            y.append(p)
        else:
            y.append(0)
        
        l += len(dec)

    res = res[1:] if res.startswith(" ") else res
    y.append(0)
    #assert l == len(text), decoded
    #assert res == part
    
    encoded["labels"] = y
    tdata.append(encoded)

do not have any standardized meaning under ifrs adjusted gross margin, adjusted gross margin % and adjusted ebitda are non - ifrs financial measures not defined by and don't have any standardized meaning under ifrs.
do not regain significant control the bulls don't regain significant control without reclaiming its 200 - day moving average ( which would put it above the rest of its daily moving averages ) until that happens, it remains a two - way market.
do not i don't use " fibs " in the " traditional " manner ( retracements ).
' s " impressive mental gymnastics " jackson palmer, co - creator of dogecoin, says it's " impressive mental gymnastics " to associate " freedom " with elon musk's bid to buy twitter.
' extraordinarily elevated'inflation numbers the white house is bracing for'extraordinarily elevated'inflation numbers to be reflected in tuesday's data from the labor department, which will be released on tuesday.
do not have enough throughput he suggested that current blockchain

In [173]:
part

'has gifted the luna foundation guard ( lfg ) 10 million luna'

In [171]:
print(text)

adjusted gross margin, adjusted gross margin % and adjusted ebitda are non - ifrs financial measures not defined by and don't have any standardized meaning under ifrs.


In [168]:
print(l, len(text))

0 167


In [166]:
res

''

In [169]:
row

Pandas(Index=3445, text='Adjusted gross margin, adjusted gross margin % and adjusted EBITDA are non-IFRS financial measures not defined by and do not have any standardized meaning under IFRS.', start_char_pos=118, end_char_pos=163, important_span_text='do not have any standardized meaning under IFRS', sentiment='negative')

In [176]:
vdata = []

for row in valid.itertuples():
    text = tokenizer.decode(tokenizer(row.text)["input_ids"][1:-1])
    encoded = tokenizer(text)
    tokens = encoded["input_ids"]

    l = 0
    res = ""
    part = tokenizer.decode(tokenizer(row.important_span_text)["input_ids"][1:-1])
    if part not in text:
        print(part, text)
        continue
    
    start = text.index(part)
    end = start + len(part)
    decoded = []
    
    y = [0]
    p = 1 if row.sentiment == "positive" else 2
    
    for i, tok in enumerate(tokens[1:-1]):
        dec = tokenizer.decode(tok, clean_up_tokenization_spaces=False)
        if dec.startswith("##"):
            dec = dec.replace("##", "")
        elif i != 0:
            dec = " " + dec
        
        decoded.append(dec)

        if start - 1 <= l < end:
            res = res + dec
            y.append(p)
        else:
            y.append(0)
        
        l += len(dec)

    res = res[1:] if res.startswith(" ") else res
    y.append(0)
    #assert l == len(text), decoded
    #assert res == part
    
    encoded["labels"] = y
    vdata.append(encoded)

vulnerable ' annual report :'vulnerable'and the impact of the political and security situation in israel on our business.
fda approved biocryst pharmaceuticals ' in december 2020, the fda approved biocryst pharmaceuticals'orladeyo ( berotralstat ) for hae in adults and patients 12 years and older.


In [177]:
print(len(tdata), len(vdata))

7645 850


In [178]:
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import create_optimizer
from transformers import TFAutoModelForTokenClassification
import tensorflow as tf
from datasets import Dataset

In [189]:
model = TFAutoModelForTokenClassification.from_pretrained("ProsusAI/finbert", from_pt=True, num_labels=3)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForTokenClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [190]:
training_args = TrainingArguments(output_dir="../models")

In [191]:
label_names = train['sentiment'].unique()

In [192]:
ds_train = Dataset.from_pandas(pd.DataFrame(tdata))
ds_val = Dataset.from_pandas(pd.DataFrame(vdata))

In [193]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [194]:
tf_train = ds_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

tf_valid = ds_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

In [195]:
batch_size = 32
num_train_epochs = 1
num_train_steps = (len(tdata) // batch_size) * num_train_epochs

optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [196]:
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [197]:
model.fit(x=tf_train, validation_data=tf_valid, epochs=1)



<keras.callbacks.History at 0x7fcf07137c40>

In [198]:
from transformers import pipeline

In [199]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [200]:
with open("../data/sentiment/testing.json") as infile:
    data = json.load(infile)

In [201]:
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, "x", data)


In [202]:
results = []

for row in tqdm.tqdm(data, total=len(data)):
    all_options = []
    res = ""
    text = row["text"]
    text = text.replace("’", "'").replace("“", "\"").replace("”", "\"").replace("–","-").replace("…", ".").replace("‘", "'").replace("—", "-")
    text = remove_emojis(text)

    for word in nlp(text):
        if word["entity"] != "LABEL_0":
            res = res + word["word"].replace("Ġ", " ")
        else:
            all_options.append(res)
            res = ""

    all_options.append(res)
    sel = max(all_options, key=lambda x: len(x))
    cleaned = re.sub("(^[\:, ]+)|([\. ]+$)", "", sel)
    start = 0
    end = 0
    if len(cleaned) > 1:
        if cleaned in text:
            start = text.index(cleaned)
            end = start + len(cleaned)
        #else:
        #    end = cleaned
    results.append((start, end))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [25:22<00:00,  6.57it/s]


In [203]:
submission = pd.DataFrame(data=results, columns=["start", "end"])

In [204]:
submission.reset_index().rename(columns={"index": "id"}).to_csv("../predictions/t2_roberta_tokenizer.csv",index=False)