In [1]:
### inspired by https://github.com/explosion/spacy-pytorch-transformers/blob/master/examples/train_textcat.py

####  load csv and convert to json format

In [2]:
import os
import pandas as pd
os.getcwd()

def load_directory_data(directory, filename="economic_sentiment_data.csv"):
    """Load the sentence/polarity columns of a sentiment CSV stored in `directory`.

    Parameters
    ----------
    directory : str
        Folder that contains the CSV file.
    filename : str, optional
        Name of the CSV file inside `directory`. Defaults to
        "economic_sentiment_data.csv", so existing one-argument calls keep
        reading the same file as before.

    Returns
    -------
    pandas.DataFrame
        A frame holding only the 'sentence' and 'polarity' columns, in that order.

    Raises
    ------
    KeyError
        If the CSV lacks a 'sentence' or 'polarity' column.
    """
    data = pd.read_csv(os.path.join(directory, filename))

    # Drop every column except the two this function returns.
    data = data[['sentence', 'polarity']]

    # Echo the shape so the cell output shows how many rows were loaded.
    print(data.shape)

    return data

# Folder that holds the CSV input; the JSON exports below are written here too.
data_folder = '../data/'
full_data_df = load_directory_data(data_folder)

# Positional split: the first 3000 rows form the training set, the rest the test set.
train_df, test_df = full_data_df.iloc[:3000], full_data_df.iloc[3000:]

# Export both splits as index-oriented JSON files next to the CSV.
for split_df, json_name in ((train_df, 'train.json'), (test_df, 'test.json')):
    split_df.to_json(os.path.join(data_folder, json_name), orient='index')

(3750, 2)


#### load json as dictionary, convert to list of tuples as required by spaCy

In [3]:
import json
# Serialise each split to an index-oriented JSON string (returned in memory, no file written).
train_json_str, test_json_str = (
    split.to_json(orient='index') for split in (train_df, test_df)
)

In [4]:
# Parse both JSON strings back into Python dictionaries.
train_json, test_json = map(json.loads, (train_json_str, test_json_str))

In [5]:
# One (text, annotations) tuple per training record; NEGATIVE is 1 minus the polarity value.
TRAIN_DATA = [
    (
        record['sentence'],
        {'cats': {'POSITIVE': record['polarity'], 'NEGATIVE': 1 - record['polarity']}},
    )
    for record in train_json.values()
]

In [6]:
# One (text, annotations) tuple per test record; NEGATIVE is 1 minus the polarity value.
TEST_DATA = [
    (
        record['sentence'],
        {'cats': {'POSITIVE': record['polarity'], 'NEGATIVE': 1 - record['polarity']}},
    )
    for record in test_json.values()
]

In [7]:
# import rhinoscriptsyntax as rs
# import json

# #prompt the user for a file to import
# filter = "JSON file (*.json)|*.json|All Files (*.*)|*.*||"
# filename = rs.OpenFileName("Open JSON File", filter)

# with open(os.path.join(data_folder, 'train.json'),'r') as file:
#     train_json = json.load(file)

In [8]:
import spacy
from spacy.util import minibatch
import random
# Name of the pretrained pipeline package to load.
BERT_MODEL_NAME = "en_pytt_bertbaseuncased_lg"

# Request the GPU before loading the model; the return value is not inspected here.
spacy.prefer_gpu()

nlp = spacy.load(BERT_MODEL_NAME)

In [9]:
print(nlp.pipe_names)  # on a freshly loaded model: ["sentencizer", "pytt_wordpiecer", "pytt_tok2vec"]

# Only create and register the classifier when it is not in the pipeline yet, so
# re-running this cell does not attempt to add a second "pytt_textcat" component.
if "pytt_textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("pytt_textcat")
else:
    textcat = nlp.create_pipe("pytt_textcat", config={"exclusive_classes": True})
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.add_pipe(textcat)

print(nlp.pipe_names)

['sentencizer', 'pytt_wordpiecer', 'pytt_tok2vec']
['sentencizer', 'pytt_wordpiecer', 'pytt_tok2vec', 'pytt_textcat']


In [106]:
## training is very slow, 25 minutes per epoch

In [114]:
# Create the optimizer only when it does not exist yet: a fresh kernel gets one
# (previously this cell failed with NameError because the line was commented out),
# while re-running the cell after an interruption keeps using the existing optimizer.
if "optimizer" not in globals():
    optimizer = nlp.resume_training()

for i in range(3):
    # Reshuffle in place so each epoch sees the examples in a different order.
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=32):
        # Split the batch of (text, annotations) tuples into two parallel tuples.
        texts, cats = zip(*batch)
        nlp.update(texts, cats, sgd=optimizer, losses=losses, drop=0.1)
    print(i, losses)

0 {'pytt_textcat': 0.011014029983925866}
1 {'pytt_textcat': 0.008851727914588992}


KeyboardInterrupt: 

##### 0 {'pytt_textcat': 0.0007305766448553186}
##### 1 {'pytt_textcat': 0.0005921392457821639}
##### 0 {'pytt_textcat': 0.0004447193523446913}
##### 1 {'pytt_textcat': 0.0003327218119011377}
##### 2 {'pytt_textcat': 0.000254198645507131}
##### 0 {'pytt_textcat': 0.011014029983925866}
##### 1 {'pytt_textcat': 0.008851727914588992}

#### evaluate on test set

In [115]:
import tqdm

In [116]:
def evaluate(nlp, texts, cats):
    """Score the pipeline's category predictions against gold annotations.

    Each text is run through `nlp.pipe`; for every predicted label that also
    appears in the gold 'cats' dict (the "NEGATIVE" label is skipped), the
    predicted score and the gold value are both thresholded at 0.5 and the
    outcome is tallied as a true/false positive/negative.

    Parameters
    ----------
    nlp : object exposing `pipe(texts, batch_size=...)` that yields docs with
        `.cats` (label -> score) and `.text`.
    texts : sequence of str
    cats : sequence of dict
        `cats[i]['cats']` maps labels to gold values for `texts[i]`.

    Returns
    -------
    dict
        Keys "textcat_p", "textcat_r" and "textcat_f" (precision, recall, F-score).
    """
    tally = {"tp": 0.0, "fp": 0.0, "fn": 0.0, "tn": 0.0}

    # The progress bar is measured in whitespace-separated words, not documents.
    word_total = sum(len(entry.split()) for entry in texts)

    with tqdm.tqdm(total=word_total, leave=False) as progress:
        for position, doc in enumerate(nlp.pipe(texts, batch_size=128)):
            reference = cats[position]['cats']
            for label, score in doc.cats.items():
                if label == "NEGATIVE" or label not in reference:
                    continue
                predicted_pos = score >= 0.5
                predicted_neg = score < 0.5
                gold_pos = reference[label] >= 0.5
                gold_neg = reference[label] < 0.5
                if predicted_pos and gold_pos:
                    tally["tp"] += 1.0
                elif predicted_pos and gold_neg:
                    tally["fp"] += 1.0
                elif predicted_neg and gold_neg:
                    tally["tn"] += 1
                elif predicted_neg and gold_pos:
                    tally["fn"] += 1
            progress.update(len(doc.text.split()))

    # The small epsilon keeps the divisions defined when a denominator count is zero.
    precision = tally["tp"] / (tally["tp"] + tally["fp"] + 1e-8)
    recall = tally["tp"] / (tally["tp"] + tally["fn"] + 1e-8)

    denominator = precision + recall
    f_score = 0.0 if denominator == 0 else 2 * (precision * recall) / denominator

    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [117]:
# Transpose the (text, annotations) pairs into two parallel tuples.
eval_texts, eval_cats = map(tuple, zip(*TEST_DATA))

In [118]:
# Score the whole test set; both arguments are tuples, so they are passed as-is.
evaluate(nlp, eval_texts, eval_cats)

                                                       

{'textcat_f': 0.6401273885146456,
 'textcat_p': 0.5661971830826423,
 'textcat_r': 0.7362637362367669}

In [121]:
###{'textcat_f': 0.6401273885146456,
### 'textcat_p': 0.5661971830826423,
### 'textcat_r': 0.7362637362367669}

#### some tests using articles from today's FT

In [119]:
# Run each sample sentence through the pipeline and print it with its predicted categories.
for test_text in (
    "Indian shares endured their worst July in 17 years, a sign that stewing trouble in the country’s economy has been catching up with the stock market after a stellar run.",
    "KKR has won the race to buy German payments group Heidelpay for more than €600m in a fresh sign of investor appetite for companies that offer digital alternatives to cash.",
):
    doc = nlp(test_text)
    print(test_text, doc.cats)

Indian shares endured their worst July in 17 years, a sign that stewing trouble in the country’s economy has been catching up with the stock market after a stellar run. {'POSITIVE': 0.0, 'NEGATIVE': 1.0}
KKR has won the race to buy German payments group Heidelpay for more than €600m in a fresh sign of investor appetite for companies that offer digital alternatives to cash. {'POSITIVE': 1.0, 'NEGATIVE': 0.0}


In [120]:
# Directory the pipeline is serialised to.
MODEL_OUTPUT_DIR = "../model/bert-textcat"
nlp.to_disk(MODEL_OUTPUT_DIR)