In [12]:
import pandas as pd

In [13]:
# The sentences file is ';'-separated and has no header row, so the three
# column names are supplied here. error_bad_lines=False asks pandas to skip
# rows it cannot parse instead of raising.
df = pd.read_csv(
    '../data/sentences.csv',
    header=None,
    delimiter=';',
    error_bad_lines=False,
    names=['website', 'text', 'keywords']
)

In [14]:
from nltk.corpus import stopwords
import re
from ast import literal_eval
# English stop-word list taken from the NLTK stopwords corpus; it is the
# lookup table consulted by remove_stop_words().
stop = stopwords.words('english')

def normalize_text(text):
    """Lower-case ``text``, drop ``<br />`` tags and collapse whitespace.

    Parameters
    ----------
    text : str
        Raw sentence text.

    Returns
    -------
    str
        The lower-cased text with every literal ``<br />`` removed, each run
        of whitespace collapsed to a single space and leading/trailing
        whitespace stripped. Punctuation is left untouched.
    """
    norm_text = text.lower().replace('<br />', '')
    # An earlier version also substituted every punctuation mark with its own
    # captured group, i.e. with itself. That call changed nothing, so it was
    # removed; the output of this function is identical.
    return ' '.join(norm_text.split())

def remove_stop_words(text, stop_words=None):
    """Lower-case ``text`` and drop every token that is a stop word.

    The comparison is done on the lower-cased token, so capitalised stop
    words such as "The" are removed as well (previously only tokens that
    were already lower-case were filtered out).

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    stop_words : iterable of str, optional
        Words to remove, compared case-insensitively. Defaults to the
        module-level ``stop`` list.

    Returns
    -------
    str
        The remaining tokens, lower-cased and joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # A set gives constant-time membership tests for each token.
    excluded = set(word.lower() for word in stop_words)
    lowered = (token.lower() for token in text.split())
    return " ".join(token for token in lowered if token not in excluded)

def remove_non_ascii(text):
    """Return ``text`` restricted to characters with code points 32..126.

    Besides non-ASCII characters this also drops ASCII control characters
    (tab, newline, DEL, ...); nothing is inserted in their place.
    """
    kept = []
    for ch in text:
        code = ord(ch)
        if 32 <= code <= 126:
            kept.append(ch)
    return ''.join(kept)

# Clean the sentence text: keep printable ASCII only, then lower-case,
# drop '<br />' tags and collapse whitespace.
df['text'] = df['text'].apply(remove_non_ascii)
df['text'] = df['text'].apply(normalize_text)
# Stop-word removal is currently switched off:
#df['text'] = df['text'].apply(remove_stop_words)
# Strip everything that is neither a word character nor whitespace. The
# pattern is a raw string (no invalid '\w' escape) and regex=True is passed
# explicitly so the call does not depend on the pandas default, which is a
# literal (non-regex) replacement in newer releases.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
# The keywords column holds the textual form of a Python list; parse it.
df['keywords'] = df['keywords'].apply(literal_eval)

In [15]:
import re


def _find_entity_spans(text, keywords, label="THREAT_GROUP"):
    """Locate ``keywords`` in ``text`` as non-overlapping entity spans.

    Parameters
    ----------
    text : str
        Sentence text that already went through the cleaning steps applied
        to ``df['text']``.
    keywords : iterable of str
        Raw keywords for this sentence.
    label : str
        Entity label attached to every span.

    Returns
    -------
    list of (int, int, str)
        ``(start, end, label)`` character offsets into ``text``, sorted by
        position, with no two spans overlapping.
    """
    candidates = set()  # a set drops spans produced by duplicate keywords
    for keyword in keywords:
        # Clean the keyword exactly like the text was cleaned; otherwise a
        # keyword containing punctuation can never match, because the
        # punctuation is no longer present in the text.
        keyword = normalize_text(remove_non_ascii(keyword))
        keyword = re.sub(r'[^\w\s]', '', keyword).strip()
        if not keyword:
            # An empty pattern would match at every position.
            continue
        # re.escape: keywords are literal strings, not regular expressions.
        for match in re.finditer(re.escape(keyword), text):
            candidates.add((match.start(), match.end()))

    # Resolve overlaps by keeping the longest span first, then the earliest.
    spans = []
    for start, end in sorted(candidates, key=lambda s: (s[0] - s[1], s[0])):
        if all(end <= kept_start or start >= kept_end
               for kept_start, kept_end, _ in spans):
            spans.append((start, end, label))
    spans.sort()
    return spans


# One (text, {'entities': [...]}) tuple per row of df.
result = []
for text, keywords in zip(df['text'], df['keywords']):
    result.append((text, {'entities': _find_entity_spans(text, keywords)}))

In [16]:
# Training examples consumed by main(): a list of
# (text, {'entities': [(start, end, label), ...]}) tuples.
# This binds a second name to the `result` list; it is not a copy.
TRAIN_DATA = result

In [17]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import spacy
from spacy import displacy



def main(new_model_name='threat_test', n_iter=20, LABEL = "THREAT_GROUP", train_data=None):
    """Train a blank English NER model on the annotated sentences and demo it.

    Parameters
    ----------
    new_model_name : str
        Currently unused; kept so existing calls keep working.
    n_iter : int
        Number of passes over the training examples.
    LABEL : str
        Entity label registered with the NER component.
    train_data : list, optional
        ``(text, {'entities': [(start, end, label), ...]})`` tuples. Defaults
        to the module-level ``TRAIN_DATA``.

    Side effects
    ------------
    Prints the loss dictionary after every pass, prints the entities found
    in a fixed test sentence, and finally calls ``displacy.serve``, which
    starts a local web server and keeps the call running while it serves.
    """
    nlp = spacy.blank('en')  # create blank Language class

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)
    optimizer = nlp.begin_training()

    # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder the
    # caller's list (and every other name bound to it) on each call.
    examples = list(TRAIN_DATA if train_data is None else train_data)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            batches = minibatch(examples, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    test_text = 'apt3 also known as ups gothic panda tg-011 is a sophisticated threat group that has been active since at least 2010'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Reuse the parsed doc instead of running the pipeline a second time.
    displacy.serve(doc, style='ent')

In [None]:
# Train with the default settings and print the demo entities. The cell
# stays busy afterwards because main() ends by calling displacy.serve().
main()

Losses {u'ner': 45.616202465517524}
Losses {u'ner': 34.501424441126616}
Losses {u'ner': 26.612149420847842}
Losses {u'ner': 19.47262269989022}
Losses {u'ner': 14.86779618980406}
Losses {u'ner': 11.543234692413737}
Losses {u'ner': 9.981635776888385}
Losses {u'ner': 6.078508888126309}
Losses {u'ner': 6.589563756615391}
Losses {u'ner': 5.462666871891496}
Losses {u'ner': 3.423468109031284}
Losses {u'ner': 3.184500525871258}
Losses {u'ner': 3.477061922782298}
Losses {u'ner': 2.8875157847607555}
Losses {u'ner': 2.468300917949294}
Losses {u'ner': 1.8711684199720249}
Losses {u'ner': 1.8899188461321006}
Losses {u'ner': 1.559912711816471}
Losses {u'ner': 1.8805002857553175}
Losses {u'ner': 0.8610457257382147}
Entities in 'apt3 also known as ups gothic panda tg-011 is a sophisticated threat group that has been active since at least 2010'
THREAT_GROUP apt3
THREAT_GROUP ups
THREAT_GROUP gothic
THREAT_GROUP panda
THREAT_GROUP tg-011

[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer

127.0.0.1 - - [02/Dec/2018 15:45:59] "GET / HTTP/1.1" 200 2474
127.0.0.1 - - [02/Dec/2018 15:46:00] "GET /favicon.ico HTTP/1.1" 200 2474
