# Create Training Data

In [1]:
import spacy
import json
import random
from spacy.training.example import Example


In [2]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded Python object (whatever the JSON document contains).
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed

def save_data(file, data):
    """Write ``data`` to ``file`` as JSON indented by four spaces.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced.

    Args:
        file: Destination path.
        data: JSON-serialisable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)
        
def convert(model, text):
    """Annotate ``text`` with ``model`` and return it in training-data form.

    Args:
        model: Callable pipeline that maps a string to a document exposing
            ``ents``; each entity must provide ``start_char``, ``end_char``
            and ``label_``.
        text: Raw text to annotate.

    Returns:
        ``[text, {"entities": [(start_char, end_char, label), ...]}]`` when at
        least one entity is found, otherwise an empty list.
    """
    # Run the pipeline that was passed in rather than relying on a global
    # named ``nlp`` happening to exist in the notebook namespace.
    doc = model(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if not entities:
        return []
    return [text, {"entities": entities}]
    

In [3]:
# Load the descriptions and the pipeline stored at ../ner_ruler/, which is used
# here to produce the entity annotations.
role_descriptions = load_data('../data/description.txt')
nlp = spacy.load("../ner_ruler/")

# Collect only the descriptions for which convert() returned annotations.
TRAIN_DATA = []
for description in role_descriptions:
    results = convert(nlp, description)
    if results is None or results == []:
        continue
    TRAIN_DATA.append(results)

In [4]:
# Hold out the last 20% of the examples for validation.
# NOTE: this cell rebinds TRAIN_DATA, so running it a second time without
# rebuilding TRAIN_DATA splits the already-reduced training set again.
valid_data_len = round(len(TRAIN_DATA) * 0.2)
# Slice at a non-negative index. With very few examples valid_data_len is 0,
# and the negative forms `[-0:]` / `[:-0]` would move every example into
# VALID_DATA and leave TRAIN_DATA empty.
split_index = len(TRAIN_DATA) - valid_data_len
VALID_DATA = TRAIN_DATA[split_index:]
TRAIN_DATA = TRAIN_DATA[:split_index]

In [5]:
# Persist both splits as JSON so later cells can reload them from disk.
save_data(file='./data/pl_training_data.json', data=TRAIN_DATA)
save_data(file='./data/pl_validate_data.json', data=VALID_DATA)

## Create Test Data

In [6]:
# Build the test set: annotate every test description with the pipeline stored
# at ../ner_ruler, keep those that yield annotations, and save them as JSON.
role_descriptions = load_data('../data/test_description.txt')
nlp = spacy.load("../ner_ruler")

TEST_DATA = []
for description in role_descriptions:
    results = convert(nlp, description)
    if results not in (None, []):
        TEST_DATA.append(results)

save_data('data/pl_test_data.json', TEST_DATA)

# Train Model (V2)


In [110]:
def train_spacy(data, iterations):
    """Train a blank English pipeline containing a single NER component.

    Args:
        data: Sequence of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``"entities"`` entry lists spans with the label at
            index 2.
        iterations: Number of passes to make over ``data``.

    Returns:
        The trained pipeline object created by ``spacy.blank("en")``.
    """
    # Work on a private copy so shuffling does not reorder the caller's list.
    train_data = list(data)

    nlp = spacy.blank("en")
    # Bind the component on both paths so it is always defined below.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.add_pipe("ner", last=True)

    # Register every label that occurs in the annotations before initialising.
    for _, annotations in train_data:
        for ent in annotations.get("entities") or []:
            ner.add_label(ent[2])

    # Keep only the NER component enabled while training.
    with nlp.select_pipes(enable="ner"):
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print(f"Starting iteration {itn}")
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update(
                    [example],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses,
                )
            # Losses accumulated by nlp.update over this pass.
            print(losses)
    return nlp


                    
            

In [None]:
# Reload the saved training split, train for 30 passes over it, and write the
# resulting pipeline to the "pl_ner_model" directory.
TRAIN_DATA = load_data(file="data/pl_training_data.json")
nlp = train_spacy(data=TRAIN_DATA, iterations=30)
nlp.to_disk("pl_ner_model")

In [122]:
# Read the sample text and flatten it into one string: every newline character
# is replaced by a space and the lines are concatenated in file order.
# The encoding is explicit (matching load_data/save_data) so the text read does
# not depend on the platform's default encoding.
with open("data/test.txt", "r", encoding="utf-8") as f:
    sentence_array = [line.replace('\n', " ") for line in f]
test = "".join(sentence_array)

In [124]:
# Load the pipeline saved in "pl_ner_model", run it over the flattened text,
# and print each detected entity followed by its label.
nlp = spacy.load("pl_ner_model")
doc = nlp(test)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")
    



Python PROGLANG
Python PROGLANG
Python PROGLANG
Python PROGLANG
Java PROGLANG
Java PROGLANG
Java PROGLANG
scala PROGLANG
Java PROGLANG
Java PROGLANG
