In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pytorch_pretrained_bert seqeval transformers &> /dev/null
#import spacy


In [None]:
import numpy as np
import pandas as pd

import spacy
from spacy.gold import biluo_tags_from_offsets
#nlp = spacy.load("en_core_web_trf")

from tqdm import trange
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

from seqeval.metrics import classification_report, accuracy_score, f1_score

# Adding '\n' to the default spacy tokenizer
nlp = spacy.blank('en')
prefixes = ('\\n', ) + nlp.Defaults.prefixes
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

In [None]:
# Personal Custom Tags Dictionary
entity_dict = {
    'Name': 'NAME', 
    'College Name': 'CLG',
    'Degree': 'DEG',
    'Graduation Year': 'GRADYEAR',
    'Years of Experience': 'YOE',
    'Companies worked at': 'COMPANY',
    'Designation': 'DESIG',
    'Skills': 'SKILLS',
    'Location': 'LOC',
    'Email Address': 'EMAIL'
}

#extra
entities = ['NAME', 'CLG', 'DEG', 'GRADYEAR', 'YOE', 'COMPANY', 'DESIG', 'SKILLS', 'LOC', 'EMAIL']

In [8]:
# loading the dataset
df = pd.read_json("/content/drive/MyDrive/Colab Notebooks/datasets/RP/NER_RESUME.json", lines=True)
df = df.drop(['extras'], axis=1)
df["content"] = df["content"].apply(lambda x: x.replace("\n", " "))
df.head(5)

0    Abhishek Jha Application Development Associate...
1    Afreen Jamadar Active member of IIIT Committee...
2    Akhil Yadav Polemaina Hyderabad, Telangana - E...
3    Alok Khandai Operational Analyst (SQL DBA) Eng...
4    Ananya Chavan lecturer - oracle tutorials  Mum...
Name: content, dtype: object

In [10]:
def mergeIntervals(intervals):
    sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
    merged = []

    for higher in sorted_by_lower_bound:
        if not merged:
            merged.append(higher)
        else:
            lower = merged[-1]
            if higher[0] <= lower[1]:
                if lower[2] is higher[2]:
                    upper_bound = max(lower[1], higher[1])
                    merged[-1] = (lower[0], upper_bound, lower[2])
                else:
                    if lower[1] > higher[1]:
                        merged[-1] = lower
                    else:
                        merged[-1] = (lower[0], higher[1], higher[2])
            else:
                merged.append(higher)

    return merged

In [11]:
# From 'annotation' column, we are extracting the starting index, ending index, entity label
# So that we can convert the content in BILOU format

def get_entities(df):
    
    entities = []
    
    for i in range(len(df)):
        entity = []
    
        for annot in df['annotation'][i]:
            try:
                ent = entity_dict[annot['label'][0]]
                start = annot['points'][0]['start']
                end = annot['points'][0]['end'] + 1
                entity.append((start, end, ent))
            except:
                pass
    
        entity = mergeIntervals(entity)
        entities.append(entity)
    
    return entities

In [12]:
# Adding a new column 'entities'
df['annotation'] = get_entities(df)
df.head()

Unnamed: 0,content,annotation
0,Abhishek Jha\nApplication Development Associat...,"[(0, 12, NAME), (13, 46, DESIG), (49, 58, COMP..."
1,Afreen Jamadar\nActive member of IIIT Committe...,"[(0, 14, NAME), (62, 68, LOC), (104, 148, EMAI..."
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[(0, 21, NAME), (22, 31, LOC), (65, 117, EMAIL..."
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[(0, 12, NAME), (13, 51, DESIG), (54, 60, COMP..."
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[(0, 13, NAME), (14, 22, DESIG), (24, 41, COMP..."


In [14]:
def get_train_data(df):
    tags = []
    sentences = []

    for i in range(len(df)):
        text = df['content'][i]
        entities = df['annotation'][i]
    
        doc = nlp(text)
    
        tag = biluo_tags_from_offsets(doc, entities)
        tmp = pd.DataFrame([list(doc), tag]).T
        loc = []
        for i in range(len(tmp)):
            if tmp[0][i].text is '.' and tmp[1][i] is 'O':
                loc.append(i)
        loc.append(len(doc))
    
        last = 0
        data = []
        for pos in loc:
            data.append([list(doc)[last:pos], tag[last:pos]])
            last = pos
    
        for d in data:
            tag = ['O' if t is '-' else t for t in d[1]]
            if len(set(tag)) > 1:
                sentences.append(d[0])
                tags.append(tag)
    
    return sentences, tags

In [15]:
sentences, tags = get_train_data(df)
len(sentences), len(tags)

(781, 781)

In [16]:
tag_vals = set(['X', '[CLS]', '[SEP]'])
for i in range(len(tags)):
    tag_vals = tag_vals.union(tags[i])
#tag_vals

In [17]:
tag2idx = {t: i for i, t in enumerate(tag_vals)}
#tag2idx

In [18]:
idx2tag = {tag2idx[key] : key for key in tag2idx.keys()}
#idx2tag

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [20]:
#tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=False)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased', do_lower_case=False)

In [21]:
def get_tokenized_train_data(sentences, tags):

    tokenized_texts = []
    word_piece_labels = []

    for word_list, label in zip(sentences, tags):
    
        # Add [CLS] at the front
        temp_lable = ['[CLS]']
        temp_token = ['[CLS]']
    
        for word, lab in zip(word_list, label):
            token_list = tokenizer.tokenize(word.text)
            for m, token in enumerate(token_list):
                temp_token.append(token)
                if m == 0:
                    temp_lable.append(lab)
                else:
                    temp_lable.append('X')  
                
        # Add [SEP] at the end
        temp_lable.append('[SEP]')
        temp_token.append('[SEP]')
    
        tokenized_texts.append(temp_token)
        word_piece_labels.append(temp_lable)
    
    return tokenized_texts, word_piece_labels

In [22]:
tokenized_texts, word_piece_labels = get_tokenized_train_data(sentences, tags)

In [23]:
print(tokenized_texts[0])
print(word_piece_labels[0])

['[CLS]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '-', '[UNK]', '[UNK]', ',', '[UNK]', '-', '[UNK]', 'me', 'on', '[UNK]', ':', 'indeed', '.', 'com', '/', 'r', '/', '[UNK]', '-', '[UNK]', '/', '10', '##e', '##7', '##a', '##8', '##cb', '##7', '##32', '##bc', '##43', '##a', '•', '[UNK]', 'work', 'for', 'an', 'organization', 'which', 'provides', 'me', 'the', 'opportunity', 'to', 'improve', 'my', 'skills', 'and', 'knowledge', 'for', 'my', 'individual', 'and', 'company', "'", 's', 'growth', 'in', 'best', 'possible', 'ways', '[SEP]']
['[CLS]', 'B-NAME', 'L-NAME', 'B-DESIG', 'I-DESIG', 'L-DESIG', 'O', 'U-COMPANY', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-EMAIL', 'I-EMAIL', 'I-EMAIL', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'X', 'O', 'O', 'O', 'O', 'O', '[SEP]']


In [24]:
MAX_LEN = 512
bs = 2

In [25]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
#print(len(input_ids[0]))
#print(input_ids[0])

In [26]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels], maxlen=MAX_LEN, value=tag2idx["O"], 
                     padding="post", dtype="long", truncating="post")
print(len(tags[0]))
print(tags[0])

512
[ 7  8  6 33 26 28 23 27 22 23 23 23 23 23 23  2 17 17 13 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 13 13 13 13 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 13 23 23 23 23 23  5 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 2

In [27]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [28]:
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(input_ids, tags, attention_masks, random_state=2020, 
                                                                                 test_size=0.3)

In [29]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [30]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [31]:
model = BertForTokenClassification.from_pretrained("bert-large-uncased", num_labels=len(tag2idx))

In [32]:
model.cuda();

In [33]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [34]:
import torch
torch.cuda.empty_cache()

In [35]:
epochs = 13
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
        #print(".[]....")
        #print("....[].")
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   8%|▊         | 1/13 [03:48<45:47, 228.94s/it]

Train loss: 0.8820762320425047


Epoch:  15%|█▌        | 2/13 [07:39<42:09, 229.93s/it]

Train loss: 0.5484054081496739


Epoch:  23%|██▎       | 3/13 [11:30<38:22, 230.23s/it]

Train loss: 0.4609235546827971


Epoch:  31%|███       | 4/13 [15:20<34:33, 230.38s/it]

Train loss: 0.4023334008228757


Epoch:  38%|███▊      | 5/13 [19:11<30:43, 230.42s/it]

Train loss: 0.3113791994859666


Epoch:  46%|████▌     | 6/13 [23:01<26:53, 230.44s/it]

Train loss: 0.2392332181743868


Epoch:  54%|█████▍    | 7/13 [26:52<23:02, 230.50s/it]

Train loss: 0.20443352115180194


Epoch:  62%|██████▏   | 8/13 [30:42<19:12, 230.55s/it]

Train loss: 0.17460755896396363


Epoch:  69%|██████▉   | 9/13 [34:33<15:21, 230.50s/it]

Train loss: 0.15355948534974959


Epoch:  77%|███████▋  | 10/13 [38:23<11:31, 230.48s/it]

Train loss: 0.127370508356129


Epoch:  85%|████████▍ | 11/13 [42:14<07:40, 230.48s/it]

Train loss: 0.11430928743940748


Epoch:  92%|█████████▏| 12/13 [46:04<03:50, 230.54s/it]

Train loss: 0.09304973166717401


Epoch: 100%|██████████| 13/13 [49:55<00:00, 230.43s/it]

Train loss: 0.08442735569355429





In [42]:
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/')


IsADirectoryError: ignored

In [2]:
# save
output_model = '/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/large-uncased-bert-300.pth'

# def save(model, optimizer):
#     # save
#     torch.save({
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict()
#     }, output_model)

# save(model, optimizer)

#load
checkpoint = torch.load(output_model, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


NameError: ignored

In [43]:
from transformers import AutoTokenizer
from pytorch_pretrained_bert import BertForSequenceClassification

tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/large-uncased-bert-30")

model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/large-uncased-bert-30")


AttributeError: ignored

In [39]:
# tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/")
# model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/")

In [None]:
import json
json.dump(model.to_json(), open("/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Auto Tokenizer/large-uncased-bert-300.json", "w"))

In [41]:
model.eval()

y_true = []
y_pred = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    #inputing data in cpu
    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None, attention_mask=input_mask,)

    logits = logits.detach().cpu().numpy()
    logits = [list(p) for p in np.argmax(logits, axis=2)]
    
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    
    #evaluation main
    for i,mask in enumerate(input_mask):
        temp_1 = [] # Real one
        temp_2 = [] # Predict one
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if idx2tag[label_ids[i][j]] != "X" and idx2tag[label_ids[i][j]] != "[CLS]" and idx2tag[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    #check which are true
                    for k in entities:
                      if idx2tag[logits[i][j]]==k:
                        print(idx2tag[logits[i][j]])
                    temp_1.append(idx2tag[label_ids[i][j]])
                    temp_2.append(idx2tag[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)
print(y_true)
print(y_pred)

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

print(classification_report(y_true, y_pred,digits=4))

[['B-NAME', 'L-NAME', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DESIG', 'L-DESIG', 'O', 'O', 'O', 'U-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DEG', 'I-DEG', 'I-DEG', 'I-DEG', 'I-DEG', 'I-DEG', 'I-DEG', 'I-DEG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-SKILLS', 'I-S

  _warn_prf(average, modifier, msg_start, len(result))


### SPACY

In [2]:
pip install spacy==2.2.4 fitz textract tika &> /dev/null

In [2]:
import spacy #2.2.4 only compatabile

model = '/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/spacy/model'
nlp_model = spacy.load(model)

In [26]:
import textract
import re
from tika import parser 
fname = '/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Test- Resumes/Resumes/Resume 2 pdf/type_14.pdf'

# text1 = textract.process('/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Test- Resumes/Profile.pdf')
# text2 = textract.process('/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/Test- Resumes/test.pdf')

raw = parser.from_file(fname)
print(raw['content'])


def pre_process(text):
  text = re.sub(r'[^\x00-\x7f]',r'', text)
  text = re.sub("\n", " ",text)
  return text

#pre_process(text2) 









































2018 - 2021

2018

2015

Jennifer Lawrence
3535 Rosewood Court City, State, Zip: Rochester, Minnesota(MN), 55902

507-270-2214 | lemuel1980@gmail.com

Objective

Diligent Manager with 8+ years experience with a large cloud services agency. Seeking to further IT career as a
Department Manager at Aptive by leveraging my statistical, managerial and communicative skills.

Experience

Microsoft
Senior Software Developer
Oversaw software development and coded profitable apps using C++ and C#
Developed cross platform compatible solutions.

Education

Carnegie Mellon University
Masters of Computer Science 
9.6

Pillai College of Engineering
Bachelor of Information Technology 
7.8

Skills

C

C++

Data Structures and Algorithms

Python

Java

Machine Learning

Data Analytics

PostgreSQL

Oracle SQL

Javascript

Pearl

Projects

Customer Relationship Management
This CRM project is an integrated approach of E-mail and user management to acquiring, recognizi

In [27]:
import pickle
train_data= pickle.load(open('/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/spacy/train_data.pkl','rb')) #trying model 2 bigger dataset


doc = nlp_model(raw['content'])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}} - {ent.text}')

DESIGNATION                    - Diligent Manager
YEARS OF EXPERIENCE            - 8+ years experience with a large cloud services agency. Seeking to further IT career as a
Department Manager at Aptive by leveraging my statistical, managerial and communicative skills.

Experience

Microsoft
Senior Software Developer
Oversaw software development and coded profitable apps using C++ and C#
Developed cross platform compatible solutions.

Education

Carnegie Mellon University
Masters of Computer Science 
9.6

Pillai College of Engineering
Bachelor of Information Technology 
7.8

Skills

C
DEGREE                         - C++
DEGREE                         - Java
COLLEGE NAME                   - Machine Learning
COMPANIES WORKED AT            - Oracle SQL
DESIGNATION                    - Customer Relationship Management


In [34]:
import pickle
import json
import spacy
import random
import spacy
from spacy.util import minibatch, compounding 
from pathlib import Path
import os

#LABEL = ['Email Address', 'Links', 'Skills', 'Graduation Year', 'College Name', 'Degree', 'Companies worked at', 'Location', 'Name', 'Designation', 'projects', 'Years of Experience', 'Can Relocate to', 'UNKNOWN', 'Rewards and Achievements', 'Address', 'University', 'Relocate to', 'Certifications', 'state', 'links', 'College', 'training', 'des', 'abc']

def custom_nlp_train(filename):
    with open(filename, 'rb') as fp:
        doc = pickle.load(fp)
    
    ## Creating a blank spacy model
    nlp = spacy.blank('en')
    print("Created a blank en model")

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
        print("Added ner pipe")
    else:
        ner.get_pipe('ner')

    # for i in LABEL:
    #     ner.add_label(i)


    # add labels
    for _, annotations in doc:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
        
    optimizer = nlp.begin_training()
    
    for itn in range(30):
                random.shuffle(doc)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(doc, size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=0.3,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                print("Losses", losses)

    output_dir = os.path.dirname(os.path.realpath(__file__)) + "/training_data"
    new_model_name="customspacy"
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()

    nlp.meta["name"] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)           
            

    


if __name__ == "__main__":
    custom_nlp_train('/content/drive/MyDrive/Colab Notebooks/datasets/d.pkl')   

Created a blank en model
Added ner pipe


ValueError: ignored

In [8]:
import pickle
import spacy
import random
nlp = spacy.blank('en')
train_data= pickle.load(open('/content/drive/MyDrive/Colab Notebooks/datasets/d.pkl','rb'))

#defining the traning model

def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner=nlp.create_pipe('ner')
        nlp.add_pipe(ner,last=True)
        
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])

# Now to remove other pipelines - we define this here - READ SPACY WEBSITE

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
            optimizer = nlp.begin_training()
            for itn in range(50):
                print("Statring iteration " + str(itn))
                random.shuffle(train_data)
                losses = {}
                index = 0
                for text, annotations in train_data:
                    try:
                        nlp.update(
                            [text],  # batch of texts
                            [annotations],  # batch of annotations
                            drop=0.2,  # dropout - make it harder to memorise data
                            sgd=optimizer,  # callable to update weights
                            losses=losses)
                    except Exception as e:
                        pass
                    
                print(losses)

In [None]:
train_model(train_data) #7159

In [10]:
nlp.to_disk('/content/drive/MyDrive/Colab Notebooks/Models/DeepBlue/700-new-code')