# Extracting entities given the dataset with help of LLM prompt
Entity Extraction with Generative Models


In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
# reading the data
df = pd.read_json('../data/relations_dev.txt')

In [39]:
#df_nested_list.head()

In [32]:
# installing the libe
!pip install cohere requests tqdm

In [92]:
import cohere
import pandas as pd
import requests
import datetime
from tqdm import tqdm
pd.set_option('display.max_colwidth', None)

In [139]:
def get_job_titles():
    data = pd.read_json('../data/relations_dev.txt')
    #data['document'].head()
    return[a['document'] for a in data]

In [42]:
# API
my_API = "nMqcDQ2gC0vxFVwIDvUINps6zJTvYZOskiz5dhq1"

In [94]:
# connecting to cohere
co = cohere.Client(my_API)

## Preparing examples for the prompt
In our prompt, we'll present the model with examples for the type of output we're after. We basically get a set of subreddit article titles, and label them ourselves. The label here is the name of the movie mentioned in the title (and "none" if no movie is mentioned).

In [115]:
 # job description
job_examples =[("text", "Bachelor",
        "start", '0',
        "end", '8',
        "token_start", '0',
        "token_end", '0',
        "entityLabel", "DIPLOMA"),  
        ("text", "Mechanical Engineering",
        "start", '21',
        "end", '43',
        "token_start", '4',
        "token_end", '5',
        "entityLabel", "DIPLOMA_MAJOR"), 
        ( "text", "10+ years",
        "start", '0',
        "end", '9',
        "token_start", '0',
        "token_end", '2',
        "entityLabel", "EXPERIENCE"),
        ("text", "8+ years",
        "start", '0',
        "end", '8',
        "token_start", '0',
        "token_end", '2',
        "entityLabel", "EXPERIENCE")]

### Creating the extraction prompt

In [111]:
class cohereExtractor():
    def __init__(self, examples, example_labels, labels, task_desciption, example_prompt):
        self.examples = examples
        self.example_labels = example_labels
        self.labels = labels
        self.task_desciption = task_desciption
        self.example_prompt = example_prompt

    def make_prompt(self, example):
        examples = self.examples + [example]
        labels = self.example_labels + [""]
        return (self.task_desciption +
                "\n---\n".join( [examples[i] + "\n" +
                                self.example_prompt + 
                                 labels[i] for i in range(len(examples))]))

    def extract(self, example):
        extraction = co.generate(
          model='large',
          prompt=self.make_prompt(example),
          max_tokens=10,
          temperature=0.1,
          stop_sequences=["\n"])
        return(extraction.generations[0].text[:-1])


cohereJobExtractor = cohereExtractor([e[1] for e in job_examples], 
                                       [e[0] for e in job_examples], [],
                                       "", 
                                       "the entites:")

In [114]:
# This is what the prompt looks like:
print(cohereJobExtractor.make_prompt("text"))

Bachelor
the entites:text
---
Mechanical Engineering
the entites:text
---
10+ years
the entites:text
---
8+ years
the entites:text
---
starts
the entites:


## Getting the data
Make the API call to get the jobs.

In [153]:
num_posts = 10

job_list = get_post_titles(size=num_posts, 
      after=str(int(datetime.datetime(2022,1,1,0,0).timestamp())), 
      before=str(int(datetime.datetime(2023,1,1,0,0).timestamp())), 
      subreddit="jobs", 
      sort_type="score", 
      sort="desc")

# Show the list
job_list

['Causes of the Great Resignation',
 'Sent a thank you email, and I just heard back…',
 'Boss wants me to tell him how I can progress… I don’t know if I can.',
 'I GOT A NEW JOB!!',
 'How do I not take rejections personally?',
 'Resigned with 2 weeks\' notice, then told that I would be "ineligible for rehire" unless I gave 4 weeks\' notice',
 'Wife (F28) in Marketing facing a brick wall in the way of career development',
 'Rejected for lack of experience',
 'I got a follow up call after an interview!',
 "I'm TIRED of dealing with recruiters, but applying to jobs lately directly on their portal/site is just like yelling into the void and I NEVER hear back. Not sure what's wrong with everyone's hiring processes."]

## Running the model
And now we loop over the posts and process each one of them with our extractor.

In [154]:
results = []
for text in tqdm(job_list):
    try:
        extracted_text = cohereJobExtractor.extract(text)
        results.append(extracted_text)
    except Exception as e:
        print('ERROR: ', e)

100%|██████████| 10/10 [00:21<00:00,  2.20s/it]


In [155]:
# look at the result
pd.DataFrame(data={'text': job_list, 'extracted_entity': results})

Unnamed: 0,text,extracted_entity
0,Causes of the Great Resignation,text
1,"Sent a thank you email, and I just heard back…",text
2,Boss wants me to tell him how I can progress… I don’t know if I can.,text
3,I GOT A NEW JOB!!,text
4,How do I not take rejections personally?,text
5,"Resigned with 2 weeks' notice, then told that I would be ""ineligible for rehire"" unless I gave 4 weeks' notice",text
6,Wife (F28) in Marketing facing a brick wall in the way of career development,text
7,Rejected for lack of experience,text
8,I got a follow up call after an interview!,text
9,"I'm TIRED of dealing with recruiters, but applying to jobs lately directly on their portal/site is just like yelling into the void and I NEVER hear back. Not sure what's wrong with everyone's hiring processes.",text


## Testing the extraction

In [150]:
test_df = pd.read_json('../data/relations_test.txt')
test_df.head(1)

Unnamed: 0,document,tokens,relations
0,"\nCurrently holding a faculty, industry, or government researcher position.\nPh.D. and publications in machine learning, AI, computer science, statistics, applied mathematics, data science, or related technical fields.\nExperience leading a team in solving analytical problems using quantitative approaches.\nExperience manipulating and analyzing data from different sources.\nExperience in theoretical and empirical research and for answering questions with research.\nAbility to communicate research for public audiences of peers.\nKnowledge in a programming language.\nAbility to obtain and maintain work authorization in the country of employment in 2018.\n\nPREFERRED \n1+ year(s) of work experience in a university, industry, or government lab(s), in a role with primary emphasis on AI research.\nExperience driving original scholarship in collaboration with a team.\nFirst-author publications at peer-reviewed AI conferences (e.g. NIPS, CVPR, ICML, ICLR, ICCV, and ACL).\nExperience in developing and debugging in C/C++, Python, C# and/or Java.","[{'text': 'Ph.D.', 'start': 75, 'end': 80, 'token_start': 14, 'token_end': 14, 'entityLabel': 'DIPLOMA'}, {'text': 'machine learning', 'start': 101, 'end': 117, 'token_start': 18, 'token_end': 19, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'AI', 'start': 119, 'end': 121, 'token_start': 21, 'token_end': 21, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'computer science', 'start': 123, 'end': 139, 'token_start': 23, 'token_end': 24, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'statistics', 'start': 141, 'end': 151, 'token_start': 26, 'token_end': 26, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'applied mathematics', 'start': 153, 'end': 172, 'token_start': 28, 'token_end': 29, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'data science', 'start': 174, 'end': 186, 'token_start': 31, 'token_end': 32, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': '1+ year(s', 'start': 664, 'end': 673, 'token_start': 113, 'token_end': 115, 'entityLabel': 'EXPERIENCE'}, {'text': 'university', 'start': 699, 'end': 709, 'token_start': 122, 'token_end': 122, 'entityLabel': 'SKILLS'}, {'text': 'industry', 'start': 711, 'end': 719, 'token_start': 124, 'token_end': 124, 'entityLabel': 'SKILLS'}, {'text': 'government lab(s)', 'start': 724, 'end': 741, 'token_start': 127, 'token_end': 129, 'entityLabel': 'SKILLS'}, {'text': 'AI', 'start': 778, 'end': 780, 'token_start': 138, 'token_end': 138, 'entityLabel': 'SKILLS'}]","[{'child': 18, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 21, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 23, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 26, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 28, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 31, 'head': 14, 'relationLabel': 'DEGREE_IN'}, {'child': 122, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 124, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 127, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 138, 'head': 113, 'relationLabel': 'EXPERIENCE_IN'}]"


**to be continued...