In [1]:
import pandas as pd
import cohere
import dvc.api as DvcApi
from dotenv import load_dotenv 

In [2]:
import sys, os

sys.path.append(os.path.abspath(os.path.join("../..")))
sys.path.append(os.path.abspath(os.path.join("../scripts")))

In [3]:
import prompt_functions as promptf

In [4]:
load_dotenv(encoding='utf-16')

api_key = os.getenv('apikey')

In [5]:
# Create and retrieve a Cohere API key from os.cohere.ai
co = cohere.Client(api_key)

In [6]:
pd.set_option('display.max_colwidth', None)

# Named Entity Recognition (NER)  with Generative Model of Cohere

In this notebook we use Cohere's generative models to extract entities from a job description. We make use of sturctured generation based on providing multiple examples in the prompt.

The data are job descriptions (together named entities) and relationships between entities in json format. In this preliminary work we will not consider the relationships as NER alone is not enough to achieve this.

- Dataset 1: For development and training
- Dataset 2: For testing and final reporting


## Preparing examples for the prompt

In our prompt, we'll present the model with examples of job descriptions and the type of output we're after.

In [7]:
path = "data/relations_dev.json"
repo = "../"
version = "v2"

data_url = DvcApi.get_url(path = path, repo = repo, rev = version) #could be tag or git commit
data_dev = pd.read_json(data_url)

In [35]:
# we put extracted entities in a clearer way

df = data_dev.drop(columns = ['tokens', 'relations'])
df['tokens'] = data_dev['tokens'].apply(lambda x: promptf.streamline_tokens(x))

In [36]:
df.head(1)

Unnamed: 0,document,tokens
0,"Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience",DIPLOMA: Bachelor\nDIPLOMA_MAJOR: Mechanical Engineering\nDIPLOMA_MAJOR: Physical Science\nEXPERIENCE: 3+ years\nSKILLS: developing\nSKILLS: fiber optic cables\nSKILLS: connector related products


## Zero-Shot Learning

In [10]:
# number of examples is zero

num_Ex = 0

prompt = promptf.make_prompt(data_dev, num_Ex, same_file= True)

In [11]:
# let's look at the prompt

print(prompt)

DOCUMENT: Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience
EXTRACTED TEXT:


In [12]:
extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 100, temperature = 0.1)

In [13]:
print(extraction.generations[0].text)


The Senior Product Manager will be responsible for the development and management of the product line of fiber optic cables and connector related products. The Senior Product Manager will be responsible for the development of new products, product enhancements, and product line extensions. The Senior Product Manager will be responsible for the development of product specifications, product life cycle management, and product costing. The Senior Product Manager will be responsible for the development of product marketing and sales strategies. The Senior Product Manager will be responsible for the development of


In [14]:
# compared to the expected result

print(df.iloc[num_Ex]['tokens'])
#print(promptf.test_labels(data_dev, num_Ex))

DIPLOMA: Bachelor
DIPLOMA_MAJOR: Mechanical Engineering
DIPLOMA_MAJOR: Physical Science
EXPERIENCE: 3+ years
SKILLS: developing
SKILLS: fiber optic cables
SKILLS: connector related products


As expected, zero-shot learning doesn't give us a good answer

_________________________________________________________________________________________________________________________

## Few-Shot Learning

### 1. Include 1-4 examples

In [15]:
# we include 1 example

num_Ex = 1
prompt = promptf.make_prompt(data_dev, num_Ex, same_file= True)

extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 100, temperature = 0.1)

In [16]:
print(extraction.generations[0].text)

print('\ncompared to the expected result:\n')

print(df.iloc[num_Ex]['tokens'])


DIPLOMA: Bachelor
DIPLOMA_MAJOR: Computer Science
EXPERIENCE: 10+ years
SKILLS: release automation engineering
SKILLS: CI/CD or related roles
SKILLS: consumer electronics devices
SKILLS: technical teams
SKILLS: performance management
--
DOCUMENT: Bachelor's degree in Computer Science or related field. 5+ years of experience in software engineering. Experience in release

compared to the expected result:

EXPERIENCE: 10+ years
SKILLS: software engineering
EXPERIENCE: 5+ years
SKILLS: technical management
SKILLS: release engineering
SKILLS: tools engineering
SKILLS: DevOps
DIPLOMA: BS/MS
DIPLOMA_MAJOR: Computer Science


The model is extracting too many labels.

To solve this we lower the maximum number of tokens it can generate.

In [17]:
# we include 2 examples

num_Ex = 2
prompt = promptf.make_prompt(data_dev, num_Ex, same_file= True)

extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 50, temperature = 0.1)

In [18]:
print(extraction.generations[0].text)

print('\ncompared to the expected result:\n')

print(df.iloc[num_Ex]['tokens'])


EXPERIENCE: 3+ years
SKILLS: Swift
SKILLS: Objective-C
SKILLS: iOS internals
SKILLS: app from scratch
SKILLS: portfolio of apps featured in

compared to the expected result:

EXPERIENCE: 3+ years
SKILLS: Swift & Objective-C


In [19]:
# we include 3 examples

num_Ex = 3
prompt = promptf.make_prompt(data_dev, num_Ex, same_file= True)

extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 50, temperature = 0.1)

In [20]:
print(extraction.generations[0].text)

print('\ncompared to the expected result:\n')

print(df.iloc[num_Ex]['tokens'])


EXPERIENCE: 8+ years
SKILLS: software engineering
EXPERIENCE: 5+ years
SKILLS: people management
SKILLS: managing leaders
SKILLS: managing remotely across regions
SK

compared to the expected result:

EXPERIENCE: 8+ years
SKILLS: software engineering
EXPERIENCE: 5+ years
SKILLS: people management
SKILLS: managing leaders


In [21]:
# we include 4 examples

num_Ex = 4
prompt = promptf.make_prompt(data_dev, num_Ex, same_file= True)

extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 50, temperature = 0.5)

In [22]:
print(extraction.generations[0].text)

print('\ncompared to the expected result:\n')

print(df.iloc[num_Ex]['tokens'])


EXPERIENCE: 7+ years
SKILLS: C++
EXPERIENCE: 5+ years
SKILLS: software engineering
SKILLS: real-time environments
SKILLS: game or robotics


compared to the expected result:

DIPLOMA: BS
DIPLOMA_MAJOR: Computer Science
EXPERIENCE: 7+ years
SKILLS: C++
SKILLS: C++11
EXPERIENCE: 5+ years
SKILLS: creating software for real-time environments
SKILLS: games
SKILLS: robotics
EXPERIENCE: 2+ years
SKILLS: managing software engineers


### 3. Validate using 5 examples

In [23]:
%%time
extracted = []

for i in range(5, len(data_dev)):
    index = i
    prompt = promptf.make_prompt(data_dev, index, numEx= 5, same_file= True)
    
    extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 50, temperature = 0.5)
    
    extracted.append(extraction.generations[0].text.strip())

CPU times: total: 734 ms
Wall time: 33.3 s


In [37]:
df.drop(range(5), inplace= True)

df['extractedEntities']  = extracted

In [49]:
# Compare the tokens to the extracted text
df['correct'] = (df['tokens'].str.lower() == df['extractedEntities'].str.lower()).astype(int)

# Print the accuracy
print(f'Accuracy {df["correct"].mean() *100}%')

Accuracy 0.0%


In [None]:
def fun(text):
    x = text.split

In [43]:
x = 'a\nb\nc\nd'

y = x.split()
y

['a', 'b', 'c', 'd']

In [61]:
x = 'A: B'
y = (x.split(':'))
y

TypeError: map() must have at least two arguments.

In [54]:
x = 'a'
y = ['a', 'b']

x in y

True

In [55]:
y.pop('a')
y

TypeError: 'str' object cannot be interpreted as an integer

In [None]:
"""
from concurrent.futures import ThreadPoolExecutor

extracted = []
# Run the model to extract the entities
with ThreadPoolExecutor(max_workers=8) as executor:
    for i in executor.map(cohereMovieExtractor.extract, test_df['text']):
        extracted.append(str(i).strip())
# Save results
test_df['extracted_text'] = extracted

"""

_________________________________________________________________________________________________________________________

In [81]:
"""
def extract(self, example):
    extraction = co.generate(model='large', prompt=self.make_prompt(example), max_tokens=10, 
                             temperature=0.1, stop_sequences=["\n"])
    return(extraction.generations[0].text[:-1])
"""
    
extraction = co.generate( model = 'large', prompt = prompt, max_tokens = 100, temperature = 0.1,
                        stop_sequences = ["--"])

In [83]:
extraction.generations[0]

cohere.Generation {
	text: 
EXPERIENCE: 5+ years
SKILLS: backend services
EXPERIENCE: 4+ years
SKILLS: systems
SKILLS: data infrastructure
--
	likelihood: None
	token_likelihoods: None
}