# 1. Get the packages

In [19]:
import cohere
import pandas as pd
import requests
import datetime
from tqdm import tqdm
pd.set_option('display.max_colwidth', None)


def get_post_titles(**kwargs):
    """ Gets data from the pushshift api. Read more: https://github.com/pushshift/api """
    base_url = f"https://api.pushshift.io/reddit/search/submission/"
    payload = kwargs
    request = requests.get(base_url, params=payload)
    return [a['title'] for a in request.json()['data']]


# 2. Get the Data

In [48]:
df = pd.read_json('../data/entity/relations_dev.txt')
df[:1]

Unnamed: 0,document,tokens,relations
0,"Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience","[{'text': 'Bachelor', 'start': 0, 'end': 8, 'token_start': 0, 'token_end': 0, 'entityLabel': 'DIPLOMA'}, {'text': 'Mechanical Engineering', 'start': 21, 'end': 43, 'token_start': 4, 'token_end': 5, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': 'Physical Science', 'start': 47, 'end': 63, 'token_start': 7, 'token_end': 8, 'entityLabel': 'DIPLOMA_MAJOR'}, {'text': '3+ years', 'start': 64, 'end': 72, 'token_start': 9, 'token_end': 11, 'entityLabel': 'EXPERIENCE'}, {'text': 'developing', 'start': 89, 'end': 99, 'token_start': 15, 'token_end': 15, 'entityLabel': 'SKILLS'}, {'text': 'fiber optic cables', 'start': 114, 'end': 132, 'token_start': 18, 'token_end': 20, 'entityLabel': 'SKILLS'}, {'text': 'connector related products', 'start': 137, 'end': 163, 'token_start': 22, 'token_end': 24, 'entityLabel': 'SKILLS'}]","[{'child': 4, 'head': 0, 'relationLabel': 'DEGREE_IN'}, {'child': 7, 'head': 0, 'relationLabel': 'DEGREE_IN'}, {'child': 15, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 18, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}, {'child': 22, 'head': 9, 'relationLabel': 'EXPERIENCE_IN'}]"


# preprocess the data

In [50]:

document_list = []
for i in range(df.shape[0]):
    ent = ""
    docu = df.document.iloc[i]
    for t in df.tokens.iloc[i]:
        ent += f"{t['entityLabel']+':'+t['text']}\n"
    
    document_list.append(docu+'\n'+ent+'--'+'\n')
    
with open('../data/output/entity.txt','w') as f:
    for item in document_list:
        # write each item on a new line
        f.write("%s\n" % item)
    print('Done')
    

Done


In [51]:
import cohere
api_key = pd.read_json("../config/apikey.json")['cohere_key'][0]
co = cohere.Client(api_key)


# Preparing examples for the prompt


# Running the model

In [55]:

def extract(prompt):
    extraction = co.generate(
        model='large',
        prompt=prompt,
        max_tokens=10,
        temperature=0.1,
        stop_sequences=["\n"])
    return(extraction.generations[0].text[:-1])


In [56]:
results = []
for text in tqdm(document_list):
    try:
        extracted_text = extract(text)
        results.append(extracted_text)
    except Exception as e:
        print('ERROR: ', e)


[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
[0m
100%|██████████| 22/22 [01:01<00:00,  2.79s/it]


In [57]:
test_df = pd.DataFrame(data={'text': document_list, 'extracted_text': results})
test_df.head()

Unnamed: 0,text,extracted_text
0,"Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience\nDIPLOMA:Bachelor\nDIPLOMA_MAJOR:Mechanical Engineering\nDIPLOMA_MAJOR:Physical Science\nEXPERIENCE:3+ years\nSKILLS:developing\nSKILLS:fiber optic cables\nSKILLS:connector related products\n--\n",
1,"10+ years of software engineering work experience. Technical experience in release automation engineering, CI/CD or related roles. Experience building and leading a software organization through product design, delivery and commercialization of consumer electronics devices. Experience recruiting and managing technical teams, including performance management. BS/MS in Computer Science. Experience in leading timeline, multi-partner initiatives. Organizational communication and coordination experience. PREFERRED 5+ years of experience with hands-on technical management, release engineering, tools engineering, DevOps, or related area.\nEXPERIENCE:10+ years\nSKILLS:software engineering\nEXPERIENCE:5+ years\nSKILLS:technical management\nSKILLS:release engineering\nSKILLS:tools engineering\nSKILLS:DevOps\nDIPLOMA:BS/MS\nDIPLOMA_MAJOR:Computer Science\n--\n",
2,"3+ years Swift & Objective-C and experience with iOS internals Experience building an entire app from scratch and ideally a portfolio of apps featured in the App Store Someone who knows every trick in the book on UI transitions, network communication and memory/battery efficiency Strong UI/design skill experience is a plus\nEXPERIENCE:3+ years\nSKILLS:Swift & Objective-C\n--\n",
3,"8+ years experience in software engineering leadership 5+ years people management experience including managing leaders and managing remotely across regions Strategic thinker with proven track record of transforming operations to provide customer experience through innovation and improvement Track record of working with VP, C-level Executives Experience deploying operational support models across enterprise organizations Communication/presentations experience Experience working with all levels of management internally and externally Experience meeting objectives in an entrepreneurial environment Collaboration and relationship-building experience BA/BS degree or higher\nEXPERIENCE:8+ years\nSKILLS:software engineering\nEXPERIENCE:5+ years\nSKILLS:people management\nSKILLS:managing leaders\n--\n",
4,"BS degree in Computer Science or related field. 7+ years C++ experience, including C++11 features and principles. 5+ years experience creating software for real-time environments such as games or robotics. 2+ years experience managing software engineers. Proven track record of software development, including shipping one or more products on large code bases that span platforms and tools. Problem solving and optimization experience. Communication experience and demonstrated experience working across disciplines and teams to drive solutions. PREFERRED Hands-on experience with 3D computer vision algorithms including Calibration, SLAM, Reconstruction, Mapping, Localization, Sensor Fusion, State Estimation and Image Processing Experience with designing (products or open-source/git software) of inertial/optical sensing devices Publication in leading workshops or conferences such as CVPR, ECCV, ICCV, SIGGRAPH, ICCP, RSS, ICRA, etc.\nDIPLOMA:BS\nDIPLOMA_MAJOR:Computer Science\nEXPERIENCE:7+ years\nSKILLS:C++\nSKILLS:C++11\nEXPERIENCE:5+ years\nSKILLS:creating software for real-time environments\nSKILLS:games\nSKILLS:robotics\nEXPERIENCE:2+ years\nSKILLS:managing software engineers\n--\n",


In [58]:

response = co.generate(
    model='large',
    prompt=document_list,
    max_tokens=20,
    temperature=0.5,
    k=0,
    p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop_sequences=["--"],
    return_likelihoods='NONE')
print('Prediction: {}'.format(response.generations[0].text))


CohereError: invalid type, for proper usage, please refer to https://docs.cohere.ai/generate-reference