# Import Libraries

In [1]:
import sys,os,logging
sys.path.insert(0,'../scripts/')
from cohere_entity_extraction import cohere_extractor
from preprocessor import input_preprocessor
from data_fetch import get_job_data
import config,mlflow 

# Make sure the log directory exists before attaching the file handler:
# logging.basicConfig opens the target file right away and raises
# FileNotFoundError when '../log/' is missing (e.g. on a fresh checkout).
os.makedirs('../log', exist_ok=True)

# Append DEBUG-and-above records from the root logger to a UTF-8 log file.
logging.basicConfig(filename='../log/log.log', filemode='a',encoding='utf-8', level=logging.DEBUG)

# Get the data

In [2]:
# Repository location handed to get_job_data as `repo`. The default is the
# original hardcoded checkout path; set the PROMPT_ENGINEERING_REPO environment
# variable to run the notebook against a checkout somewhere else.
repo_path = os.environ.get('PROMPT_ENGINEERING_REPO', 'C:/Users/User/Desktop/Prompt-Engineering')

# Fetch the development (used as training examples) and test relation datasets.
# NOTE(review): the two version tags follow different patterns
# ('relations_dev.txt_v1' vs 'relations_test_v1') — confirm both tags exist.
job_training_data = get_job_data(path='data/relations_dev.txt', repo=repo_path, version='relations_dev.txt_v1')
job_test_data = get_job_data(path='data/relations_test.txt', repo=repo_path, version='relations_test_v1')



In [3]:
# Preview the first five rows of the training data.
job_training_data.head(n=5)

Unnamed: 0,document,tokens,relations
0,Bachelor's degree in Mechanical Engineering or...,"[{'text': 'Bachelor', 'start': 0, 'end': 8, 't...","[{'child': 4, 'head': 0, 'relationLabel': 'DEG..."
1,10+ years of software engineering work experie...,"[{'text': '10+ years', 'start': 0, 'end': 9, '...","[{'child': 4, 'head': 0, 'relationLabel': 'EXP..."
2,3+ years Swift & Objective-C and experience wi...,"[{'text': '3+ years', 'start': 0, 'end': 8, 't...","[{'child': 3, 'head': 0, 'relationLabel': 'EXP..."
3,8+ years experience in software engineering le...,"[{'text': '8+ years', 'start': 0, 'end': 8, 't...","[{'child': 5, 'head': 0, 'relationLabel': 'EXP..."
4,BS degree in Computer Science or related field...,"[{'text': 'BS', 'start': 0, 'end': 2, 'token_s...","[{'child': 3, 'head': 0, 'relationLabel': 'DEG..."


In [4]:
# Preview the first five rows of the test data.
job_test_data.head(n=5)

Unnamed: 0,document,tokens,relations
0,"\nCurrently holding a faculty, industry, or go...","[{'text': 'Ph.D.', 'start': 75, 'end': 80, 'to...","[{'child': 18, 'head': 14, 'relationLabel': 'D..."
1,\n2+ years experience in the online advertisin...,"[{'text': '2+ years', 'start': 1, 'end': 9, 't...","[{'child': 7, 'head': 1, 'relationLabel': 'EXP..."
2,\nBA/BS\n5+ years of program or project manage...,"[{'text': '5+ years', 'start': 7, 'end': 15, '...","[{'child': 11, 'head': 5, 'relationLabel': 'EX..."
3,\nCurrently enrolled in a full-time degree pro...,"[{'text': 'Ph.D.', 'start': 801, 'end': 806, '...","[{'child': 140, 'head': 137, 'relationLabel': ..."
4,\nCurrently enrolled in a full-time degree pro...,"[{'text': 'Ph.D.', 'start': 801, 'end': 806, '...","[{'child': 140, 'head': 137, 'relationLabel': ..."


# Get API key

In [5]:
# Look up the Cohere API key stored in the project's config module.
api_key = config.cohere_api["api_key"]

# Process training data

In [3]:
# Take the first five training rows and run them through the preprocessor so
# they can be used as examples for the API.
training_sample = job_training_data.head(n=5)
job_training_data_processed = input_preprocessor(training_sample)

# Show the processed examples as the cell output.
job_training_data_processed

["Bachelor's degree in Mechanical Engineering or Physical Science 3+ years track record of developing or specifying fiber optic cables and connector related products Knowledge of fiber optic component, cabling, and interconnect products, technologies, and standards Experience in statistical data analysis Experience with product life cycle management (PLM) process Experience providing solutions to problems and meeting deadlines Experience engaging stakeholders PREFERRED Advanced degree Experience using a software tool for statistical data analysis such as JMP Experience using Agile as product life-cycle management tool Data center or other mission critical development experience\n\nout put:\nDIPLOMA:Bachelor\nDIPLOMA_MAJOR:Mechanical Engineering,Physical Science\nEXPERIENCE:3+ years\nSKILLS:developing,fiber optic cables,connector related products\n--end--\n",
 '10+ years of software engineering work experience. Technical experience in release automation engineering, CI/CD or related rol

# Extract entities from test data

We have to optimize the number of prompts we give the API to get the best out of our model. We have also structured our training and input data and provided a delimiter.

In [12]:
# Select the MLflow experiment and record the generation settings as params.
# NOTE(review): these values presumably mirror the settings used inside
# cohere_extractor — confirm they stay in sync with that script.
mlflow.set_experiment("Large language model")
mlflow.log_param('Model','large')
mlflow.log_param('Max tokens ',100)
mlflow.log_param('Temperature',0.5)
mlflow.log_param('Stop sequences',"--end--")

# Save the example prompts to disk and attach the folder as an MLflow artifact.
prompt_dir = "Large language model entity extraction prompts"
os.makedirs(prompt_dir, exist_ok=True)  # no separate exists() check needed
with open(os.path.join(prompt_dir, "prompts.txt"), "w", encoding="utf-8") as f:
    # job_training_data_processed is a list of example strings, so passing it
    # straight to f.write() raises TypeError; concatenate the examples first.
    # UTF-8 is explicit so non-ASCII job text does not depend on the platform
    # default encoding.
    f.write("".join(job_training_data_processed))
mlflow.log_artifacts(prompt_dir)

# Run the extractor on the first eight test documents and print each result
# on a single line with the stop marker removed.
for document in job_test_data.head(n=8)['document']:
    prompt = str(document) + '\n\nout put:'
    extraction = cohere_extractor(api_key, job_training_data_processed, prompt)
    print(extraction.replace('--end--','').replace('\n','  '))


  DIPLOMA:Ph.D  DIPLOMA_MAJOR:Computer Science  EXPERIENCE:1+ year(s)  SKILLS:machine learning,AI,computer science,statistics,applied mathematics,data science,research  --end-
  EXPERIENCE:2+ years  SKILLS:presenting,partnering with technical and non-technical teams,communicating analyses,data-sets,statistical software,data extraction tools,sql  DIPLOMA:BS/BA  DIPLOMA_MAJOR:Economics,Statistics,Political Science,History,Psychology  --end-
  EXPERIENCE:5+ years,2+ years  SKILLS:program or project management,technical project/program management,user needs,gathering requirements,defining scope,communication experience  --end-
  DIPLOMA:BS  DIPLOMA_MAJOR:Computer Science  SKILLS:Python,Lua,C++,C,C#,Java  EXPERIENCE:1+ years  --end-
  DIPLOMA:BS  DIPLOMA_MAJOR:Computer Science  EXPERIENCE:3+ years  SKILLS:Python,Lua,C++,C,C#,Java  --end-
  DIPLOMA:MS,PhD  DIPLOMA_MAJOR:Operations Research,Industrial Engineering,Quantitative Finance,Math,Physics  EXPERIENCE:7+ years,5+ years,2+ years  SKILLS

In [6]:
# Select the MLflow experiment and record the generation settings as params.
mlflow.set_experiment("Large language model entity extraction")
mlflow.log_param('Model','large')
mlflow.log_param('Max tokens ',100)
mlflow.log_param('Temperature',0.5)
mlflow.log_param('Stop sequences',"--end--")

# Save the example prompt to disk and attach the folder as an MLflow artifact.
prompt_dir = "Large language model entity extraction prompts"
os.makedirs(prompt_dir, exist_ok=True)  # no separate exists() check needed
with open(os.path.join(prompt_dir, "prompts.txt"), "w", encoding="utf-8") as f:
    # UTF-8 is explicit so non-ASCII job text does not depend on the platform
    # default encoding.
    # NOTE(review): only the first processed example is written here — confirm
    # that logging a single example (rather than all of them) is intended.
    f.write(job_training_data_processed[0])
mlflow.log_artifacts(prompt_dir)

2022/09/17 17:05:09 INFO mlflow.tracking.fluent: Experiment with name 'Large language model entity extraction' does not exist. Creating a new experiment.
