In [None]:
import transformers
import torch
# Hugging Face model identifier of the chat model used throughout the notebook.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Text-generation pipeline shared by every prompt helper below.  The weights
# are requested in bfloat16 and device placement is delegated to the library
# through device_map="auto".
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)
def promptModel(_user_, _system_='You are a helpful digital assistant.', max_tokens=256,
                temperature=0.6, top_p=0.9):
    """Run one system + user exchange through the module-level ``pipeline``.

    Parameters
    ----------
    _user_ : str
        Content of the user message.
    _system_ : str
        Content of the system message placed before the user message.
    max_tokens : int
        Passed to the pipeline as ``max_new_tokens``.
    temperature : float
        Sampling temperature forwarded to the pipeline (previously fixed at 0.6).
    top_p : float
        Nucleus-sampling threshold forwarded to the pipeline (previously fixed at 0.9).

    Returns
    -------
    str
        The pipeline's ``generated_text`` with the first ``len(prompt)``
        characters removed, i.e. everything that follows the rendered prompt.

    Notes
    -----
    Sampling is always enabled (``do_sample=True``), so repeated calls with
    the same input are not expected to return identical text.
    """
    tokenizer = pipeline.tokenizer

    messages = [
        {"role": "system", "content": _system_},
        {"role": "user",   "content": _user_},
    ]
    # Render the conversation to a single prompt string (not token ids) that
    # ends with the generation prompt for the assistant turn.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Generation stops on either the tokenizer's EOS token or "<|eot_id|>".
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=max_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    # Skip past the prompt so only the newly generated continuation is returned.
    return outputs[0]["generated_text"][len(prompt):]

In [None]:
import pandas as pd
import numpy as np
import time
import os
_base_dir_ = '../../../data/2014_vast/MC1/News Articles'

# Trial-run cap: stop once this many articles have been sent to the model.
_max_files_ = 2

files_processed = 0
# Column-oriented results; every list gains one entry per processed article.
_lu_       = {'file':[], 'article':[], 'llama3_8b_response':[], 'llama3_8b_time':[]}
_prompt_   = '''Translate the following user text to an RDF graph using both the FOAF, and Schema1 ontologies.
Use the prefix ex: with IRI <http://example.com/> for any created entities.
'''

# os.listdir returns entries in arbitrary order; sorting makes the selection
# of the first `_max_files_` articles reproducible between runs.
for _dir_ in sorted(os.listdir(_base_dir_)):
    _dir_path_ = os.path.join(_base_dir_, _dir_)
    if not os.path.isdir(_dir_path_):
        continue  # entries that are not directories cannot be listed
    for _file_ in sorted(os.listdir(_dir_path_)):
        _file_path_ = os.path.join(_dir_path_, _file_)
        if not os.path.isfile(_file_path_):
            continue  # only regular files can be read as articles
        # Context manager closes the handle even if reading fails.
        with open(_file_path_, 'rb') as _fh_:
            _article_raw_ = _fh_.read()
        # str() on bytes yields the "b'...'" repr with escaped \r / \n.  That
        # form is kept on purpose: the parsing cell further down looks for the
        # "b'SOURCE:" prefix and un-escapes '\\r' / '\\n' itself.
        _article_     = str(_article_raw_)
        ts0_model = time.time()
        _response_    = promptModel(_article_, _prompt_, max_tokens=4096)
        ts1_model = time.time()

        _lu_['file'].append(_file_)
        _lu_['article'].append(_article_)
        _lu_['llama3_8b_response'].append(_response_)
        _lu_['llama3_8b_time'].append(ts1_model-ts0_model)

        # Rewrite the CSV after every article so completed responses are on
        # disk even if a later model call fails.
        pd.DataFrame(_lu_).to_csv('llama3_8b_2014_vast_sbs.csv', index=False)
        files_processed += 1
        if files_processed >= _max_files_: break
    if files_processed >= _max_files_: break


In [None]:
# Show the collected file names, article texts, model responses and timings as a table.
pd.DataFrame(_lu_)

In [None]:
# Print the stored model response at index 1 (the second processed article) as plain text.
print(_lu_['llama3_8b_response'][1])

In [None]:
def breakIntoSentences(_str_):
    """Split text on '.' and return the trimmed pieces, each ending in '.'.

    Every piece between periods is stripped of surrounding whitespace and
    gets a single '.' appended.  Pieces that are empty *after* stripping are
    dropped, so a trailing ". " or a run such as ". ." no longer produces a
    bare "." entry.  A final piece without its own period still receives one.

    Parameters
    ----------
    _str_ : str
        Text to split.

    Returns
    -------
    list of str
        The sentences in their original order; empty for empty input.
    """
    _sentences_ = []
    for _fragment_ in _str_.split('.'):
        # Strip first, then test: whitespace-only fragments are not sentences.
        _fragment_ = _fragment_.strip()
        if _fragment_:
            _sentences_.append(_fragment_ + '.')
    return _sentences_

def separateArticle(_str_):
    """Split an article into its source, title, publication line and sentences.

    The text is processed line by line (split on '\\n'):

    * empty / whitespace-only lines and lines starting with '<<' are skipped;
    * lines starting with 'SOURCE:', 'TITLE:' or 'PUBLISHED:' set the
      corresponding field to the stripped remainder of the line;
    * every other line is passed to ``breakIntoSentences`` and its result is
      appended to the sentence list.

    The input may be plain text or the ``str()`` of a bytes object: a leading
    ``b'`` / ``b"`` and the matching closing quote at the very end are removed
    first, so the wrapper neither hides the SOURCE line nor leaks into the
    last sentence.

    Parameters
    ----------
    _str_ : str
        Article text with real newline characters between lines.

    Returns
    -------
    tuple
        ``(source, title, published, sentences)``; the first three are strings
        ('' when the header is absent) and the last is a list of strings.
    """
    # Peel off the bytes-repr wrapper, whichever quote character repr() chose.
    if _str_[:1] == 'b' and _str_[1:2] in ("'", '"'):
        _quote_ = _str_[1]
        _str_   = _str_[2:]
        if _str_.endswith(_quote_):
            _str_ = _str_[:-1]

    _source_, _title_, _published_, _sentences_ = '', '', '', []
    for _raw_line_ in _str_.split('\n'):
        # Trailing whitespace (e.g. a stray '\r') never carries content.
        _line_ = _raw_line_.rstrip()
        if not _line_ or _line_.startswith('<<'):
            continue
        # Slice by prefix length instead of str.replace so a missing space
        # after the colon, or a repeated keyword later in the line, is handled.
        if   _line_.startswith('SOURCE:'):    _source_    = _line_[len('SOURCE:'):]   .strip()
        elif _line_.startswith('TITLE:'):     _title_     = _line_[len('TITLE:'):]    .strip()
        elif _line_.startswith('PUBLISHED:'): _published_ = _line_[len('PUBLISHED:'):].strip()
        else:                                 _sentences_.extend(breakIntoSentences(_line_))
    return _source_, _title_, _published_, _sentences_

# The stored article keeps line breaks as the two-character sequences
# backslash-r / backslash-n: drop the former and turn the latter into real
# newlines before parsing the second collected article (index 1).
_article_text_ = _lu_['article'][1].replace('\\r', '').replace('\\n', '\n')
_src_, _title_, _published_, _sentences_ = separateArticle(_article_text_)
_sentences_

In [None]:
# Single-sentence experiment: ask the model for a CCO-style JSON structure.
# (An RDF prompt used to be assigned to _prompt_ just above this line and was
# overwritten immediately; that dead assignment has been removed.)
_prompt_   = 'Translate the following text into an CCO ontology represented as JSON.  Only include the JSON structure.'
_sentence_ = (
    'The army of people Asterian (APA) is paramilitary organization which has been busy with terrorist activities '
    'which are financed by its criminal ventures, which include drug trafficking.'
)
_response_ = promptModel(_sentence_, _prompt_, max_tokens=4096)
print(_response_)