In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Directory under the user's home that holds the checkpoint files;
# it is created (including missing parents) when it does not exist yet.
mistral_models_path = Path.home() / 'mistral_models' / 'Nemo-Instruct'
mistral_models_path.mkdir(parents=True, exist_ok=True)

# Download only the files matched by allow_patterns into that directory.
snapshot_download(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407",
    allow_patterns=["params.json", "consolidated.safetensors", "tekken.json"],
    local_dir=mistral_models_path,
)

from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate

from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Build the tokenizer from the tekken.json file inside mistral_models_path.
tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tekken.json")
# Load the model from the same folder.
# NOTE(review): presumably this reads params.json and consolidated.safetensors
# and may take a while / need a lot of memory — confirm against mistral_inference.
model = Transformer.from_folder(mistral_models_path)

def promptModel(prompt, max_new_tokens=64, temperature=0.35):
    """Send a single user message to the loaded model and return its reply.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text placed in the one ``UserMessage`` of the chat request.
        max_new_tokens: Passed to ``generate`` as ``max_tokens``.
        temperature: Passed to ``generate`` as ``temperature``.

    Returns:
        The decoded form of the first (and only) generated token sequence.
    """
    request = ChatCompletionRequest(messages=[UserMessage(content=prompt)])
    prompt_ids = tokenizer.encode_chat_completion(request).tokens
    # generate() takes a batch of prompts; this batch holds exactly one.
    generated, _ = generate(
        [prompt_ids],
        model,
        max_tokens=max_new_tokens,
        temperature=temperature,
        eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id,
    )
    return tokenizer.decode(generated[0])

#_response_ = promptModel("How expensive would it be to ask a window cleaner to clean all windows in Paris. Make a reasonable guess in US Dollar.", max_new_tokens=512)
#print(_response_)

In [None]:
import pandas as pd
import numpy as np
n_sentences = 4          # number of consecutive sentences in one sliding-window passage
short_article_max = 10   # articles with at most this many sentences are kept as one passage

# NOTE(review): unpickling can execute arbitrary code — only load an
# articles.pkl that comes from a trusted source.
df = pd.read_pickle('articles.pkl').sort_values(['directory', 'file', 'sentence_no']).reset_index(drop=True)

# Every passage built below, in (directory, file, window) order.
# _str_ / _strs_ still end up holding the last passage / its sentences.
passages = []
for k, k_df in df.groupby(['directory', 'file']):
    # Pull the article's sentences once instead of one .iloc row lookup per sentence.
    sentences = k_df['sentence'].tolist()
    if not sentences:
        continue
    if len(sentences) <= short_article_max:
        # Short article: a single passage made of all its sentences.
        windows = [sentences]
    else:
        # Longer article: every run of n_sentences consecutive sentences,
        # advancing one sentence at a time.
        windows = [sentences[i:i + n_sentences]
                   for i in range(len(sentences) - n_sentences + 1)]
    for _strs_ in windows:
        _str_ = ' '.join(_strs_)
        passages.append(_str_)


In [None]:
# Instruction placed ahead of the passage text.
# NOTE(review): the wording ("an python", double space) is kept exactly as-is so
# the text sent to the model does not change.
prompt_entity_extract = 'Extract all of the entities from the following passage.  Return the entities as an python list of strings.'
# Instruction, a blank line, then the passage currently held in _str_;
# the reply is the cell's displayed value.
promptModel(f'{prompt_entity_extract}\n\n{_str_}', max_new_tokens=512)