In [1]:
import sys
import os
current_dir = os.getcwd()

sys.path.append(os.path.dirname(current_dir))

from utils import setup_api_key

setup_api_key(file_path='../../config.json')

In [30]:
import os
import tiktoken

from datasets import load_from_disk
from tqdm import tqdm

from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from typing import List
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
import json


def get_model_id(model_type, run_name, project_name, checkpoint_id):
    return os.path.join(
        model_type, "model_output", project_name, run_name, checkpoint_id
    )


project_config = {
    "survey-json": {
        "project_name": "survey-json-model-inst",
        "train_dataset_path": "../datasets/survey_json_datasets_instruction_train",
        "test_dataset_path": "../datasets/survey_json_datasets_instruction_test",
    },
    "schema": {
        "project_name": "schema-model-inst",
        "train_dataset_path": "../datasets/schema_datasets/schema_data_train",
        "test_dataset_path": "../datasets/schema_datasets/schema_data_test",
    },
    "paraloq": {
        "project_name": "paraloq-model-inst",
        "train_dataset_path": "../datasets/paraloq/paraloq_data_train",
        "test_dataset_path": "../datasets/paraloq/paraloq_data_test",
    },
    "nous": {
        "project_name": "nous-model-inst",
        "train_dataset_path": "../datasets/nous/nous_data_train",
        "test_dataset_path": "../datasets/nous/nous_data_test",
    },
}


def load_project(project="schema"):
    train = load_from_disk(project_config[project]["train_dataset_path"])
    test = load_from_disk(project_config[project]["test_dataset_path"])
    return train, test


def calculate_token(dataset):

    def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
        """Returns the number of tokens in a text string."""
        encoding = tiktoken.get_encoding(encoding_name)
        num_tokens = len(encoding.encode(string))
        return num_tokens
    
    text = ""
    max_text = ""
    for example in dataset:
        text += example["text"]
        max_text = max(max_text, example["text"], key=len)
    num_tokens = num_tokens_from_string(text)
    max_num_tokens = num_tokens_from_string(max_text)
    return num_tokens, max_num_tokens


def run(project="schema"):
    train, test = load_project(project=project)
    train_tokens, max_train_tokens = calculate_token(train)
    test_tokens, max_test_tokens = calculate_token(test)
    num_tokens = max(train_tokens, test_tokens)
    print("Loaded datasets with num examples: ", train.num_rows + test.num_rows)
    print("Num tokens: ", num_tokens)
    print("Max num tokens: ", max(max_train_tokens, max_test_tokens))
    
    

run_list = ["schema", "paraloq", "nous"]
for project in run_list:
    run(project=project)

Loaded datasets with num examples:  485
Num tokens:  129827
Max num tokens:  2008
Loaded datasets with num examples:  484
Num tokens:  567383
Max num tokens:  2946
Loaded datasets with num examples:  100
Num tokens:  30570
Max num tokens:  670


In [42]:
train, test = load_project(project="nous")
print(train[23]['text'])

<s>[INST] You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:
<schema>
{'CustomerVehicleServiceHistory': {'type': 'object', 'properties': {'customerID': {'title': 'Customer ID', 'type': 'string'}, 'vehicleID': {'title': 'Vehicle ID', 'type': 'string'}, 'serviceRecords': {'title': 'Service Records', 'type': 'array', 'items': {'type': 'object', 'properties': {'serviceDate': {'title': 'Service Date', 'type': 'string', 'format': 'date'}, 'description': {'title': 'Description', 'type': 'string'}, 'cost': {'title': 'Cost', 'type': 'number'}}, 'required': ['serviceDate', 'description', 'cost']}}, 'totalSpent': {'title': 'Total Spent', 'type': 'number'}}, 'required': ['customerID', 'vehicleID', 'serviceRecords', 'totalSpent']}}
</schema>
I recently purchased a used car and I'd like to have a complete service history report generated for my vehicle. The vehicle is a 2015 Toyota Camry with the VIN number 4T1BF1FK5FU033209. I am the new owner and my custom