In [2]:
# System
import os
import sys
import json 
import pandas as pd
from dotenv import load_dotenv
from typing import Optional, Dict, List, Union
from IPython.display import display, clear_output
import time

# External
from datasets import DatasetDict, Dataset
import datasets

# Internal
from predict import SQLPredict

# Load environment variables from ../../.env into the process environment.
load_dotenv("../../.env")

# API credentials; each is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN")
HUGGING_FACE_API_TOKEN = os.environ.get("HUGGING_FACE_API_TOKEN")

# Replicate model identifiers.
# The tuned-13B constant used to be read only from REPLICATE_LLAMA_7B_TUNED,
# which does not match its name. Prefer the matching variable and fall back to
# the old one so existing .env files keep working.
REPLICATE_LLAMA_13B_TUNED = (
    os.environ.get("REPLICATE_LLAMA_13B_TUNED")
    or os.environ.get("REPLICATE_LLAMA_7B_TUNED")
)
REPLICATE_LLAMA_13B_BASE = os.environ.get("REPLICATE_LLAMA_13B_BASE")

# Hugging Face inference endpoint URL for Mistral-7B-Instruct v0.1.
MISTRAL_7B_INSTRUCT = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1"

# Name passed to SQLPredict and to the Replicate inference cells below.
model_name = "llama_2_13b_base"


In [3]:
# Collect the constructor arguments first so they can be inspected in one place.
replicate_settings = dict(
    openai_api_key=OPENAI_API_KEY,
    replicate_api_key=REPLICATE_API_TOKEN,
    model_name=model_name,
    model_id=REPLICATE_LLAMA_13B_BASE,
)
sqp = SQLPredict.from_replicate_model(**replicate_settings)

# Attach the Hugging Face token and register the Mistral endpoint under the
# name "mistral", which the Mistral inference cells pass as model_name.
sqp.hf_key = HUGGING_FACE_API_TOKEN
sqp.add_model_endpoint("mistral", MISTRAL_7B_INSTRUCT)

In [4]:
# Load the previously saved dataset from local disk.
rich_testing_path = "../../local_data/rich_testing_subset_llama_13b_1_0_0_inferences_three"
rich_testing = Dataset.load_from_disk(rich_testing_path)

In [6]:
# Split the first 300 rows into three consecutive 100-row subsets
# (rows 0-99, 100-199 and 200-299) so each can be processed separately.
(
    rich_testing_subset_0_100,
    rich_testing_subset_100_200,
    rich_testing_subset_200_300,
) = (rich_testing.select(range(start, start + 100)) for start in (0, 100, 200))

In [7]:
# Apply sqp.replicate_dataset_request to every example of rows 0-99.
# Presumably the result is written to the "llama_2_13b_base_inference"
# column — confirm against SQLPredict.
llama_fn_kwargs = {
    "model_name": model_name,
    "column_name": "llama_2_13b_base_inference",
    "prompt_type": "basic_text_generation",
}
rich_testing_subset_0_100 = rich_testing_subset_0_100.map(
    sqp.replicate_dataset_request,
    fn_kwargs=llama_fn_kwargs,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
# Apply sqp.replicate_dataset_request to every example of rows 100-199.
# Presumably the result is written to the "llama_2_13b_base_inference"
# column — confirm against SQLPredict.
llama_fn_kwargs = {
    "model_name": model_name,
    "column_name": "llama_2_13b_base_inference",
    "prompt_type": "basic_text_generation",
}
rich_testing_subset_100_200 = rich_testing_subset_100_200.map(
    sqp.replicate_dataset_request,
    fn_kwargs=llama_fn_kwargs,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
# Apply sqp.replicate_dataset_request to every example of rows 200-299.
# Presumably the result is written to the "llama_2_13b_base_inference"
# column — confirm against SQLPredict.
llama_fn_kwargs = {
    "model_name": model_name,
    "column_name": "llama_2_13b_base_inference",
    "prompt_type": "basic_text_generation",
}
rich_testing_subset_200_300 = rich_testing_subset_200_300.map(
    sqp.replicate_dataset_request,
    fn_kwargs=llama_fn_kwargs,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Mistral Inferences

In [24]:
# Build the request headers inside this cell: `headers` was referenced here but
# never defined anywhere in the notebook, so a fresh kernel raised NameError.
# NOTE(review): assumes the endpoint expects the usual Hugging Face
# "Authorization: Bearer <token>" header — confirm against SQLPredict.
headers = {"Authorization": f"Bearer {HUGGING_FACE_API_TOKEN}"}

# Apply the Mistral text-generation request to every example of rows 0-99.
# load_from_cache_file=False forces the map to be recomputed instead of
# reusing a cached result.
rich_testing_subset_0_100 = rich_testing_subset_0_100.map(
    sqp.basic_text_generation_dataset_request,
    fn_kwargs={
        "model_name": "mistral",
        "response_column_name": "mistral_response",
        "headers": headers,
    },
    load_from_cache_file=False,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [25]:
# Build the request headers inside this cell: `headers` was referenced here but
# never defined anywhere in the notebook, so a fresh kernel raised NameError.
# NOTE(review): assumes the endpoint expects the usual Hugging Face
# "Authorization: Bearer <token>" header — confirm against SQLPredict.
headers = {"Authorization": f"Bearer {HUGGING_FACE_API_TOKEN}"}

# Apply the Mistral text-generation request to every example of rows 100-199.
# load_from_cache_file=False forces the map to be recomputed instead of
# reusing a cached result.
rich_testing_subset_100_200 = rich_testing_subset_100_200.map(
    sqp.basic_text_generation_dataset_request,
    fn_kwargs={
        "model_name": "mistral",
        "response_column_name": "mistral_response",
        "headers": headers,
    },
    load_from_cache_file=False,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [26]:
# Build the request headers inside this cell: `headers` was referenced here but
# never defined anywhere in the notebook, so a fresh kernel raised NameError.
# NOTE(review): assumes the endpoint expects the usual Hugging Face
# "Authorization: Bearer <token>" header — confirm against SQLPredict.
headers = {"Authorization": f"Bearer {HUGGING_FACE_API_TOKEN}"}

# Apply the Mistral text-generation request to every example of rows 200-299.
# load_from_cache_file=False forces the map to be recomputed instead of
# reusing a cached result.
rich_testing_subset_200_300 = rich_testing_subset_200_300.map(
    sqp.basic_text_generation_dataset_request,
    fn_kwargs={
        "model_name": "mistral",
        "response_column_name": "mistral_response",
        "headers": headers,
    },
    load_from_cache_file=False,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
# Stitch the three 100-row subsets back together, in their original order.
subset_parts = [
    rich_testing_subset_0_100,
    rich_testing_subset_100_200,
    rich_testing_subset_200_300,
]
rich_testing_subset = datasets.concatenate_datasets(subset_parts)

In [16]:
# Persist the combined dataset to local disk under the "_four" directory.
rich_testing_output_path = "../../local_data/rich_testing_subset_llama_13b_1_0_0_inferences_four"
rich_testing_subset.save_to_disk(rich_testing_output_path)

Saving the dataset (0/1 shards):   0%|          | 0/300 [00:00<?, ? examples/s]