In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from datasets import DatasetDict, Dataset, concatenate_datasets
from dotenv import load_dotenv
from replicate import Client as rc

from data import SQLData
from eval import SQLEval
from predict import SQLPredict

# Read the project-level .env file so its entries become visible through os.environ.
load_dotenv("../.env")

# API credentials; each one is None when the variable is not set.
GITHUB_GIST_TOKEN = os.getenv("GITHUB_GIST_TOKEN")
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Replicate identifiers: the version passed to the training job, the training
# destination, and the model id handed to the prediction helper.
MODEL_VERSION = os.getenv("REPLICATE_LLAMA_13B_BASE")
# NOTE: "DESINATION" is a misspelling of "DESTINATION"; the name is kept because
# the training cell refers to it.
MODEL_DESINATION = os.getenv("REPLICATE_LLAMA_13B_TUNE")
REPLICATE_LLAMA_13B_TUNED = os.getenv("REPLICATE_LLAMA_13B_TUNED")

# Source dataset and the key under which the train/test split is stored.
dataset_name = 'b-mc2/sql-create-context'
train_test_dataset = "data_llama_13b_1_0_1"

# Replicate API client (note: this variable is named like the `replicate` package).
replicate = rc(REPLICATE_API_TOKEN)

# Switches guarding the gist-upload and fine-tuning cells; both are off by default.
data_upload = False
run_training = False

In [None]:
# Keyword arguments for the prediction helper, which is given both the OpenAI
# key and the Replicate key together with the tuned model's id.
predictor_settings = {
    "openai_api_key": OPENAI_API_KEY,
    "replicate_api_key": REPLICATE_API_TOKEN,
    "model_name": "llama_2_13b_sql",
    "model_id": REPLICATE_LLAMA_13B_TUNED,
}

# Project helpers for prediction (sqp), data handling (sqd) and evaluation (sqe).
sqp = SQLPredict.from_replicate_model(**predictor_settings)
sqd = SQLData.from_sql_create_context()
sqe = SQLEval()

# Data Processing

In [None]:
# Prepare the source dataset in three steps: preprocess, filter, then split.
# NOTE(review): presumably each step builds on state left in `sqd` by the
# previous one, so the order matters — confirm in data.py.
sqd.preprocess_data(dataset_name=dataset_name)
sqd.filter_data(dataset_name=dataset_name)
# The split is registered under `train_test_dataset`; the inference section
# reads its 'test' part from sqd.data[train_test_dataset].
sqd.train_test_split(dataset_name=dataset_name, new_dataset_name=train_test_dataset)

# Data Upload

In [None]:
# Upload the split as a JSONL gist; skipped unless data_upload is enabled.
# No dataset_type is passed here, so presumably the helper's default selects
# the training part — confirm in data.py.
if data_upload:
    training_filename = "training_data_llama_13b_1_0_1.jsonl"
    training_resp = sqd.upload_jsonl_gist(
        dataset_name=train_test_dataset,
        token=GITHUB_GIST_TOKEN,
        filename=training_filename,
        description="Training data for the second version of the llama_2_13b_sql (1.0.1) model",
    )
    # The response lists uploaded files by name; keep the raw URL for the training job.
    training_file_entry = training_resp['files'][training_filename]
    training_data_url = training_file_entry['raw_url']

In [None]:
# Upload the test part of the split as a JSONL gist; skipped unless data_upload
# is enabled.
if data_upload:
    testing_filename = "testing_data_llama_13b_1_0_1.jsonl"
    testing_resp = sqd.upload_jsonl_gist(
        dataset_name=train_test_dataset,
        token=GITHUB_GIST_TOKEN,
        filename=testing_filename,
        dataset_type="test",
        # 1.0.1 is the second model version, matching the training-data gist
        # description (the text previously said "first version").
        description="Testing data for the second version of the llama_2_13b_sql (1.0.1) model",
    )
    # The response lists uploaded files by name; keep the raw URL of the test file.
    testing_data_url = testing_resp['files'][testing_filename]['raw_url']

# Model Tuning

In [None]:
# Start a Replicate fine-tuning job; skipped unless run_training is enabled.
if run_training:
    # training_data_url is only assigned by the upload cell when data_upload is
    # True. Fail with an actionable message instead of a bare NameError when
    # training is requested without it.
    if "training_data_url" not in globals():
        raise NameError(
            "training_data_url is not defined: run the 'Data Upload' cells with "
            "data_upload=True, or assign training_data_url manually, before training."
        )

    training = replicate.trainings.create(
        version=MODEL_VERSION,
        input={
            "train_data": training_data_url,
            "num_train_epochs": 3,
        },
        destination=MODEL_DESINATION,
    )

# Model Inference

In [None]:
# Drop test rows whose stored query result is one of these two literal strings.
uninformative_results = ('[(0,)]', '[(None,)]')
rich_testing = sqd.data[train_test_dataset]['test'].filter(
    lambda row: row['query_result'] not in uninformative_results
)

# Work in three batches of 100 rows to keep an eye on run time and avoid
# timeouts; each batch is sent for inference from its own cell further down.
batch_0_100 = rich_testing.select(range(0, 100))
batch_100_200 = rich_testing.select(range(100, 200))
batch_200_300 = rich_testing.select(range(200, 300))

# Apply the tuning-data formatting to every row of each batch.
rich_testing_subset_0_100 = batch_0_100.map(sqd.format_tuning_data)
rich_testing_subset_100_200 = batch_100_200.map(sqd.format_tuning_data)
rich_testing_subset_200_300 = batch_200_300.map(sqd.format_tuning_data)

In [None]:
# Apply sqp.openai_dataset_request to rows 0-99.
# NOTE(review): presumably issues one OpenAI request per row and stores the
# answer on the row — confirm in predict.py. Re-running repeats the requests.
rich_testing_subset_0_100 = rich_testing_subset_0_100.map(sqp.openai_dataset_request)

In [None]:
# Apply sqp.openai_dataset_request to rows 100-199.
# NOTE(review): presumably issues one OpenAI request per row and stores the
# answer on the row — confirm in predict.py. Re-running repeats the requests.
rich_testing_subset_100_200 = rich_testing_subset_100_200.map(sqp.openai_dataset_request)

In [None]:
# Apply sqp.openai_dataset_request to rows 200-299.
# NOTE(review): presumably issues one OpenAI request per row and stores the
# answer on the row — confirm in predict.py. Re-running repeats the requests.
rich_testing_subset_200_300 = rich_testing_subset_200_300.map(sqp.openai_dataset_request)

In [None]:
# Apply sqp.replicate_dataset_request to rows 0-99.
# NOTE(review): presumably queries the tuned Replicate model once per row and
# stores the answer on the row — confirm in predict.py. Re-running repeats the requests.
rich_testing_subset_0_100 = rich_testing_subset_0_100.map(sqp.replicate_dataset_request)

In [None]:
# Apply sqp.replicate_dataset_request to rows 100-199.
# NOTE(review): presumably queries the tuned Replicate model once per row and
# stores the answer on the row — confirm in predict.py. Re-running repeats the requests.
rich_testing_subset_100_200 = rich_testing_subset_100_200.map(sqp.replicate_dataset_request)

In [None]:
# Apply sqp.replicate_dataset_request to rows 200-299.
# NOTE(review): presumably queries the tuned Replicate model once per row and
# stores the answer on the row — confirm in predict.py. Re-running repeats the requests.
rich_testing_subset_200_300 = rich_testing_subset_200_300.map(sqp.replicate_dataset_request)

In [None]:
# Join the three 100-row batches back into a single dataset, in batch order.
# The function is called by its imported name: this notebook imports names
# *from* the `datasets` package but never binds the module itself, so
# `datasets.concatenate_datasets` raises NameError on a fresh kernel.
rich_testing_subset = concatenate_datasets([
    rich_testing_subset_0_100,
    rich_testing_subset_100_200,
    rich_testing_subset_200_300,
])

In [None]:
# Persist the combined inference dataset to local disk.
# NOTE(review): the directory name says "1_0_0" while this notebook's datasets
# are tagged "1_0_1" — confirm the path is the intended one.
rich_testing_subset.save_to_disk("../local_data/rich_testing_subset_llama_13b_1_0_0_inferences_three")

# Model Evaluation 

### See the eval.ipynb notebook for more details

In [None]:
# Apply sqe.validate_replicate_query to every row of the combined inference set.
# NOTE(review): the following cell runs replicate_response_parser and then
# validate_replicate_query again, so this first pass looks redundant — confirm.
inference_data = rich_testing_subset.map(sqe.validate_replicate_query)

In [None]:
# Run the evaluation steps one after another; each pass maps over the rows
# produced by the previous one.
for evaluation_step in (
    sqe.replicate_response_parser,
    sqe.validate_openai_query,
    sqe.validate_replicate_query,
    sqe.inference_result_check,
):
    inference_data = inference_data.map(evaluation_step)

# Rows flagged as valid queries, per provider.
openai_valid_queries = inference_data.filter(lambda row: row["openai_valid"] == True)
replicate_valid_queries = inference_data.filter(lambda row: row["replicate_valid"] == True)

# Rows flagged as correct results, per provider.
openai_valid_results = inference_data.filter(lambda row: row["openai_correct"] == True)
replicate_valid_results = inference_data.filter(lambda row: row["replicate_correct"] == True)

# Label -> row count, in the order the summary is printed.
summary_counts = {
    "OpenAI valid queries": openai_valid_queries.num_rows,
    "Replicate valid queries": replicate_valid_queries.num_rows,
    "OpenAI valid results": openai_valid_results.num_rows,
    "Replicate valid results": replicate_valid_results.num_rows,
}
total_rows = inference_data.num_rows

# Absolute counts first ...
for label, count in summary_counts.items():
    print(f"{label}: {count}")

# ... then the same figures as a share of all evaluated rows.
print("\nAs a percentage of total queries: \n")
for label, count in summary_counts.items():
    print(f"{label}: {100 * count / total_rows:.2f}%")

In [None]:
# Grouped bar chart: counts of valid queries / valid results for OpenAI and
# Replicate, annotated with each count's share of all evaluated rows.
categories = ['Valid Queries', 'Valid Results']
openai_values = [subset.num_rows for subset in (openai_valid_queries, openai_valid_results)]
replicate_values = [subset.num_rows for subset in (replicate_valid_queries, replicate_valid_results)]

evaluated_rows = inference_data.num_rows
openai_percentages = [100 * count / evaluated_rows for count in openai_values]
replicate_percentages = [100 * count / evaluated_rows for count in replicate_values]

x = np.arange(len(categories))  # one group of bars per category
width = 0.35  # width of a single bar
half_width = width / 2

fig, ax1 = plt.subplots(figsize=(10, 6))

# One bar per provider inside each group, placed left and right of the tick.
ax1.bar(x - half_width, openai_values, width, color='tab:blue', label='OpenAI')
ax1.bar(x + half_width, replicate_values, width, color='tab:red', label='Replicate')

# Axis labels, title, ticks and legend.
ax1.set(xlabel='Metrics', ylabel='Counts', title='Comparison of OpenAI vs Replicate')
ax1.set_xticks(x)
ax1.set_xticklabels(categories)
ax1.legend()

# Percentage labels sit a fixed 5 count-units above the top of each bar.
label_offset = 5
label_style = dict(ha='center', va='bottom', color='black', rotation=0)
bar_data = zip(openai_values, openai_percentages, replicate_values, replicate_percentages)
for i, (openai_count, openai_pct, replicate_count, replicate_pct) in enumerate(bar_data):
    ax1.text(i - half_width, openai_count + label_offset, f"{openai_pct:.2f}%", **label_style)
    ax1.text(i + half_width, replicate_count + label_offset, f"{replicate_pct:.2f}%", **label_style)

plt.tight_layout()
plt.show()