In [1]:
import time
import json
import openai
import re
import sqlite3
import pandas as pd
import numpy as np

from os import path

In [16]:
def _join_create_statements(statements):
    """Keep only the CREATE statements and join them with ';'.

    Each statement is stripped, 'IF NOT EXISTS ' is removed and newlines are
    dropped so that every statement ends up on a single line. Entries that are
    not strings (e.g. NULL values coming back from sqlite) are skipped.
    """
    kept = []
    for stmt in statements:
        if not isinstance(stmt, str):
            continue
        stmt = stmt.strip()
        if stmt.lower().startswith('create'):
            kept.append(stmt.replace('IF NOT EXISTS ', '').replace('\n', ''))
    return ';'.join(kept)


def get_schema(db_id, db_root='spider/database/'):
    """Return the CREATE statements of a database as one ';'-joined string.

    Looks for ``<db_root>/<db_id>/schema.sql`` first and falls back to reading
    ``sqlite_master`` from ``<db_root>/<db_id>/<db_id>.sqlite``.

    Parameters
    ----------
    db_id : str
        Name of the database directory (and of the .sqlite file inside it).
    db_root : str, optional
        Directory that contains one sub-directory per database.

    Returns
    -------
    str or None
        The single-line CREATE statements joined by ';', or None when neither
        the schema file nor the sqlite file exists.
    """
    path_to_file   = path.join(db_root, db_id, 'schema.sql')
    path_to_file_2 = path.join(db_root, db_id, db_id + '.sqlite')

    if path.exists(path_to_file):
        # Context manager so the file handle is released even if read() fails
        with open(path_to_file, 'r') as schema_file:
            x = schema_file.read()

        # Remove comment lines
        x = re.sub(r"^/\*.*\n", "", x, flags=re.MULTILINE)   # lines starting with /*
        x = re.sub(r'^--.*(\n|$)', '', x, flags=re.MULTILINE)  # lines starting with --
        x = re.sub(r"^/\*.*\*/", "", x)                        # /* ... */ at the very start of the text

        x = re.sub(r'CREATE TABLE \t', 'CREATE TABLE ', x)

        return _join_create_statements(x.split(';'))

    elif path.exists(path_to_file_2):
        # Connect to the sqlite file
        conn = sqlite3.connect(path_to_file_2)
        try:
            # Read the CREATE statement of every table into a dataframe
            df_conn = pd.read_sql_query("SELECT sql FROM sqlite_master WHERE type='table';", conn)
        finally:
            # Close the connection even when the query raises
            conn.close()

        return _join_create_statements(df_conn['sql'])

    else:
        return None

def short_open_ai_prompt(x):
    """Build the fine-tuning prompt for one row.

    Reads x['schema'] and x['question'] and returns them on two labelled
    lines followed by the '###' separator block.
    """
    schema = x['schema']
    question = x['question']
    return f'''Schema: {schema}\nQuestion: {question}\n\n###\n\n'''

def open_ai_completion(x):
    """Build the fine-tuning completion for one row from x['query'].

    Per OpenAI's guidance the completion starts with a single space (this
    tends to produce better results because of the tokenization used); the
    trailing newline terminates the completion.
    """
    query = x['query']
    return f" {query}\n" 

# def splitter(x: str):
#     try:
#         arr = x.split(';')
#         create_tables = []
#         for i, stmt in enumerate(arr):
#             stmt = stmt.strip()
#             if stmt.lower().startswith('create'):
#                 create_tables.append(stmt.replace('IF NOT EXISTS ', '').replace('\n', ''))
#     except:
#         arr = x.split(b';')
#         create_tables = []
#         for i, stmt in enumerate(arr):
#             stmt = stmt.strip()
#             if stmt.lower().startswith(b'create'):
#                 create_tables.append(stmt.replace('IF NOT EXISTS ', '').replace('\n', ''))
 
#     return ';'.join(create_tables)

def call_model(row, engine):
    """Request one completion for row["open_ai_prompt"] from `engine`.

    The request is retried (after a one-second pause) for as long as the API
    answers with ServiceUnavailableError.

    Parameters
    ----------
    row : mapping
        Must provide the key "open_ai_prompt".
    engine : str
        Name of the engine / fine-tuned model passed to the Completion API.

    Returns
    -------
    str or None
        Text of the first choice, or None when the API rejects the request
        with InvalidRequestError.
    """
    prompt = row["open_ai_prompt"]
    while True:
        try:
            completions = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                max_tokens=1024,
                n=1,
                stop=["\n"],  # completions are single-line queries terminated by a newline
                temperature=0.5
            )

        # The exception classes live in openai.error; a bare `error.` is not
        # defined in this notebook and would raise NameError instead.
        except openai.error.ServiceUnavailableError:
            print('ServiceUnavailableError')
            time.sleep(1)
            continue

        except openai.error.InvalidRequestError:
            print('InvalidRequestError: too many tokens')
            return None

        text = completions.choices[0].text
        print(text)
        return text

# Data Preparation

In [17]:
def _extract_create_table(schema, table):
    """Return the lower-cased 'create table' statement(s) for `table`.

    The table name is tried back-quoted, bare and double-quoted (in that
    order); for each form a statement terminated by ';' is preferred over one
    that runs to the end of the string. All matches of the first pattern that
    hits are concatenated. Returns '' when nothing matches.
    """
    schema_lc = schema.lower()
    # Escape the name so characters that are special in a regex match literally
    name = re.escape(table)
    for quote in ('`', '', '"'):
        for terminator in (';', '$'):
            pattern = r"(create table {q}{t}{q}.*?{end})".format(q=quote, t=name, end=terminator)
            statement = ''.join(re.findall(pattern, schema_lc))
            if statement:
                return statement
    return ''


df = pd.read_json('spider/train_spider.json')
# Keep only queries without a JOIN and with at most one FROM
df = df[~df['query'].str.contains('JOIN')]
df = df[df['query'].str.count('FROM') <= 1]

# Set schema; a database without schema files yields None, which is mapped to ''
# so it shows up as an empty schema instead of crashing the regex steps below
df['schema'] = df['db_id'].apply(get_schema).fillna('')
df['schema'] = df['schema'].apply(lambda x: re.sub(r'(?i) REFERENCES.*?(;|$)', ';', x))
# Extract the name of the table after FROM
df['table']  = df['query'].apply(lambda x: re.sub(r'.*FROM', 'FROM', x))\
                          .apply(lambda x: re.sub(r'^FROM\s+(\S+).*', r'\1', x).rstrip(';')).str.lower()
# Extract the 'create table' statement just for that table (result is lower-case)
df['schema'] = df.apply(lambda x: _extract_create_table(x['schema'], x['table']), axis=1)
# The schema is lower-case at this point, so the match must ignore case;
# an upper-case-only 'NOT NULL' pattern would never remove anything
df['schema'] = df['schema'].apply(lambda x: re.sub(r'not null', '', x, flags=re.IGNORECASE))

# Set Open AI prompt and completion
df['open_ai_prompt'] = df.apply(short_open_ai_prompt, axis=1)
df['open_ai_completion'] = df.apply(open_ai_completion, axis=1)

# Randomize at the db level, so all questions of one database land in the same split
df_db_id = pd.DataFrame(df['db_id'].unique(), columns=['db_id'])
np.random.seed(240956) #set seed
df_db_id['train_test'] = np.random.choice(['train','test'], df_db_id.shape[0], p=[0.8, 0.2])
df = df.merge(df_db_id, on='db_id')
df_train = df[df['train_test'] == 'train'][['open_ai_prompt', 'open_ai_completion']].copy()
df_test  = df[df['train_test'] == 'test'][['open_ai_prompt', 'open_ai_completion']].copy()

In [18]:
# Databases for which no 'create table' statement could be extracted
df.loc[df['schema'] == '', 'db_id'].unique()

array([], dtype=object)

In [24]:
# Show one prompt/completion pair exactly as it will be written to the training file
sample_row = df.iloc[110]
for column in ('open_ai_prompt', 'open_ai_completion'):
    print(sample_row[column])

Schema: create table weather (    date text,    max_temperature_f integer,    mean_temperature_f integer,    min_temperature_f integer,    max_dew_point_f integer,    mean_dew_point_f integer,    min_dew_point_f integer,    max_humidity integer,    mean_humidity integer,    min_humidity integer,    max_sea_level_pressure_inches numeric,    mean_sea_level_pressure_inches numeric,    min_sea_level_pressure_inches numeric,    max_visibility_miles integer,    mean_visibility_miles integer,    min_visibility_miles integer,    max_wind_speed_mph integer,    mean_wind_speed_mph integer,    max_gust_speed_mph integer,    precipitation_inches integer,    cloud_cover integer,    events text,    wind_dir_degrees integer,    zip_code integer)
Question: What are the dates in which the mean sea level pressure was between 30.3 and 31?

###


 SELECT date FROM weather WHERE mean_sea_level_pressure_inches BETWEEN 30.3 AND 31



In [25]:
# (rows, columns) of the training split
df_train.shape

(3071, 2)

In [26]:
# (rows, columns) of the test split
df_test.shape

(538, 2)

# Training

In [27]:
# Put the training data into jsonl format: one {"prompt", "completion"} object per line
data = [
    {"prompt": prompt, "completion": completion}
    for prompt, completion in zip(df_train["open_ai_prompt"], df_train["open_ai_completion"])
]

# Timestamp the file name so repeated runs do not overwrite each other
timestr = time.strftime("%Y%m%d-%H%M%S")
with open("spider_open_ai_fine_tuning_" + timestr + ".jsonl", "w") as outfile:
    for record in data:
        outfile.write(json.dumps(record))
        outfile.write("\n")

print("spider_open_ai_fine_tuning_" + timestr + ".jsonl")

spider_open_ai_fine_tuning_20230125-135407.jsonl


In [None]:
#!openai tools fine_tunes.prepare_data -f spider_open_ai_fine_tuning_20230125-135407.jsonl
# - There are 3 duplicated prompt-completion sets. These are rows: [1155, 1590, 1591]

In [33]:
# Drop repeated prompt-completion pairs, keeping the first occurrence of each.
# Computing the duplicates here (instead of hard-coding row positions taken
# from a tool report) keeps this cell correct when the training data changes.
is_duplicate = df_train.duplicated(subset=['open_ai_prompt', 'open_ai_completion'], keep='first')
df_train_dedup = df_train[~is_duplicate].copy()

# Sanity check: every dropped prompt must still be present in the deduplicated
# frame. The prompts are looked up in df_train, because positions in
# df_train_dedup shift once rows have been removed.
dropped_prompts = df_train.loc[is_duplicate, 'open_ai_prompt']
df_train_dedup[df_train_dedup['open_ai_prompt'].isin(dropped_prompts)]

Unnamed: 0,open_ai_prompt,open_ai_completion
1324,Schema: create table movie(\tmid int primary k...,SELECT director FROM Movie WHERE title = 'Av...
1842,Schema: create table `employees` ( `employee_...,SELECT DISTINCT department_id FROM employees ...
1843,Schema: create table `employees` ( `employee_...,SELECT DISTINCT department_id FROM employees ...


In [34]:
# Put the deduplicated training data into jsonl format: one {"prompt", "completion"} object per line
data = [
    {"prompt": prompt, "completion": completion}
    for prompt, completion in zip(df_train_dedup["open_ai_prompt"], df_train_dedup["open_ai_completion"])
]

# Timestamp the file name so repeated runs do not overwrite each other
timestr = time.strftime("%Y%m%d-%H%M%S")
with open("spider_open_ai_fine_tuning_" + timestr + ".jsonl", "w") as outfile:
    for record in data:
        outfile.write(json.dumps(record))
        outfile.write("\n")

print("spider_open_ai_fine_tuning_" + timestr + ".jsonl")

spider_open_ai_fine_tuning_20230125-135903.jsonl


In [None]:
#!openai tools fine_tunes.prepare_data -f spider_open_ai_fine_tuning_20230125-135903.jsonl

In [None]:
#!openai api fine_tunes.create -t "spider_open_ai_fine_tuning_20230125-135903.jsonl" -m davinci

In [36]:
# List the fine-tune jobs through the OpenAI CLI (status and resulting model name per job).
# NOTE(review): the saved output includes the organization id - consider clearing it before sharing.
!openai api fine_tunes.list

{
  "data": [
    {
      "created_at": 1672357089,
      "fine_tuned_model": "davinci:ft-mercator-2022-12-29-23-47-24",
      "hyperparams": {
        "batch_size": 1,
        "learning_rate_multiplier": 0.1,
        "n_epochs": 4,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-ZMShnMrnhxzay9r0mXXEcxyI",
      "model": "davinci",
      "object": "fine-tune",
      "organization_id": "org-ePmgB4qVo14GgUKdUQci6IGz",
      "result_files": [
        {
          "bytes": 7017,
          "created_at": 1672357645,
          "filename": "compiled_results.csv",
          "id": "file-FdDaGVVXGWC08jN4NwodgYnY",
          "object": "file",
          "purpose": "fine-tune-results",
          "status": "processed",
          "status_details": null
        }
      ],
      "status": "succeeded",
      "training_files": [
        {
          "bytes": 3380,
          "created_at": 1672357089,
          "filename": "openai_classification_fine_tuning.txt",
          "id": "file-APupvHFP5Oae

## In-Sample Testing

## Out-of-Sample Testing

In [None]:
# Working frame for the out-of-sample run. No earlier cell defines `tmp`, so
# build it from the test split on a fresh kernel; an existing `tmp` is kept so
# an interrupted run resumes where it stopped.
# NOTE(review): assumes `tmp` is meant to be the test rows of `df` plus an
# empty 'model_response' column - confirm against the original session.
if 'tmp' not in globals():
    tmp = df[df['train_test'] == 'test'].copy()
    tmp['model_response'] = ''

data = []
for idx, row in tmp.iterrows():
    print(idx)
    # Rows answered in a previous (partial) run are not sent to the API again
    if row['model_response'] != '':
        print('already completed')
        data.append(row['model_response'])
        continue
    # Without a schema there is nothing meaningful to prompt with
    if row['schema'] == '':
        print('missing schema')
        data.append('')
        continue
    
    new_response = call_model(row, engine="davinci:ft-mercator-2023-01-24-03-11-01")
    data.append(new_response)
    # Store the answer immediately so progress survives an interruption
    tmp.loc[idx,'model_response'] = new_response

In [None]:
def _normalize_sql(queries):
    """Normalise a Series of SQL strings for comparison.

    Lower-cases each query, removes all whitespace and strips trailing
    semicolons. Missing values become '' so they compare as a wrong answer
    instead of raising inside re.sub.
    """
    return (queries.fillna('')
                   .str.lower()
                   .apply(lambda query: re.sub(r'\s+', '', query).rstrip(';')))


# ignore white space, case, and trailing semicolons
# NOTE(review): no visible cell assigns df_test['model_response'] (the loop
# above writes to `tmp`) - make sure the responses are copied over first.
expected  = _normalize_sql(df_test['open_ai_completion'])
predicted = _normalize_sql(df_test['model_response'])
df_test['correct'] = np.where(expected == predicted, 1, 0)
# Share of exact matches after normalisation
df_test['correct'].mean()