# Exploration of the SOTAB dataset

In [1]:
import os
import random
import tiktoken
import pandas as pd

In [2]:
# tiktoken encoders looked up by model name; the token statistics computed
# further down in this notebook count tokens with `gpt4_enc`.
gpt3_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
gpt4_enc = tiktoken.encoding_for_model("gpt-4")

In [3]:
def read_all(directory):
    """Load every ``.json.gz`` file found directly inside ``directory``.

    Each matching file is parsed as gzip-compressed, line-delimited JSON.
    Files are visited in sorted filename order: ``os.listdir`` returns
    entries in arbitrary order, which would otherwise make the order of the
    returned list vary between machines and runs.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        A list with one ``pandas.DataFrame`` per ``.json.gz`` file, ordered
        by filename. The list is empty when no file matches.
    """
    return [
        pd.read_json(os.path.join(directory, filename), compression='gzip', lines=True)
        for filename in sorted(os.listdir(directory))
        if filename.endswith('.json.gz')
    ]

def read_one(directory, name):
    """Load the first file in ``directory`` whose filename contains ``name``.

    Candidates are examined in sorted filename order so that, when several
    filenames contain ``name``, the same file is chosen on every run
    (``os.listdir`` alone returns entries in arbitrary order). The chosen
    file is parsed as gzip-compressed, line-delimited JSON.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).
        name: Substring that the filename must contain.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when no filename
        contains ``name``.
    """
    for filename in sorted(os.listdir(directory)):
        if name in filename:
            file_path = os.path.join(directory, filename)
            return pd.read_json(file_path, compression='gzip', lines=True)
    return None

def average_rows_columns(list_data, encoding=None):
    """Compute size statistics over a collection of DataFrames.

    Every DataFrame is serialised to line-delimited JSON records and the
    resulting text is tokenised to obtain its token count.

    Args:
        list_data: Sized collection (e.g. a list) of ``pandas.DataFrame``.
        encoding: Object exposing ``encode(text)`` that returns a sequence of
            tokens. Defaults to the notebook-level ``gpt4_enc`` encoder, which
            preserves the behaviour of calls that pass only ``list_data``.

    Returns:
        A tuple ``(avg_rows, avg_columns, avg_tokens, min_tokens, max_tokens)``.
        All five values are ``0`` when ``list_data`` is empty.
    """
    num_dataframes = len(list_data)
    if num_dataframes == 0:
        return 0, 0, 0, 0, 0

    # Resolve the default lazily so an explicit encoder never needs the global.
    if encoding is None:
        encoding = gpt4_enc

    total_rows = 0
    total_columns = 0
    total_tokens = 0
    min_tokens, max_tokens = float('inf'), 0

    for dataframe in list_data:
        n_rows, n_columns = dataframe.shape
        total_rows += n_rows
        total_columns += n_columns

        # Token count of the table rendered as one JSON record per line.
        dataframe_json = dataframe.to_json(orient='records', lines=True)
        this_tokens = len(encoding.encode(dataframe_json))

        total_tokens += this_tokens
        min_tokens = min(min_tokens, this_tokens)
        max_tokens = max(max_tokens, this_tokens)

    avg_rows = total_rows / num_dataframes
    avg_columns = total_columns / num_dataframes
    avg_tokens = total_tokens / num_dataframes

    return avg_rows, avg_columns, avg_tokens, min_tokens, max_tokens

In [4]:
# Locations of the CPA test tables and the accompanying CSV
# (presumably the ground-truth labels, judging by the "_gt" suffix).
RE_TEST_PATH = "./CPA_Test/Test"
RE_TS_DATA = "./CPA_Test/CPA_test_gt.csv"

# Fix the seed of Python's `random` module.
random.seed(42)

In [5]:
# Read every table under RE_TEST_PATH and report how many were loaded.
sotab_test = read_all(directory=RE_TEST_PATH)
print(f'RE, SOTAB: Number of tables in SOTAB test {len(sotab_test)}')

RE, SOTAB: Number of tables in SOTAB test 6480


In [6]:
# Size statistics (rows, columns, tokens) over the tables currently held in `sotab_test`.
avg_rows, avg_columns, avg_tokens, min_tokens, max_tokens = average_rows_columns(sotab_test)
print(f"RE, SOTAB: Average Rows: {avg_rows}, Average Columns: {avg_columns}, Average Tokens: {avg_tokens}, Min Tokens: {min_tokens}, Max Tokens: {max_tokens}")

RE, SOTAB: Average Rows: 249.12438271604938, Average Columns: 9.322685185185184, Average Cokens: 46360.17083333333, Min Tokens: 205, Max Tokens: 14382326


In [7]:
# Locations of the CTA test tables and the accompanying CSV
# (presumably the ground-truth labels, judging by the "_gt" suffix).
CTA_TEST_PATH = "./CTA_Test/Test"
CTA_TS_DATA = "./CTA_Test/CTA_test_gt.csv"

# Fix the seed of Python's `random` module.
random.seed(42)

In [8]:
# Read every table under CTA_TEST_PATH and report how many were loaded.
# Note: this rebinds `sotab_test`, replacing the tables loaded earlier.
sotab_test = read_all(directory=CTA_TEST_PATH)
print(f'CTA, SOTAB: Number of tables in SOTAB test {len(sotab_test)}')

CTA, SOTAB: Number of tables in SOTAB test 7026


In [9]:
# Size statistics (rows, columns, tokens) over the tables currently held in `sotab_test`.
avg_rows, avg_columns, avg_tokens, min_tokens, max_tokens = average_rows_columns(sotab_test)
print(f"CTA, SOTAB: Average Rows: {avg_rows}, Average Columns: {avg_columns}, Average Tokens: {avg_tokens}, Min Tokens: {min_tokens}, Max Tokens: {max_tokens}")

CTA, SOTAB: Average Rows: 199.76757756902933, Average Columns: 8.37688585254768, Average Cokens: 40862.961144321096, Min Tokens: 156, Max Tokens: 12732297
