In [2]:
import pandas as pd
import glob
import os

In [3]:
# Paths to datasets with DePlot tables
data_pattern = '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/**/*-dp.csv'

In [6]:
''' Convert data string to a DataFrame and a title '''
def string_to_dataframe_and_title(data_string, filepath):
    # Split the data into rows
    rows = data_string.strip().split("<0x0A>")
    
    title = None
    header = None

    # Check if the first row is a title and extract it
    if "TITLE |" in rows[0]:
        potential_title = rows[0].split(" | ", 1)[1]  # Split and take second part as the potential title
        if potential_title.strip().lower() not in ['', 'title']: # Handle cases with placeholder titles
            title = potential_title  # Only set title if its meaningful
        rows.pop(0)  # Remove the title row from processing

    # The next row should be the header
    if rows:
        header = [h.strip() for h in rows.pop(0).split("|") if h.strip()]
        # rows.pop(0).split(" | ")
    else:
        raise ValueError("No header row found after the title row.")

    # Check if there are any data rows left after removing title and header
    if not rows:
        raise ValueError("No data rows found after the header row.")

    # Split remaining rows based on number of headers
    records = []
    for row in rows:
        record = [r.strip() for r in row.strip().split("|") if r.strip()]
        if len(record) != len(header):
            print(f"File: {filepath}: Row has {len(record)} columns, expected {len(header)} - row data: {record}")
            return None, title  # to keep original CSV instead
        records.append(record)

    # Create df and convert data to numeric where possible
    df = pd.DataFrame(records, columns=header).apply(pd.to_numeric, errors='ignore')

    return df, title

In [7]:
''' Save the title to a separate txt file '''
def save_title_to_file(title, filepath):
    if title:
        title_filepath = f"{filepath.rsplit('.', 1)[0]}-title.txt" # Add '-title' extention
        with open(title_filepath, 'w') as f:
            f.write(title)

In [8]:
# Pattern for matching csvs in FigureQA and PlotQA
for filepath in glob.glob(data_pattern, recursive=True):
    with open(filepath, 'r') as file:
        data_string = file.read()

    try:
        result = string_to_dataframe_and_title(data_string, filepath)
        if result is None:
            print(f"Skipping conversion for {filepath} because it removes data.")
            continue

        df, title = result

        if df is not None:
            # Save the DataFrame (overwrite original file)
            df.to_csv(filepath, index=False)
                
            # Save the title to a separate file
            save_title_to_file(title, filepath)
    except ValueError as e:
        print(f"Error processing {filepath}: {e}")

File: /Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_81266-train-dp.csv: Row has 7 columns, expected 8 - row data: ['Lime Green', '94.1', '94.7', '94.8', '94.17', '94.08', '94.0']
File: /Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_16865-train-dp.csv: Row has 4 columns, expected 3 - row data: ['Dark Orchid', '1', '1', '1']
File: /Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_15311-train-dp.csv: Row has 4 columns, expected 3 - row data: ['Medium Aqua', '1.000', '11.000', '11.000']
File: /Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_86227-train-dp.csv: Row has 4 columns, expected 6 - row data: ['xaxis_label', '86.7', '67.3', '6']
File: /Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_10137-train-dp.csv: Row has 6 columns, expec

  df = pd.DataFrame(records, columns=header).apply(pd.to_numeric, errors='ignore')


In [9]:
''' Check if CSV was converted properly '''
def test_csv_conversion(directory_pattern):
    not_converted_properly = []

    # Loop through each file in directory
    for filepath in glob.glob(directory_pattern, recursive=True):
        # Only check converted files
        if filepath.endswith('-dp.csv'):
            try:
                df = pd.read_csv(filepath)
                # If shape is (0, 1), add to list
                if df.shape == (0, 1):
                    not_converted_properly.append(filepath)
            except Exception as e: # If error reading CSV, add to list
                not_converted_properly.append(filepath)

    return not_converted_properly

# Check for files not converted properly
csvs_not_converted_properly = test_csv_conversion(data_pattern)
csvs_not_converted_properly

['/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_81266-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_16865-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_15311-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_86227-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_10137-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_21272-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_88572-train-dp.csv',
 '/Users/angwang/ChartFC/seed_datasets_150_GF/3_translated_data_150_GF/FigureQA/train/tables/figureQA_38848-train-dp.csv',
 '/Users/angwang

In [10]:
# ''' Delete existing converted files '''
# # Recursively match files in subdirectories
# for pattern in patterns:
#     for filepath in glob.glob(pattern, recursive=True):
#         if filepath.endswith('-converted.csv') or filepath.endswith('-title.txt'):
#             os.remove(filepath)  # Delete the file

# # Check for remaining files
# remaining_converted_files = []
# for pattern in patterns:
#     remaining_converted_files.extend([f for f in glob.glob(pattern, recursive=True) if f.endswith('-converted.csv') or f.endswith('-title.txt')])
# remaining_converted_files