In [1]:
import pandas as pd
import glob
import os

In [2]:
# Paths to datasets with DePlot tables
data_pattern = '../seed_datasets_100/3_translated_data_100/**/*-ctt.csv'

In [3]:
''' Convert data string to a DataFrame and a title '''
def string_to_dataframe_and_title(data_string, filepath):
    # Split the data into rows
    rows = data_string.strip().split("&&&")
    
    title = None
    header = None

    # Check if the first row is a title and extract it
    if "TITLE |" in rows[0]:
        potential_title = rows[0].split(" | ", 1)[1]  # Split and take second part as the potential title
        if potential_title.strip().lower() not in ['', 'title']: # Handle cases with placeholder titles
            title = potential_title  # Only set title if it's meaningful
        rows.pop(0)  # Remove the title row from processing

    # The next row should be the header
    if rows:
        header = [h.strip() for h in rows.pop(0).split("|") if h.strip()]
    else:
        raise ValueError("No header row found after the title row.")

    # Check if there are any data rows left after removing title and header
    if not rows:
        raise ValueError("No data rows found after the header row.")

    # Split remaining rows based on number of headers
    records = []
    for row in rows:
        record = [r.strip() for r in row.strip().split("|") if r.strip()]
        if len(record) != len(header):
            print(f"File: {filepath}: Row has {len(record)} columns, expected {len(header)} - row data: {record}")
            return None, title  # to keep original CSV instead
        records.append(record)

    # Create df and convert data to numeric where possible
    df = pd.DataFrame(records, columns=header).apply(pd.to_numeric, errors='ignore')

    return df, title

In [4]:
''' Save the title to a separate txt file '''
def save_title_to_file(title, filepath):
    if title:
        title_filepath = f"{filepath.rsplit('.', 1)[0]}-title.txt" # Add '-title' extention
        with open(title_filepath, 'w') as f:
            f.write(title)

In [5]:
# Pattern for matching csvs in FigureQA and PlotQA
for filepath in glob.glob(data_pattern, recursive=True):
    with open(filepath, 'r') as file:
        data_string = file.read()

    try:
        result = string_to_dataframe_and_title(data_string, filepath)
        if result is None:
            print(f"Skipping conversion for {filepath} because it removes data.")
            continue

        df, title = result

        if df is not None:
            # Save the DataFrame (overwrite original file)
            df.to_csv(filepath, index=False)
                
            # Save the title to a separate file
            save_title_to_file(title, filepath)
    except ValueError as e:
        print(f"Error processing {filepath}: {e}")

File: ../seed_datasets_100/3_translated_data_100/PlotQA/train/tables/plotQA_58019-train-ctt.csv: Row has 4 columns, expected 3 - row data: ['Russian Federation', '-0.7474747474747474784', '1986', '1986']
File: ../seed_datasets_100/3_translated_data_100/PlotQA/train/tables/plotQA_126784-train-ctt.csv: Row has 2 columns, expected 3 - row data: ['Sri Lanka', '0.2']
File: ../seed_datasets_100/3_translated_data_100/PlotQA/train/tables/plotQA_92173-train-ctt.csv: Row has 4 columns, expected 3 - row data: ['& United Kingdom', '3', '2', '23.62929292929262695269531']
File: ../seed_datasets_100/3_translated_data_100/PlotQA/val/tables/plotQA_19779-val-ctt.csv: Row has 4 columns, expected 3 - row data: ['Self-employed', '4', '5', '51.2000007629395']
File: ../seed_datasets_100/3_translated_data_100/FigureQA/train/tables/figureQA_63892-train-ctt.csv: Row has 9 columns, expected 7 - row data: ['Web Pry', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0']
File: ../seed_datasets_100/3_translated_d

In [6]:
''' Check if CSV has any data '''
def test_csv_conversion(directory_pattern):
    not_converted_properly = []

    # Loop through each file in directory
    for filepath in glob.glob(directory_pattern, recursive=True):
        # Only check converted files
        if filepath.endswith('-dp.csv'):
            try:
                df = pd.read_csv(filepath)
                # If shape is (0, 1), add to list
                if df.shape == (0, 1):
                    not_converted_properly.append(filepath)
            except Exception as e: # If error reading CSV, add to list
                not_converted_properly.append(filepath)

    return not_converted_properly

# Check for files not converted properly
csvs_not_converted_properly = test_csv_conversion(data_pattern)
csvs_not_converted_properly

[]