In [1]:
import pandas as pd
import glob
import os

In [2]:
''' Delete existing converted files '''

# Glob pattern that reaches every csv file, however deeply it is nested
pattern = '../seed_datasets/**/*.csv'

# Walk the tree and remove each file left behind by an earlier conversion run
for filepath in glob.glob(pattern, recursive=True):
    if not filepath.endswith('-converted.csv'):
        continue  # not a converted file; leave it alone
    os.remove(filepath)

# Scan again so the cell output lists any converted files that survived
remaining_converted_files = [
    candidate
    for candidate in glob.glob(pattern, recursive=True)
    if candidate.endswith('-converted.csv')
]
remaining_converted_files

[]

In [3]:
def _split_cells(row):
    """Split one table row on " | " and trim whitespace around every cell."""
    return [cell.strip() for cell in row.split(" | ")]


def _to_numeric_if_possible(column):
    """Return the column converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Explicit replacement for the deprecated errors='ignore' option:
        # a column with any non-numeric value is kept exactly as it was.
        return column


def string_to_dataframe(data_string):
    """Parse a "<0x0A>"-delimited table string into a DataFrame.

    Rows are separated by the literal token "<0x0A>" and cells by " | ".
    When the first row contains "TITLE" it is treated as a title line and
    returned separately; the row after it is then used as the header.

    Args:
        data_string: Raw table text.

    Returns:
        A ``(df, title)`` tuple. ``df`` holds the data rows under the header
        names, with each column converted to a numeric dtype when all of its
        values parse as numbers. ``title`` is the stripped title row, or
        ``None`` when the input has no title row.

    Raises:
        ValueError: If the input has no header row or no data rows, or if
            pandas rejects the rows because their cell count does not match
            the header.
    """
    # Strip every row (not only the data rows) so whitespace surrounding the
    # "<0x0A>" separator does not leak into the title or the header names.
    rows = [row.strip() for row in data_string.strip().split("<0x0A>")]
    # Drop blank rows, e.g. the empty piece left by a trailing "<0x0A>".
    rows = [row for row in rows if row]

    if not rows:
        raise ValueError("No header row found in the input.")

    # Check if first row is a title and handle it
    title = None
    if "TITLE" in rows[0]:
        title = rows.pop(0)
        if not rows:
            raise ValueError("No header row found after the title row.")

    header = _split_cells(rows.pop(0))

    if not rows:
        raise ValueError("No data rows found after the header row.")

    records = [_split_cells(row) for row in rows]

    # Create df and convert data to numeric where possible
    df = pd.DataFrame(records, columns=header).apply(_to_numeric_if_possible)

    return df, title

In [4]:
# Pattern for matching csvs in directory and its subdirectories
pattern = '../seed_datasets/**/*.csv'

for filepath in glob.glob(pattern, recursive=True):
    # Skip the outputs of a previous run: the pattern also matches them, and
    # converting them again would create "-converted-converted.csv" files.
    if filepath.endswith('-converted.csv'):
        continue

    # Process the data string into a dataframe and handle title if it exists
    try:
        # Read inside the try block with the same encoding used for writing.
        # A decoding failure raises UnicodeDecodeError, a ValueError subclass,
        # so it is reported for this file instead of aborting the whole loop.
        with open(filepath, 'r', encoding='utf-8') as file:
            data_string = file.read()

        df, title = string_to_dataframe(data_string)

        # Create new file name with '-converted' extension
        base, extension = os.path.splitext(filepath)
        new_name = f"{base}-converted{extension}"

        # Save df with new file name and handle title if necessary
        with open(new_name, 'w', newline='', encoding='utf-8') as file:
            if title:
                # The title goes on its own line ahead of the CSV header row
                file.write(title + "\n")
            df.to_csv(file, index=False, sep=',')
    except ValueError as e:
        print(f"Error processing {filepath}: {e}")

In [5]:
''' Check if CSV was converted properly '''
def test_csv_conversion(directory_pattern):
    not_converted_properly = []

    # Loop through each file in directory
    for filepath in glob.glob(directory_pattern, recursive=True):
        # Only check converted files
        if filepath.endswith('-converted.csv'):
            try:
                df = pd.read_csv(filepath)
                # If shape is (0, 1), add to list
                if df.shape == (0, 1):
                    not_converted_properly.append(filepath)
            except Exception as e: # If error reading CSV, add to list
                not_converted_properly.append(filepath)

    return not_converted_properly

# Run the check over every csv under ../seed_datasets and display the flagged paths
csvs_not_converted_properly = test_csv_conversion('../seed_datasets/**/*.csv')
csvs_not_converted_properly

['../seed_datasets/PlotQA/test/csv/plotQA_0-test-converted.csv']

In [6]:
# Load the converted file listed in the check's output so it can be inspected
example = pd.read_csv('../seed_datasets/PlotQA/test/csv/plotQA_0-test-converted.csv')
example

Unnamed: 0,"TITLE | Payments made towards primary income, imports of goods and services by Serbia\nYear,Goods and services,Primary income\n2007,2185915000,2335885000\n2008,2774650000,2986385000\n2009,1899280000,2040635000\n2010,1940540000,2088985000\n2011,2311370000,2475155000\n2012,2185760000,2414747000"""""""


In [7]:
# Shape of the loaded example as (rows, columns)
example.shape

(0, 1)