In [50]:
import re
from pathlib import Path
import glob

#PURPOSE: PRE-PROCESSOR FOR THINKORSWIM CSV FILES (REMOVE HEADERS, SAVE TO REAL CSV)

#STEP 1: DOWNLOAD OPTIONS DATA FOR SYMBOL, DATES OF INTEREST AND SAVE IN 'input' FOLDER

#STEP 2: THIS SCRIPT WILL CREATE 'output' WITH SUBFOLDERS: {SYMBOL}_{DATE} AND PUT REAL CSV DATA IN FILE INSIDE THIS FOLDER

def extract_date_and_symbol(filename):
    """Extract the date and symbol encoded in an export file name.

    The file name is expected to look like
    ``YYYY-MM-DD-<anything>For<SYMBOL>.csv``.

    Args:
        filename: File name or full path of the export file.

    Returns:
        A ``(date, symbol)`` tuple of strings, or ``None`` when the file name
        does not follow the expected pattern.
    """
    pattern = r'(\d{4}-\d{2}-\d{2})-.*?For(.*?)\.csv'

    # Match against the final path component only, so that a date or the
    # text 'For' appearing in a parent directory name cannot be captured.
    match = re.search(pattern, Path(filename).name)
    if match is None:
        return None

    extracted_date = match.group(1)
    extracted_text = match.group(2).strip()  # text between 'For' and '.csv'
    return (extracted_date, extracted_text)

# Convert every export in 'input' into a plain CSV stored at
# 'output/{SYMBOL}_{DATE}/{DATE}.csv'.
HEADER_LINE_COUNT = 12  # leading non-CSV lines that must be removed from each file

directory_path = Path('input')
csv_files_glob = glob.glob(str(directory_path / '*.csv'))

for csv_file in csv_files_glob:
    parsed = extract_date_and_symbol(csv_file)
    if parsed is None:
        # The name does not carry a date/symbol; skip it instead of crashing
        # while unpacking None.
        print(f"Skipping '{csv_file}': file name does not match the expected pattern")
        continue
    date, symbol = parsed

    with open(csv_file, 'r') as file:
        lines = file.readlines()

    csv_data = lines[HEADER_LINE_COUNT:]  # everything after the header lines

    # parents=True also creates the top-level 'output' folder on first use.
    output_dir = Path('output', f'{symbol}_{date}')
    output_dir.mkdir(parents=True, exist_ok=True)

    output_file = output_dir / f'{date}.csv'
    with open(output_file, 'w') as file:
        file.writelines(csv_data)

In [None]:
import csv
import glob
import re
from pathlib import Path

import pandas as pd

#PURPOSE: READ REAL CSV FILES WITH STOCK OPTION INFORMATION BY DAY, PARSE INTO SUBTABLES BY EXPIRATION DATE

#OUTPUT WILL BE SINGLE EXCEL FILE WITH SEPARATE WORKSHEET FOR EACH OPTION EXPIRATION DATE

def create_output(subtables, output_file):
    """Write each subtable to its own worksheet of a single Excel file.

    Args:
        subtables: Mapping of a date string to the CSV text of one subtable.
            The first line of the text is the header row, the remaining
            lines are data rows.
        output_file: Path (``Path`` or ``str``) of the Excel workbook. It is
            created when missing; otherwise sheets are added to it.

    Each worksheet is named after its date string with spaces replaced by
    underscores. The first two columns of every subtable are dropped.
    Subtables with no content are skipped.
    """
    output_file = Path(output_file)

    # Parse everything first so a malformed subtable fails before the
    # workbook is touched.
    frames = {}
    for date_str, subtable_data in subtables.items():
        # csv.reader keeps quoted fields that contain commas in one piece,
        # which a plain split(',') would break apart.
        rows = list(csv.reader(subtable_data.strip().splitlines()))
        if not rows:
            continue  # nothing to write for an empty subtable

        header, data_rows = rows[0], rows[1:]
        df = pd.DataFrame(data_rows, columns=header)

        # Drop the first two columns by position; dropping by label would
        # also remove any later column that happens to share their name.
        df = df.iloc[:, 2:]

        frames[date_str.replace(" ", "_")] = df

    # Append mode needs an existing workbook, so initialise an empty one
    # on first use.
    if not output_file.exists():
        pd.DataFrame().to_excel(output_file, index=False)

    # if_sheet_exists='replace' lets the function be re-run on the same
    # workbook instead of failing on an already existing sheet name.
    with pd.ExcelWriter(output_file, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        for sheet_name, df in frames.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)


# For every '{SYMBOL}_{DATE}' folder produced by the previous step, split the
# CSV inside it into subtables and write them to an .xlsx next to the CSV.
src_folder_path = Path('output') #PREV. STEP 'output'
subfolders = [f for f in src_folder_path.iterdir() if f.is_dir()]

for src_folder in subfolders:
    directory_path = Path(src_folder)
    csv_files_glob = glob.glob(str(directory_path / '*.csv')) #SHOULD ONLY BE 1 FILE

    for input_file in csv_files_glob:
        with open(input_file, 'r') as file:
            raw_data = file.read()

        subtables = {}
        date_str = ''
        subtable_rows = []

        # Iterate the text that was just read from this file. The extra ''
        # at the end acts as a final blank line, so the last subtable is
        # saved even when the file does not end with one.
        for line in raw_data.splitlines() + ['']:
            if ',' in line:
                # A line with commas is a header or data row of the subtable.
                subtable_rows.append(line)
            elif line:
                # A non-empty line without commas is the subtable title:
                # keep the text before '(' minus its last two characters.
                date_str = line.split('(')[0][:-2]
            else:
                # Blank line: save the collected subtable, if any. Skipping
                # empty ones avoids storing '' after consecutive blank lines.
                if subtable_rows:
                    subtables[date_str] = '\n'.join(subtable_rows) + '\n'

                #RESET SUBTABLE TALLY FOR NEXT ITERATION
                date_str = ''
                subtable_rows = []

        # with_suffix only changes the extension; str.replace would also
        # rewrite '.csv' occurring elsewhere in the path.
        output_file = Path(input_file).with_suffix('.xlsx')
        create_output(subtables, output_file)
