In [1]:
import os
import pandas as pd

# Walk every subparser output file (vector only) under a directory and condense
# their summary tables into one table.
# The only settings required are the input and output directories:
#     input_dir:  top-level directory containing all subparser output; the same
#                 directory that was given to the subparser script as its output dir
#     output_dir: directory the combined CSV produced by this code is written to
input_dir = 'subparsing/'
output_dir = 'condensed_subparsed/'

# you can probably leave the rest of this code as-is
SUBPARSER_FILE_MARKER = 'subparsed.tsv'
SUMMARY_COLUMNS = ['File', 'Category', 'Sequences', 'Percent_of_File', 'Tile_Patterns', 'Percent_Full']


def find_subparser_files(root_dir):
    """Return the paths of all subparser output files below ``root_dir``.

    A file qualifies when its name contains ``'subparsed.tsv'``. Every matching
    file in every directory is returned (not just the first one found per
    directory), sorted so the result does not depend on the OS listing order.
    """
    matches = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if SUBPARSER_FILE_MARKER in filename:
                matches.append(os.path.join(dirpath, filename))
    return sorted(matches)


def parse_summary_table(lines, source_name):
    """Extract the top summary table from one subparser output file.

    Args:
        lines: iterable of text lines (e.g. an open file handle).
        source_name: value stored in the first column of every returned row.

    Returns:
        A list of rows ``[source_name, category, sequences, percent, ...]``.
        Reading stops at the first line containing ``'Totals'``; that row IS
        kept and gets ``'N/A'`` appended so it has as many columns as the data
        rows. The second field of data rows is converted to ``float`` (it is
        left as text on the Totals row) and the third field of every row is
        turned from a fraction into a percentage rounded to 2 decimals.
    """
    table = []
    for line in lines:
        fields = line.split()
        if not fields:  # blank line: nothing to parse
            continue
        if 'Bin' in line:  # skip header row
            continue
        if 'Totals' in line:  # last row of the table: keep it, then stop reading
            fields.append('N/A')
        else:
            fields[1] = float(fields[1])
        fields[2] = round(float(fields[2]) * 100, 2)
        table.append([source_name] + fields)  # add source file as first column
        if 'Totals' in line:
            break
    return table


def build_summary_frame(matrices):
    """Combine per-file row lists into one DataFrame sorted for output.

    Rows are ordered by ``File`` ascending and then by ``Sequences`` descending.
    Rows whose ``Sequences`` value is not a float (the Totals rows) sort after
    the float-valued rows of the same file.

    Raises:
        ValueError: if a row does not have exactly one value per column.
    """
    rows = [row for matrix in matrices for row in matrix]
    for row in rows:
        if len(row) != len(SUMMARY_COLUMNS):
            raise ValueError(f'expected {len(SUMMARY_COLUMNS)} columns, got {len(row)}: {row}')
    # Build the frame in one go instead of appending row by row.
    frame = pd.DataFrame(rows, columns=SUMMARY_COLUMNS)
    sort_key = frame['Sequences'].apply(lambda x: x if isinstance(x, float) else -1000)
    return (
        frame.assign(sort_col=sort_key)
        .sort_values(by=['File', 'sort_col'], ascending=[True, False])
        .drop(columns='sort_col')
    )


# get all subparser output files in directory (recursive)
subparser_output_directory = input_dir
list_of_subparser_files = find_subparser_files(subparser_output_directory)

# get top data table from each subparser file
subparser_file_matrices = []
for subparser_file in list_of_subparser_files:
    with open(subparser_file, 'r') as f:
        subparser_file_matrices.append(parse_summary_table(f, os.path.basename(subparser_file)))

# convert matrices list into a single dataframe
subparser_dataframe = build_summary_frame(subparser_file_matrices)

output_filename = 'all.subparsed.csv'
if output_dir:
    os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
subparser_dataframe.to_csv(os.path.join(output_dir, output_filename), index=False)
print('done')

done


In [3]:
import pandas as pd  # ad hoc script condenses data for paper

import csv  # local to this cell: used to parse the CSV written by the previous step

# Collapse the combined subparser table into one row per source file.
# For each file the output holds:
#   Subclassification  - the part of the file name before '_m_'
#   Mutation_Rate      - the number after '_m_' (up to the first '.') divided by 100
#   Matches            - Sequences of the first category whose name occurs in the file name
#   Misses             - summed Sequences of every other category
#   Proportion_Matches - Matches / (Matches + Misses)
# A file's row is only emitted when its 'Totals' row is reached, so the input
# must list each file's rows contiguously with 'Totals' last.
input_file = 'condensed_subparsed/all.subparsed.csv'
output_file = 'condensed_subparsed/all.subparsed_condensed.csv'

condensed_columns = ['Subclassification', 'Mutation_Rate', 'Matches', 'Misses', 'Proportion_Matches']
empty_row = {'Subclassification': '', 'Mutation_Rate': 0, 'Matches': 0, 'Misses': 0, 'Proportion_Matches': 0}
row = empty_row.copy()
records = []  # finished rows; turned into a DataFrame once at the end

with open(input_file, 'r', newline='') as file:
    # csv.reader honours the quoting pandas applies to fields that contain commas,
    # which a plain split(',') would break apart.
    for fields in csv.reader(file):
        if not fields or fields[0] == 'File':  # skip blank lines and the header
            continue
        source_name, category = fields[0], fields[1]
        expected_category = source_name.split('_m_')[0]

        if not row['Subclassification']:
            # First row of a new file: record its identity, then fall through so
            # this row's counts are tallied as well.
            row['Subclassification'] = expected_category
            row['Mutation_Rate'] = float(source_name.split('.')[0].split('_m_')[1]) / 100
        elif category == 'Totals':
            total = row['Matches'] + row['Misses']
            # No sequences at all: the proportion is undefined rather than an error.
            row['Proportion_Matches'] = row['Matches'] / total if total else float('nan')
            records.append(row)
            row = empty_row.copy()
            continue

        if row['Matches'] and category == expected_category:
            # An earlier (substring) match already claimed Matches; refuse to guess.
            raise ValueError(f'\nmatch should not be set twice; line:{fields}\nrow:{row}')
        elif not row['Matches'] and category in source_name:
            row['Matches'] = float(fields[2])
        else:
            row['Misses'] += float(fields[2])

df = pd.DataFrame(records, columns=condensed_columns)
df.to_csv(output_file, index=False)
print('done successfully')

done successfully
