# Collect data into a DataFrame

In [16]:
import re
from pathlib import Path

import pandas as pd


# root folder of the data set; its entries are iterated as batch folders below
data_dir = Path('data')

The data is laid out as `data/<batch>/<run>/parameters.py`. Processing steps:

 - iterate through batches and run-folders
 - extract gAA, gBB, gCC, gAB, gAC, gBC values from parameters.py files

In [32]:
def replace_one_parameter_with_one_value(input_str, name, val):
    """Replace the first stand-alone occurrence of a parameter name by its value.

    An occurrence counts as stand-alone when the character before it and the
    character after it are each either absent (string boundary) or one of the
    delimiters listed in ``special_symbol`` below. This keeps ``g`` from being
    replaced inside ``gCC``. The value is inserted wrapped in parentheses.

    Args:
        input_str (str): expression in which to substitute.
        name (str): parameter name; matched literally, not as a regular
            expression.
        val: replacement value; inserted via an f-string as ``(val)``.

    Return:
        input_str with at most one occurrence of name replaced.
    """
    if not name:
        # an empty name matches at every position and would only insert
        # spurious '(val)' fragments
        return input_str

    special_symbol = ('.', '-', '+', '*', '/', '(', ')', '%', ' ', ',', ':')

    # re.escape: names may contain regex metacharacters (e.g. an unbalanced
    # '(' or a '[0]' index), which would otherwise raise re.error or match
    # unintended text
    for m in re.finditer(re.escape(name), input_str):
        # character before the occurrence must be a delimiter (or absent)
        starts_clean = (m.start() == 0
                        or input_str[m.start() - 1] in special_symbol)
        # character after the occurrence must be a delimiter (or absent)
        ends_clean = (m.end() == len(input_str)
                      or input_str[m.end()] in special_symbol)
        if starts_clean and ends_clean:
            # replace only the first valid occurrence: the match offsets are
            # no longer valid once the string length has changed
            return input_str[:m.start()] + f'({val})' + input_str[m.end():]
    return input_str


def replace_parameters(input_str, par_dict):
    """Substitute parameter names by their values until nothing changes.

    One pass replaces at most one occurrence of every parameter in par_dict;
    passes are repeated until the string is stable, with a hard limit of
    1000 passes. This resolves chained definitions such as
     gCC = -5.9
     g = 2.0
     a = 2
     n = a*g
     gAC_prop = g - gCC*math.sqrt(gCC/g**2) + int(g*n)

    Args:
        input_str (str): expression containing parameter names.
        par_dict (dict): maps parameter name to its value.

    Return:
        the expression after the last substitution pass.
    """
    passes_left = 1000
    while passes_left > 0:
        passes_left -= 1
        before_pass = input_str
        for name, value in par_dict.items():
            input_str = replace_one_parameter_with_one_value(
                input_str, name, value)
        if input_str == before_pass:
            # fixed point reached, no parameter name left to substitute
            return input_str

    return input_str


def read_in_par(path_par_f, var_name):
    """Read in parameters.py file and extract requested variable.

    file structure:
    X = 1 # description
    Y = 2*X

    The file is scanned line by line and every simple assignment
    (``name = value``) is recorded as a string until var_name has been
    found. The value string is then evaluated; if that fails, names of
    previously recorded parameters are substituted by their value strings
    and the evaluation is retried.

    Args:
        path_par_f (Path): Path of parameters.py file
        var_name (str): name of parameter

    Return:
        value of parameter; the (substituted) value string if it cannot be
        evaluated; None if var_name is not assigned in the file.
    """
    temp_par_dict = {}  # collect all previously defined parameters
    with open(path_par_f, 'r') as f:
        for line in f:
            # split at the first '=' only, so that values which contain '='
            # themselves (e.g. keyword arguments) are kept intact
            lhs, sep, rhs = line.partition('=')
            if not sep:
                continue
            par_name_list = lhs.split()
            # skip lines without a single plain name in front of '=':
            # lines starting with '=' (e.g. '=====' underlines), left-hand
            # sides made of several tokens, and comments
            if len(par_name_list) != 1 or '#' in par_name_list[0]:
                continue
            par_name = par_name_list[0]
            # drop a trailing comment
            temp_par_dict[par_name] = rhs.split('#')[0].strip()
            if par_name == var_name:
                break

    if var_name not in temp_par_dict:
        return None

    # evaluate string, if necessary replace names with parameter values
    # NOTE(security): eval executes whatever expression the file contains;
    # only use this on trusted parameters.py files
    val_string = temp_par_dict[var_name]
    try:
        return eval(val_string)
    except Exception:
        # expected when the expression refers to other parameters
        pass
    val_string = replace_parameters(val_string, temp_par_dict)
    try:
        return eval(val_string)
    except Exception:
        # best effort: hand back the unevaluated expression
        return val_string

    

In [40]:
# initialize parameter dictionary: one list per DataFrame column
par_names = ('gBB', 'gCC', 'gAB', 'gAC', 'gBC')
par_dict = {par_name: [] for par_name in par_names}
par_dict['path'] = []


for batch_dir in sorted(data_dir.iterdir()):
    # stray files next to the batch folders (e.g. .DS_Store) cannot be
    # iterated and would raise NotADirectoryError
    if not batch_dir.is_dir():
        continue

    for run_dir in sorted(batch_dir.iterdir()):
        if not run_dir.is_dir():
            continue

        # get parameter values
        par_file = run_dir / 'parameters.py'
        for par_name in par_names:
            par_dict[par_name].append(read_in_par(par_file, par_name))

        # follows data_dir instead of hard-coding it; for
        # data_dir = Path('data') this is 'data/<batch>/<run>'
        par_dict['path'].append(run_dir.as_posix())



In [41]:
# build the table: every key of par_dict becomes one column

df = pd.DataFrame(data=par_dict)

In [42]:
df.shape

(18014, 6)