In [1]:
import os
import json
import csv
import statistics
import re
from pathlib import Path

In [7]:
def extract_numbers_with_min_digits(data, min_digits=1):
    """
    Extracts all numbers with a minimum number of digits from a nested JSON structure,
    capturing specific patterns including numbers with commas, decimals,
    and certain currency symbols (€). It ignores patterns followed by any currency symbol except €.
    """
    numbers = []
    excluded_numbers = ['50', '50.0', '50.00', '50.000']  # Numbers to be excluded

    if isinstance(data, dict):
        for value in data.values():
            numbers.extend(extract_numbers_with_min_digits(value, min_digits))

    elif isinstance(data, list):
        for item in data:
            numbers.extend(extract_numbers_with_min_digits(item, min_digits))

    elif isinstance(data, str):
        # Regex pattern to capture numbers with commas, decimals, and € currency symbol
        # Ignores patterns followed by any currency symbol except €
        pattern = r'(?<!\S)(?:€)?(\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)\b'
        found_numbers = re.findall(pattern, data)
        for num in found_numbers:
            # Cleaning the number format
            num_clean = num.replace(',', '').replace('€', '')
            # Check if the number has at least min_digits on the left side of the decimal
            if '.' in num_clean:
                left_digits = num_clean.split('.')[0]
                if len(left_digits) >= min_digits and num_clean not in excluded_numbers:
                    numbers.append(float(num_clean))
            else:
                if len(num_clean) >= min_digits and num_clean not in excluded_numbers:
                    numbers.append(float(num_clean))

    return numbers

def process_json_file(json_file_path, folder_name):
    """
    Process a single JSON file and return the extracted data for each person.
    Includes both the original and modified list of extracted numbers for each person.
    """
    with open(json_file_path, 'r') as json_file:
        conversation = json.load(json_file)
        persons_data = {}

        # Extract numbers for each person and maintain original and modified lists
        for message in conversation:
            name = message['name']
            content = message['content']
            role = message['role']  # Assuming 'role' is a key in the JSON data indicating the party's role
            if name not in persons_data:
                persons_data[name] = {'original': [], 'modified': [], 'rounds': 0}

            content_numbers = extract_numbers_with_min_digits(content)
            persons_data[name]['original'].extend(content_numbers)

            # Modify the list by removing the first number if more than one
            modified_numbers = content_numbers[1:] if len(content_numbers) > 1 else content_numbers
            persons_data[name]['modified'].extend(modified_numbers)

            # Increment the number of rounds for the current party (name)
            persons_data[name]['rounds'] += 1

        # Compile data for CSV
        csv_data = []
        for name, data in persons_data.items():
            original_numbers = data['original']
            modified_numbers = data['modified']
            rounds = data['rounds']

            min_price = min(modified_numbers) if modified_numbers else 0
            max_price = max(modified_numbers) if modified_numbers else 0
            avg_price = statistics.mean(modified_numbers) if modified_numbers else 0
            last_price = modified_numbers[-1] if modified_numbers else 0
            csv_data.append([50, name, min_price, max_price, avg_price, last_price, folder_name, rounds, original_numbers, modified_numbers])

        return csv_data

def write_to_csv(csv_file_path, csv_data):
    """
    Write data to a CSV file.
    """
    with open(csv_file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Apartment Size', 'Name', 'Min Price', 'Max Price', 'Avg Price', 'Last Price', 'Folder Name', 'Rounds', 'Original Price List', 'Modified Price List'])
        # Write data to a CSV file.
        with open(csv_file_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Apartment Size', 'Name', 'Min Price', 'Max Price', 'Avg Price', 'Last Price', 'Folder Name', 'Rounds', 'Original Price List', 'Modified Price List'])
            for row in csv_data:
                # Convert the lists of prices to string representations for CSV writing
                if isinstance(row[-3], list):
                    row[-3] = ', '.join(map(str, row[-3]))  # Original Price List
                if isinstance(row[-2], list):
                    row[-2] = ', '.join(map(str, row[-2]))  # Modified Price List
                writer.writerow(row)

def process_folder(base_path, output_directory):
    """
    Process each folder and create a single CSV file containing data
    from all JSON files within the subfolders of each folder.
    """
    all_csv_data = []

    # Extract the parent folder name from the base path
    parent_folder_name = os.path.basename(os.path.normpath(base_path))

    for json_folder in os.listdir(base_path):
        json_folder_path = os.path.join(base_path, json_folder)

        if os.path.isdir(json_folder_path):
            for file in os.listdir(json_folder_path):
                if file.endswith('.json'):
                    json_file_path = os.path.join(json_folder_path, file)
                    csv_data = process_json_file(json_file_path, json_folder)
                    all_csv_data.extend(csv_data)

    # Write all the data to a single CSV file with the parent folder name as the filename
    csv_file_name = f"{parent_folder_name}.csv"
    csv_file_path = os.path.join(output_directory, csv_file_name)
    write_to_csv(csv_file_path, all_csv_data)


In [8]:
# Get the directory of the notebook file
current_directory = os.getcwd()

# Set the base path and output directory accordingly
base_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..', 'hubsim', 'single-factor-experiments\Open-Ended', 'bagel-bagel'))
output_directory = os.path.abspath(os.path.join(current_directory, '..', '..', 'parsing', 'output-04.03.2024\Open-Ended', 'Bagel-Bagel'))

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Get a list of all subfolders in the "bagel-dpo-34b-v0.2-bagel-dpo-34b-v0.2" directory
subfolders = [f.path for f in os.scandir(base_path) if f.is_dir()]

# Process each folder
for folder in subfolders:
    process_folder(folder, output_directory)


  base_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..', 'hubsim', 'single-factor-experiments\Open-Ended', 'bagel-bagel'))
  output_directory = os.path.abspath(os.path.join(current_directory, '..', '..', 'parsing', 'output-04.03.2024\Open-Ended', 'Bagel-Bagel'))


In [5]:
# Set the base path and output directory accordingly
base_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..', 'hubsim', 'single-factor-experiments\Open-Ended', 'bagel-Yi'))
output_directory = os.path.abspath(os.path.join(current_directory, '..', '..', 'parsing', 'output-04.03.2024\Open-Ended', 'Bagel-Yi'))

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Get a list of all subfolders in the "bagel-dpo-34b-v0.2-Yi-34B-Chat" directory
subfolders = [f.path for f in os.scandir(base_path) if f.is_dir()]

# Process each folder
for folder in subfolders:
    process_folder(folder, output_directory)


In [6]:
# Set the base path and output directory accordingly
base_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..', 'hubsim', 'single-factor-experiments\Open-Ended', 'Yi-bagel'))
output_directory = os.path.abspath(os.path.join(current_directory, '..', '..', 'parsing', 'output-04.03.2024\Open-Ended', 'Yi-Bagel'))


# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Get a list of all subfolders in the "Yi-34B-Chat-Yi-34B-Chat" directory
subfolders = [f.path for f in os.scandir(base_path) if f.is_dir()]

# Process each folder
for folder in subfolders:
    process_folder(folder, output_directory)


In [None]:
# Set the base path and output directory accordingly
base_path = os.path.abspath(os.path.join(current_directory, '..', '..', '..', 'hubsim', 'single-factor-experiments\Open-Ended', 'Yi-Yi'))
output_directory = os.path.abspath(os.path.join(current_directory, '..', '..', 'parsing', 'output-04.03.2024\Open-Ended', 'Yi-Yi'))


# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Get a list of all subfolders in the "Yi-34B-Chat-Yi-34B-Chat" directory
subfolders = [f.path for f in os.scandir(base_path) if f.is_dir()]

# Process each folder
for folder in subfolders:
    process_folder(folder, output_directory)


In [10]:
test_data = '''The values are 1 and 12.0 and 5 and 25 and 50.50 and 50.0 and 50.00 and 50.000 and 50 and 50.000, 50,000 and 2.50 and 1214 test 1,234,
500.56, 300, 500,240€, and 50,050€, 5,050 and 5050,50 and 1.050€ and 1,050€ and test I €1,312.50, 500,50'''
extracted_numbers = extract_numbers_with_min_digits(test_data, 2)
print(extracted_numbers)

[12.0, 25.0, 50.5, 50000.0, 1214.0, 1234.0, 500.56, 300.0, 500240.0, 50050.0, 5050.0, 5050.0, 1050.0, 1312.5, 500.0]
