In [4]:
import re
import pandas as pd
import os

In [2]:
def clean_text(text):
    """Replace Markdown inline links with their visible label.

    Every ``[label](target)`` occurrence in *text* is replaced by ``label``;
    all other characters are returned unchanged.

    Args:
        text: The string to process.

    Returns:
        A new string with the link markup removed.
    """
    # Group 1 is the bracketed label; the parenthesised target is discarded.
    link_pattern = re.compile(r'\[([^\]]+)\]\([^)]+\)')
    return link_pattern.sub(r'\1', text)

In [3]:
def clean_files_in_folder(folder_path):
    """Run ``clean_text`` over every ``.txt`` file directly inside *folder_path*.

    Each matching file is read as UTF-8, cleaned, and written back to the same
    path, so the original contents are overwritten. Only the folder's immediate
    entries are examined; subfolders are not descended into. One numbered
    progress line is printed per processed file.

    Args:
        folder_path: Directory whose ``.txt`` files should be cleaned in place.
    """
    processed = 0
    for entry in os.listdir(folder_path):
        target = os.path.join(folder_path, entry)

        # Guard clause: skip directories and anything that is not a .txt file.
        if not (os.path.isfile(target) and entry.endswith(".txt")):
            continue

        with open(target, 'r', encoding='utf-8') as source:
            original = source.read()

        # Clean before reopening for writing so a failure here cannot leave
        # the file truncated.
        stripped = clean_text(original)

        with open(target, 'w', encoding='utf-8') as sink:  # same path: in-place overwrite
            sink.write(stripped)

        print(f"{processed}  Cleaned: {entry}")
        processed += 1

In [6]:
# Strip Markdown link markup from every .txt file directly inside "train".
# The files are overwritten in place and subfolders are not visited.
clean_files_in_folder("train")

0  Cleaned: %C2%BFse-debe-independizar-catalu%C3%B1a-de-espa%C3%B1a-2964.txt
1  Cleaned: %E2%80%98asdaburys%E2%80%99---is-the-asdasainsburys-merger-good-for-the-uk%E2%80%99s-pharmaceutical-and-optometry-landscape-14766.txt
2  Cleaned: 3d-printer-and-guns-should-blueprints-of-3d-printed-weapons-be-prohibited-17593.txt
3  Cleaned: a-bar-of-soap-is-better-than-a-bottle-of-shower-gel-21205.txt
4  Cleaned: a-childs-primary-carer-should-receive-a-wage-until-the-child-enters-primary-school-or-some-other-form-of-care-17763.txt
5  Cleaned: a-flat-asset-tax-is-all-the-tax-we-should-ever-pay-16974.txt
6  Cleaned: a-free-press-is-necessary-to-democracy-8559.txt
7  Cleaned: a-permanent-venue-for-the-olympic-games-1335.txt
8  Cleaned: a-society-with-no-gender-would-be-better-16617.txt
9  Cleaned: a-united-ireland-is-the-best-political-structure-for-northern-ireland-18183.txt
10  Cleaned: addressing-psychosocial-factors-is-essential-to-reducing-or-preventing-school-shootings-11784.txt
11  Cleaned: af

In [9]:
def parse_arguments(filename):
    """Parse a numbered Pro/Con argument file into (parent, opinion, label) rows.

    Only lines of the form ``<num>. Pro: <text>`` or ``<num>. Con: <text>`` are
    considered, where ``<num>`` is a dotted number such as ``1.2.3``. A claim's
    parent is the claim whose number is ``<num>`` with its last component
    removed (``1.2.3`` -> ``1.2``). A claim whose text contains
    ``-> See <num>`` is a reference: it borrows the text of the claim it points
    to but keeps its own parent and its own Pro/Con label.

    Args:
        filename: Path of a UTF-8 text file to parse.

    Returns:
        A list of ``[parent_text, opinion_text, label]`` lists, with ``label``
        being 1 for Pro and 0 for Con. Claims whose parent line was not parsed
        (or has empty text) and references whose target cannot be resolved are
        omitted. Plain claims come first in file order, followed by reference
        claims in file order.
    """
    # NOTE(review): a root statement line without a Pro/Con tag is not parsed,
    # so first-level claims have no parent and are omitted -- confirm intended.
    claim_pattern = re.compile(r'(\d+(?:\.\d+)*)\.\s*(Pro|Con):\s*(.*)')
    reference_pattern = re.compile(r'-> See (\d+(?:\.\d+)*)')

    # num -> (label, text, target). ``target`` is None for a plain claim,
    # otherwise the number of the claim whose text this entry reuses.
    nodes = {}
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            match = claim_pattern.match(line)
            if not match:
                continue
            num, tag, opinion = match.groups()
            ref_match = reference_pattern.search(opinion)
            target = ref_match.group(1) if ref_match else None
            nodes[num] = (1 if tag == 'Pro' else 0, opinion, target)

    def resolve_text(num):
        """Return the text for *num*, following references; None if unresolved."""
        visited = set()
        while num in nodes and num not in visited:
            visited.add(num)
            _, text, target = nodes[num]
            if target is None:
                return text
            num = target
        return None  # unknown number, dangling target, or circular references

    # Stable sort keeps file order within each group: plain claims, then references.
    ordered = sorted(nodes, key=lambda n: nodes[n][2] is not None)

    data = []
    for num in ordered:
        opinion = resolve_text(num)
        # Look the parent up by number prefix instead of by position in a
        # stack, so siblings never inherit the previous sibling as parent.
        parent = resolve_text(num.rpartition('.')[0])
        if opinion is None or not parent:
            continue
        data.append([parent, opinion, nodes[num][0]])

    return data

In [10]:
def process_folder(folder_path, output_file):
    """Parse every ``.txt`` file under *folder_path* and save the rows as CSV.

    The folder is walked recursively. Each ``.txt`` file is handed to
    ``parse_arguments`` and the rows it returns are concatenated into a single
    table with the columns ``Anchor Point``, ``Opinions`` and ``Label``, which
    is written to *output_file* without an index column.

    Args:
        folder_path: Root directory to search for ``.txt`` files.
        output_file: Path of the CSV file to write.
    """
    rows = []
    for directory, _subdirs, names in os.walk(folder_path):
        for name in names:
            if not name.endswith(".txt"):  # only text files are parsed
                continue
            rows += parse_arguments(os.path.join(directory, name))

    frame = pd.DataFrame(rows, columns=["Anchor Point", "Opinions", "Label"])
    frame.to_csv(output_file, index=False)
    print(f"Formatted output saved to {output_file}")


In [11]:
# Root folder searched for .txt files; process_folder walks it recursively,
# so files in nested subfolders are included as well.
folder_path = "train"
# Destination CSV that receives the parsed rows.
output_file = "formatted_output3.0.csv"
process_folder(folder_path, output_file)

Formatted output saved to formatted_output3.0.csv
