In [1]:
import os
import re
import subprocess

def list_txt_files(directory):
    """Return the paths of the ``.txt`` files located directly in ``directory``.

    Subdirectories are not descended into. Every returned path is
    ``directory`` joined with the entry name, in the order in which
    ``os.listdir`` yields the entries.
    """
    candidate_paths = (
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith('.txt')
    )
    # Keep regular files only; a directory whose name ends in '.txt' is skipped.
    return [path for path in candidate_paths if os.path.isfile(path)]
    
# Function to read the complete text of a file
def extract_text_from_file(file_path, encoding='utf-8'):
    """Read ``file_path`` as text and return its entire content.

    Despite the name, no ``<doc>``/``</doc>`` extraction is performed: the
    whole decoded file is returned unchanged.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file (default ``'utf-8'``).

    Returns:
        The file content as a string, or ``None`` when the bytes cannot be
        decoded with ``encoding`` (an error message is printed in that case).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as source:
            content = source.read()
    except UnicodeDecodeError as e:
        print(f"Error reading file {file_path}: {e}")
        return None
    return content

# Function to tokenize text into words and punctuation
def tokenize(text):
    """Split ``text`` into a list of word and punctuation tokens.

    A token is either a maximal run of word characters (letters, digits and
    underscores) or one single character that is neither a word character
    nor whitespace. Whitespace itself is discarded.
    """
    token_pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return token_pattern.findall(text)

# Function to write tokens to a new file with each token on a new line
def write_words_to_file(text, output_path, encoding='utf-8'):
    """Tokenize ``text`` and write the tokens to ``output_path``, one per line.

    Args:
        text: The text to tokenize with ``tokenize``.
        output_path: Destination file; it is created or overwritten.
        encoding: Text encoding of the output file (default ``'utf-8'``).

    A confirmation message is printed on success. A ``UnicodeEncodeError``
    raised while writing is caught and reported with a printed message
    instead of being propagated.
    """
    try:
        tokens = tokenize(text)
        with open(output_path, 'w', encoding=encoding) as sink:
            # Every token is terminated by a newline, including the last one.
            sink.writelines(token + '\n' for token in tokens)
        print(f"Tokens have been successfully written to {output_path}")
    except UnicodeEncodeError as e:
        print(f"Error writing file {output_path}: {e}")

# Function to execute TreeTagger and process the output
def run_treetagger(input_path, output_path,
                   treetagger_path='tree-tagger-windows-3.2.3a/TreeTagger/bin/tree-tagger.exe',
                   parameter_path='tree-tagger-windows-3.2.3a/TreeTagger/lib/english.par'):
    """Run the TreeTagger executable on ``input_path`` and save its stdout.

    The command executed is
    ``treetagger_path parameter_path -token -lemma -no-unknown input_path``.

    Args:
        input_path: File handed to TreeTagger as its input.
        output_path: File that receives TreeTagger's standard output
            (written as UTF-8). It is only created when the process exits
            with status 0 and produced non-empty output.
        treetagger_path: Path to the TreeTagger executable. Defaults to the
            bundled Windows build, relative to the working directory.
        parameter_path: Path to the TreeTagger parameter (language model)
            file. Defaults to the bundled English parameter file.

    Returns:
        None. Progress and failures are reported with printed messages; any
        exception (for example a missing executable) is caught and printed
        rather than propagated, so one failing file does not abort a batch.
    """
    cmd = [treetagger_path, parameter_path, '-token', '-lemma', '-no-unknown', input_path]

    try:
        # Capture stdout/stderr as text, decoded as UTF-8.
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')

        if result.returncode != 0:
            print(f"Error running TreeTagger: {result.stderr}")
        elif result.stdout:
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(result.stdout)
            print(f"TreeTagger output has been written to {output_path}")
        else:
            # Successful exit but nothing on stdout: leave no output file.
            print(f"No output from TreeTagger for file: {input_path}")

    except Exception as e:
        # Deliberately broad: this is a best-effort batch step.
        print(f"An exception occurred: {e}")

def _derive_output_name(txt_file, suffix):
    """Return the base name of ``txt_file`` with its trailing '.txt' replaced by ``suffix``."""
    base_name = os.path.basename(txt_file)
    # Strip only the final extension; str.replace would also rewrite any
    # '.txt' that happens to occur earlier in the file name.
    if base_name.endswith('.txt'):
        base_name = base_name[:-len('.txt')]
    return base_name + suffix


def process_files(directory):
    """Tokenize and tag every ``.txt`` file found directly in ``directory``.

    For each file returned by ``list_txt_files``:

    1. a ``treetagger_output`` folder is created next to the file if needed;
    2. the file's text is tokenized into a temporary ``<name>_words.txt``
       file inside that folder;
    3. TreeTagger is run on the temporary file and its output is saved as
       ``<name>_treetagger_output.txt`` in the same folder;
    4. the temporary token file is deleted, even if a step above raised.

    Files for which ``extract_text_from_file`` returns nothing (an empty
    file or one that could not be decoded) are skipped with a message.

    Args:
        directory: Folder whose ``.txt`` files are processed.
    """
    for txt_file in list_txt_files(directory):
        # Ensure the directory for TreeTagger output exists
        treetagger_output_dir = os.path.join(os.path.dirname(txt_file), 'treetagger_output')
        os.makedirs(treetagger_output_dir, exist_ok=True)

        # Define paths
        temp_text_file_path = os.path.join(
            treetagger_output_dir, _derive_output_name(txt_file, '_words.txt'))
        treetagger_output_path = os.path.join(
            treetagger_output_dir, _derive_output_name(txt_file, '_treetagger_output.txt'))

        text = extract_text_from_file(txt_file)
        if not text:
            print(f"No text found in <doc> tags for file: {txt_file}")
            continue

        try:
            # Write one token per line, then tag that file
            write_words_to_file(text, temp_text_file_path)
            print(f"Words have been written to {temp_text_file_path}")
            run_treetagger(temp_text_file_path, treetagger_output_path)
        finally:
            # Always clean up the temporary file, including when a step
            # above raised; it may not exist if writing failed early.
            if os.path.exists(temp_text_file_path):
                os.remove(temp_text_file_path)


# Example usage
directory_path = 'data/en_BBCNews/'  # Directory whose .txt files are processed (list_txt_files does not descend into subfolders)
process_files(directory_path)

Tokens have been successfully written to data/en_BBCNews\treetagger_output\'gaza children killed as they slept' in un-run school - bbc news_words.txt
Words have been written to data/en_BBCNews\treetagger_output\'gaza children killed as they slept' in un-run school - bbc news_words.txt
TreeTagger output has been written to data/en_BBCNews\treetagger_output\'gaza children killed as they slept' in un-run school - bbc news_treetagger_output.txt
Tokens have been successfully written to data/en_BBCNews\treetagger_output\'hamas has destroyed the chance of humanitarian aid for gaza' says mark regev - bbc news_words.txt
Words have been written to data/en_BBCNews\treetagger_output\'hamas has destroyed the chance of humanitarian aid for gaza' says mark regev - bbc news_words.txt
TreeTagger output has been written to data/en_BBCNews\treetagger_output\'hamas has destroyed the chance of humanitarian aid for gaza' says mark regev - bbc news_treetagger_output.txt
Tokens have been successfully written 