In [14]:
import os
import re
import subprocess

def list_txt_files(directory):
    # Find all .txt files in the directory (non-recursively)
    txt_files = []
    for file in os.listdir(directory):
        file_path = os.path.join(directory, file)
        if os.path.isfile(file_path) and file.endswith('.txt'):
            txt_files.append(file_path)
    return txt_files
    
# Function to extract text between <doc> and </doc> tags
def extract_text_from_file(file_path, encoding='utf-8'):
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            content = file.read()
        # Use regular expressions to find the text between <doc> and </doc>
        #match = re.search(r'<doc>(.*?)</doc>', content, re.DOTALL)
        #if match:
        #    return match.group(1).strip()
        #else:
            return content
    except UnicodeDecodeError as e:
        print(f"Error reading file {file_path}: {e}")
        return None

# Function to write words to a new file with each word on a new line
def write_words_to_file(text, output_path, encoding='utf-8'):
    try:
        words = text.split()
        with open(output_path, 'w', encoding=encoding) as file:
            for word in words:
                file.write(word + '\n')
        print(f"Words have been successfully written to {output_path}")
    except UnicodeEncodeError as e:
        print(f"Error writing file {output_path}: {e}")

# Function to execute TreeTagger and process the output
def run_treetagger(input_path, output_path):
    treetagger_path = 'tree-tagger-windows-3.2.3a/TreeTagger/bin/tree-tagger.exe'  # Replace with the path to your TreeTagger executable
    parameter_path = 'tree-tagger-windows-3.2.3a/TreeTagger/lib/english.par'  # Replace with the path to your TreeTagger parameter file
    cmd = [treetagger_path, parameter_path, '-token', '-lemma', '-no-unknown', input_path]
    
    try:
        # Run TreeTagger and capture the output
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
        
        if result.returncode == 0:
            if result.stdout:  # Check if stdout is not None
                with open(output_path, 'w', encoding='utf-8') as file:
                    file.write(result.stdout)
                print(f"TreeTagger output has been written to {output_path}")
            else:
                print(f"No output from TreeTagger for file: {input_path}")
        else:
            print(f"Error running TreeTagger: {result.stderr}")
    
    except Exception as e:
        print(f"An exception occurred: {e}")

def process_files(directory):
    # Find all .txt files
    txt_files = list_txt_files(directory)
    
    for txt_file in txt_files:
        # Define file paths
        base_dir = os.path.dirname(txt_file)
        base_name = os.path.splitext(os.path.basename(txt_file))[0]
        temp_text_file_path = os.path.join(base_dir, f'{base_name}_words.txt')
        treetagger_output_dir = os.path.join(base_dir, 'treetagger_output')
        treetagger_output_path = os.path.join(treetagger_output_dir, f'{base_name}_treetagger_output.txt')

        # Create output directory if it doesn't exist
        if not os.path.exists(treetagger_output_dir):
            os.makedirs(treetagger_output_dir)
        
        # Extract text and write to new file
        text = extract_text_from_file(txt_file)
        if text:
            write_words_to_file(text, temp_text_file_path)
            print(f"Words have been written to {temp_text_file_path}")

            # Run TreeTagger
            run_treetagger(temp_text_file_path, treetagger_output_path)

            # Clean up temporary file
            os.remove(temp_text_file_path)
        else:
            print(f"No text found in <doc> tags for file: {txt_file}")


# Example usage
directory_path = 'data/en_DW/'  # Root directory containing subfolders with .txt files
process_files(directory_path)

Words have been successfully written to data/en_DW\'Chaos' & 'despair' at Gaza hospital after Israeli strike on 'safe zone' - UN eyewitness  DW News.en_words.txt
Words have been written to data/en_DW\'Chaos' & 'despair' at Gaza hospital after Israeli strike on 'safe zone' - UN eyewitness  DW News.en_words.txt
TreeTagger output has been written to data/en_DW\treetagger_output\'Chaos' & 'despair' at Gaza hospital after Israeli strike on 'safe zone' - UN eyewitness  DW News.en_treetagger_output.txt
Words have been successfully written to data/en_DW\'It is time for this war to end' Kamala Harris speaks after meeting Israeli PM Netanyahu  DW News.en_words.txt
Words have been written to data/en_DW\'It is time for this war to end' Kamala Harris speaks after meeting Israeli PM Netanyahu  DW News.en_words.txt
TreeTagger output has been written to data/en_DW\treetagger_output\'It is time for this war to end' Kamala Harris speaks after meeting Israeli PM Netanyahu  DW News.en_treetagger_output.tx

Exception in thread Thread-1376 (_readerthread):
Traceback (most recent call last):
  File "C:\Program Files\Python312\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "C:\Users\Lorenz\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "C:\Program Files\Python312\Lib\threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Program Files\Python312\Lib\subprocess.py", line 1597, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "C:\Program Files\Python312\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1686: character maps to <undefined>


TypeError: write() argument must be str, not None