In [2]:
import os
import re
import subprocess
import json
import pandas as pd

# Function to extract metadata and content between <doc> and </doc> tags, including video_title
def extract_metadata_and_content(df):
    """Find the metadata tags and the <doc>...</doc> boundaries in a line-per-row frame.

    Only the first column of ``df`` is scanned, top to bottom, and scanning
    stops at the first row containing ``</doc>``.

    Args:
        df: DataFrame whose first column holds one line of text per row.

    Returns:
        A tuple ``(video_id, publish_date, video_title, content_start, content_end)``.

        * The three metadata values are the text found between the matching
          opening and closing tag on a single row, or ``None`` when no row
          carries a complete ``<tag>...</tag>`` pair.
        * ``content_start`` is the 0-based row position just after the last
          ``<doc>`` row seen, ``content_end`` the position of the ``</doc>``
          row; both are meant for ``df.iloc[content_start:content_end]``.
          Either is ``None`` when its tag was never encountered.
    """
    metadata = {'video_id': None, 'publish_date': None, 'video_title': None}
    content_start = None
    content_end = None

    # A frame built from an empty file has no column at all, so there is
    # nothing to scan and every result stays None.
    if df.shape[1] == 0:
        return (metadata['video_id'], metadata['publish_date'],
                metadata['video_title'], content_start, content_end)

    # Fill NaN values with an empty string to prevent issues with `in` checks
    df = df.fillna('')

    # enumerate() yields row *positions*; the caller slices with .iloc, so
    # index labels would be wrong for any frame without a default RangeIndex.
    for position, cell in enumerate(df.iloc[:, 0]):
        row_content = str(cell)

        for tag in metadata:
            if f'<{tag}>' not in row_content:
                continue
            match = re.search(rf'<{tag}>(.*?)</{tag}>', row_content)
            # An opening tag without its closing tag on the same row gives no
            # match; keep whatever value was found earlier instead of crashing.
            if match:
                metadata[tag] = match.group(1)

        if '<doc>' in row_content:
            content_start = position + 1  # Start after <doc> tag
        if '</doc>' in row_content:
            content_end = position  # End before </doc> tag
            break  # Exit loop once end tag is found

    return (metadata['video_id'], metadata['publish_date'],
            metadata['video_title'], content_start, content_end)

def list_txt_files(directory):
    """Return the paths of the ``.txt`` files that sit directly in ``directory``.

    The listing is not recursive. Each returned path is ``directory`` joined
    with the entry name, in the order ``os.listdir`` reports the entries.
    Entries that are not regular files (for example a folder named
    ``foo.txt``) are left out.
    """
    txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]
    txt_paths = [os.path.join(directory, name) for name in txt_names]
    return [path for path in txt_paths if os.path.isfile(path)]

# Function to extract text from file and return DataFrame (for metadata extraction)
def extract_text_from_file(file_path, encoding='utf-8'):
    """Load a text file into a single-column DataFrame, one line per row.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file (default ``utf-8``).

    Returns:
        A DataFrame built from the list of lines (line endings are kept), or
        ``None`` after printing a message when the file cannot be decoded
        with ``encoding``. Other I/O errors are not caught here.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as handle:
            lines = list(handle)  # Iterating the handle yields the file line by line
    except UnicodeDecodeError as e:
        print(f"Error reading file {file_path}: {e}")
        return None

    return pd.DataFrame(lines)

# Function to tokenize text into words and punctuation
def tokenize(text):
    """Split ``text`` into word tokens and individual punctuation marks.

    A token is either a maximal run of word characters (``\\w+``) or a single
    character that is neither a word character nor whitespace. Whitespace
    itself never appears in the result.

    Returns:
        The tokens as a list of strings, in order of appearance.
    """
    return [found.group(0) for found in re.finditer(r'\w+|[^\w\s]', text, re.UNICODE)]

# Function to write tokens to a new file with each token on a new line
def write_words_to_file(text, output_path, encoding='utf-8'):
    """Tokenize ``text`` and write the tokens to ``output_path``, one per line.

    Args:
        text: Text to split with ``tokenize``.
        output_path: File to create or overwrite.
        encoding: Encoding used for the output file (default ``utf-8``).

    A confirmation is printed on success. If a token cannot be encoded, the
    error is printed instead of raised.
    """
    try:
        # Tokenize before opening so a tokenizer failure never truncates the target
        token_lines = [token + '\n' for token in tokenize(text)]
        with open(output_path, 'w', encoding=encoding) as sink:
            sink.writelines(token_lines)
        print(f"Tokens have been successfully written to {output_path}")
    except UnicodeEncodeError as e:
        print(f"Error writing file {output_path}: {e}")

# Function to execute TreeTagger and process the output
def run_treetagger(
    input_path,
    output_path,
    treetagger_path='tree-tagger-windows-3.2.3a/TreeTagger/bin/tree-tagger.exe',
    parameter_path='tree-tagger-windows-3.2.3a/TreeTagger/lib/english.par',
):
    """Run TreeTagger on a one-token-per-line file and save what it prints.

    Args:
        input_path: File handed to TreeTagger as its input.
        output_path: File that receives TreeTagger's standard output.
        treetagger_path: TreeTagger executable to launch. Defaults to the
            bundled Windows build relative to the working directory.
        parameter_path: TreeTagger parameter file. Defaults to the bundled
            English model relative to the working directory.

    Returns:
        ``True`` when TreeTagger exited with status 0, produced output, and
        that output was written to ``output_path``; ``False`` otherwise. On
        failure ``output_path`` is not written and a message is printed;
        nothing is raised.
    """
    # Option set is unchanged from the original call; the parser in this file
    # expects three tab-separated fields per line (token, tag, lemma).
    # NOTE(review): exact flag semantics come from the TreeTagger manual - confirm there.
    cmd = [treetagger_path, parameter_path, '-token', '-lemma', '-no-unknown', input_path]

    try:
        # Run TreeTagger and capture the output as UTF-8 text
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')

        if result.returncode != 0:
            print(f"Error running TreeTagger: {result.stderr}")
            return False

        # With capture_output=True stdout is a (possibly empty) string
        if not result.stdout:
            print(f"No output from TreeTagger for file: {input_path}")
            return False

        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(result.stdout)
        print(f"TreeTagger output has been written to {output_path}")
        return True

    except Exception as e:
        # Best-effort by design: a missing executable or an undecodable
        # output is reported instead of aborting the whole batch.
        print(f"An exception occurred: {e}")
        return False

# Function to parse TreeTagger output into structured format (Token, Tag, Lemma)
def parse_treetagger_output(tt_output_path):
    """Turn a TreeTagger output file into a list of token records.

    Every line is stripped of surrounding whitespace and split on tabs. Lines
    that yield exactly three fields become
    ``{"Token": ..., "Tag": ..., "Lemma": ...}``; all other lines are skipped.

    Args:
        tt_output_path: Path of the UTF-8 text file to parse.

    Returns:
        The records in file order. If the file cannot be opened or read, the
        error is printed and whatever was collected so far (normally an
        empty list) is returned.
    """
    field_names = ("Token", "Tag", "Lemma")
    records = []
    try:
        with open(tt_output_path, 'r', encoding='utf-8') as handle:
            # Read everything up front so a read error leaves `records` empty
            raw_lines = handle.readlines()
            for raw_line in raw_lines:
                fields = raw_line.strip().split('\t')
                if len(fields) != len(field_names):
                    continue
                records.append(dict(zip(field_names, fields)))
    except Exception as e:
        print(f"Error parsing TreeTagger output: {e}")

    return records

# Function to save the metadata and structured TreeTagger output as JSON
def save_as_json(metadata, treetagger_data, json_output_path):
    """Write the video metadata and the tagged tokens to one JSON file.

    Args:
        metadata: Mapping that provides ``video_id``, ``publish_date`` and
            ``video_title``; any other keys are ignored.
        treetagger_data: Structured TreeTagger records, stored under
            ``treetagger_output``.
        json_output_path: Destination file, written as UTF-8 with a 4-space
            indent and non-ASCII characters kept as-is.

    Any failure (a missing metadata key, an unserialisable value, an I/O
    error) is printed rather than raised.
    """
    try:
        # Built inside the try so a missing key is reported like any other failure
        payload = {
            field: metadata[field]
            for field in ('video_id', 'publish_date', 'video_title')
        }
        payload['treetagger_output'] = treetagger_data

        with open(json_output_path, 'w', encoding='utf-8') as sink:
            json.dump(payload, sink, ensure_ascii=False, indent=4)
        print(f"Data has been successfully written to {json_output_path}")
    except Exception as e:
        print(f"Error saving JSON file: {e}")

# Function to process files and extract metadata, tokenize, and run TreeTagger
def _process_txt_file(txt_file):
    """Tag one transcript file and write its JSON under ``treetagger_output/``."""
    # Ensure the directory for TreeTagger output exists
    treetagger_output_dir = os.path.join(os.path.dirname(txt_file), 'treetagger_output')
    os.makedirs(treetagger_output_dir, exist_ok=True)

    # Strip only the extension: str.replace('.txt', ...) would also rewrite a
    # '.txt' that happens to occur earlier in the file name.
    stem = os.path.splitext(os.path.basename(txt_file))[0]
    temp_text_file_path = os.path.join(treetagger_output_dir, stem + '_words.txt')
    treetagger_output_path = os.path.join(treetagger_output_dir, stem + '_treetagger_output.txt')
    json_output_path = os.path.join(treetagger_output_dir, stem + '_output.json')

    # Extract text and metadata
    df = extract_text_from_file(txt_file)
    if df is None:
        print(f"Error processing file: {txt_file}")
        return

    video_id, publish_date, video_title, content_start, content_end = extract_metadata_and_content(df)
    if content_start is None or content_end is None:
        print(f"No content found between <doc> tags in file: {txt_file}")
        return

    # Extract the content between <doc> and </doc>
    content = ' '.join(df.iloc[content_start:content_end][0].tolist())

    # run_treetagger leaves the output file untouched when tagging fails, so a
    # result from a previous run must not survive to be parsed as this run's data.
    if os.path.exists(treetagger_output_path):
        os.remove(treetagger_output_path)

    try:
        write_words_to_file(content, temp_text_file_path)
        run_treetagger(temp_text_file_path, treetagger_output_path)
    finally:
        # Clean up the temporary token file even if a step above raised
        if os.path.exists(temp_text_file_path):
            os.remove(temp_text_file_path)

    # No output file means TreeTagger failed or printed nothing; the JSON is
    # still written, with an empty token list.
    if os.path.exists(treetagger_output_path):
        treetagger_data = parse_treetagger_output(treetagger_output_path)
    else:
        treetagger_data = []

    # Save metadata and structured TreeTagger output as JSON
    metadata = {
        'video_id': video_id,
        'publish_date': publish_date,
        'video_title': video_title
    }
    save_as_json(metadata, treetagger_data, json_output_path)


def process_files(directory):
    """Tag every ``.txt`` transcript found directly in ``directory``.

    For each file the text between ``<doc>`` and ``</doc>`` is tokenized, run
    through TreeTagger, and stored together with the file's ``video_id``,
    ``publish_date`` and ``video_title`` as ``<name>_output.json`` inside a
    ``treetagger_output`` folder next to the source file. The raw tagger
    output is kept there as ``<name>_treetagger_output.txt``; the intermediate
    ``<name>_words.txt`` is deleted afterwards.

    Files that cannot be decoded or that lack a complete ``<doc>`` section are
    reported with a printed message and skipped.

    Args:
        directory: Folder to scan (not recursively) for ``.txt`` files.
    """
    for txt_file in list_txt_files(directory):
        _process_txt_file(txt_file)

# Example usage
directory_path = 'data/en_DW/'  # Directory whose .txt files are processed; the listing is non-recursive, so files in subfolders are not picked up
process_files(directory_path)


Tokens have been successfully written to data/en_DW\treetagger_output\'chaos' & 'despair' at gaza hospital after israeli strike on 'safe zone' - un eyewitness  dw news_words.txt
TreeTagger output has been written to data/en_DW\treetagger_output\'chaos' & 'despair' at gaza hospital after israeli strike on 'safe zone' - un eyewitness  dw news_treetagger_output.txt
Data has been successfully written to data/en_DW\treetagger_output\'chaos' & 'despair' at gaza hospital after israeli strike on 'safe zone' - un eyewitness  dw news_output.json
Tokens have been successfully written to data/en_DW\treetagger_output\'it is time for this war to end' kamala harris speaks after meeting israeli pm netanyahu  dw news_words.txt
TreeTagger output has been written to data/en_DW\treetagger_output\'it is time for this war to end' kamala harris speaks after meeting israeli pm netanyahu  dw news_treetagger_output.txt
Data has been successfully written to data/en_DW\treetagger_output\'it is time for this war t