In [2]:
import csv
import os
import re
from collections import Counter

import pandas as pd

# Directory that is scanned for the tab-delimited .txt files to analyse
folder_path = "data/en_DW/treetagger_output/"

# Zero-based position of the column whose values are matched against the
# patterns below (1 = the second column of each file).
column_index = 1

# A hit is a row whose value matches first_term immediately followed by a
# row whose value matches second_term (both matched from the start of the value).
first_term = r"VBD|VBN"
second_term = r"VVN"

# Function to extract metadata (video_id and publish_date)
def extract_metadata(df):
    """Return the ``(video_id, publish_date)`` pair found in the first column of *df*.

    The first column is scanned top to bottom for cells containing
    ``<video_id>...</video_id>`` and ``<publish_date>...</publish_date>``.
    Scanning stops as soon as both values are known (non-empty).

    Args:
        df: DataFrame whose first column holds the text to scan. Cells that
            are not strings (e.g. NaN for missing values) are skipped.

    Returns:
        A ``(video_id, publish_date)`` tuple. An element is ``None`` when no
        complete tag pair for it was found.
    """
    video_id = None
    publish_date = None

    # A frame without columns has no first column to scan
    if df.shape[1] == 0:
        return video_id, publish_date

    # Only the first column is needed, so iterate over it directly instead of
    # materialising every row with iterrows()
    for cell in df.iloc[:, 0]:
        # Missing cells are NaN (a float), which supports neither `in` nor regex search
        if not isinstance(cell, str):
            continue

        # An opening tag without its closing tag on the same row simply does
        # not match; the previously found value (if any) is kept.
        id_match = re.search(r'<video_id>(.*?)</video_id>', cell)
        if id_match:
            video_id = id_match.group(1)

        date_match = re.search(r'<publish_date>(.*?)</publish_date>', cell)
        if date_match:
            publish_date = date_match.group(1)

        # Break out once both are found
        if video_id and publish_date:
            break

    return video_id, publish_date

# One result record (a dict) is appended here for every processed file
results = []

# Running totals across all files
total_passive_voice_count, total_rows = 0, 0

# Frequency tables across all files: the strings collected from the VVN rows,
# and the strings collected from the rows preceding the VBD/VBN rows
vvn_strings_counter, preceding_strings_counter = Counter(), Counter()

# Compile the tag patterns once instead of on every row
first_tag_pattern = re.compile(first_term)
second_tag_pattern = re.compile(second_term)

# Loop through all .txt files in the folder. The listing is sorted because
# os.listdir returns entries in arbitrary order; sorting makes the order of
# the results (and of ties in the frequency counters) reproducible.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):
        continue

    # Construct the full file path
    file_path = os.path.join(folder_path, file_name)

    # Read the tab-delimited file verbatim, with no headers:
    # - quoting=csv.QUOTE_NONE: a literal " in the data must not open a quoted
    #   field that swallows the following lines
    # - dtype=str / keep_default_na=False: values such as "NA" or "null" stay
    #   strings instead of being converted to NaN or numbers
    # - fillna(""): any cell that is still missing becomes an empty string,
    #   so every cell below is guaranteed to be a str
    df = pd.read_csv(
        file_path,
        delimiter="\t",
        header=None,
        names=["col1", "col2", "col3"],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        keep_default_na=False,
    ).fillna("")

    # Extract metadata (video_id, publish_date)
    video_id, publish_date = extract_metadata(df)

    # NOTE(review): the denominator counts every row of the file, including
    # any rows that only carry metadata markup — confirm this is intended.
    file_total_rows = len(df)

    # Pull the two columns used below into plain lists; per-cell DataFrame
    # lookups inside the loop are far slower than list indexing.
    tokens = df.iloc[:, 0].tolist()
    tags = df.iloc[:, column_index].tolist()

    vvn_strings = []  # First-column strings of the rows matching second_term
    preceding_strings = []  # First-column strings found before the VBD/VBN rows

    # Look at every pair of consecutive rows
    for i in range(file_total_rows - 1):
        current_value = tags[i]

        # A hit is a row matching first_term directly followed by a row
        # matching second_term (counted as passive voice)
        if not (first_tag_pattern.match(current_value) and second_tag_pattern.match(tags[i + 1])):
            continue

        vvn_strings.append(tokens[i + 1])

        # The preceding string sits 1 row before a VBD row and 2 rows before
        # a VBN row; it is skipped when that row would fall before the file start.
        if current_value.startswith("VBD"):
            rows_back = 1
        elif current_value.startswith("VBN"):
            rows_back = 2
        else:
            continue
        if i - rows_back >= 0:
            preceding_strings.append(tokens[i - rows_back])

    # Exactly one VVN string is collected per hit
    file_passive_voice_count = len(vvn_strings)

    # Calculate instances per million (IPM)
    file_relative_frequency_ipm = (file_passive_voice_count / file_total_rows) * 1_000_000 if file_total_rows > 0 else 0

    # Accumulate totals
    total_passive_voice_count += file_passive_voice_count
    total_rows += file_total_rows

    # Update the counters for VVN strings and preceding strings
    vvn_strings_counter.update(vvn_strings)
    preceding_strings_counter.update(preceding_strings)

    # Store the result for this file
    results.append({
        "video_id": video_id,
        "publish_date": publish_date,
        "passive_voice_count": file_passive_voice_count,
        "relative_frequency_ipm": file_relative_frequency_ipm,
        "vvn_strings": vvn_strings,
        "preceding_strings": preceding_strings,
    })

# Calculate total relative frequency in IPM (0 when no rows were read at all)
total_relative_frequency_ipm = (total_passive_voice_count / total_rows) * 1_000_000 if total_rows > 0 else 0

# Create a DataFrame to store the results. The columns are named explicitly so
# that they exist even when `results` is empty (no .txt file was processed);
# otherwise the column lookups below would raise KeyError.
result_columns = [
    "video_id",
    "publish_date",
    "passive_voice_count",
    "relative_frequency_ipm",
    "vvn_strings",
    "preceding_strings",
]
results_df = pd.DataFrame(results, columns=result_columns)

# Convert the lists of VVN strings and preceding strings into comma-separated
# strings for easier viewing. Items are passed through str() because
# str.join raises TypeError on any non-string item (e.g. NaN).
for list_column in ("vvn_strings", "preceding_strings"):
    results_df[list_column] = results_df[list_column].apply(lambda items: ', '.join(map(str, items)))

# Save the DataFrame to a CSV file
csv_output_path = "passive_voice_results_with_vvn_and_preceding_strings.csv"
results_df.to_csv(csv_output_path, index=False)  # This will include both VVN and preceding strings

# One summary line per processed file
for entry in results:
    report_fields = (
        f"Video ID: {entry['video_id']}",
        f"Publish Date: {entry['publish_date']}",
        f"Passive Voice Count: {entry['passive_voice_count']}",
        f"Relative Frequency (IPM): {entry['relative_frequency_ipm']:.2f}",
        f"VVN Strings: {entry['vvn_strings']}",
        f"Preceding Strings: {entry['preceding_strings']}",
    )
    print(", ".join(report_fields))

# Totals over all files
print(f"\nTotal Passive Voice Count: {total_passive_voice_count}")
print(f"Total Relative Frequency (IPM): {total_relative_frequency_ipm:.2f}")

# Ten most frequent entries of each counter, VVN strings first
top_lists = (
    ("\nTop 10 most common VVN strings and their frequencies:", vvn_strings_counter),
    ("\nTop 10 most common preceding strings and their frequencies:", preceding_strings_counter),
)
for heading, frequency_table in top_lists:
    print(heading)
    for token, frequency in frequency_table.most_common(10):
        print(f"{token}: {frequency}")

Video ID: _7NksAcG_l8, Publish Date: 20240714, Passive Voice Count: 5, Relative Frequency (IPM): 4901.96, VVN Strings: ['taken', 'impacted', 'killed', 'taken', 'bombed'], Preceding Strings: ['casualties', 'children', 'one', 'they', 'attack']
Video ID: RDLZkqqOaLM, Publish Date: 20240726, Passive Voice Count: 3, Relative Frequency (IPM): 1515.92, VVN Strings: ['glued', 'set', 'supposed'], Preceding Strings: ['stores', 'negotiations', 'That']
Video ID: cSMPo4XCSYU, Publish Date: 20231117, Passive Voice Count: 12, Relative Frequency (IPM): 3538.78, VVN Strings: ['prepared', 'released', 'hidden', 'confirmed', 'told', 'taken', 'taken', 'killed', 'killed', 'killed', 'found', 'butchered'], Preceding Strings: ['that', 'footage', 'says', "that's", 'north', '7th', 'she', 'people', 'Palestinians', 'that', 'that', 'that']
Video ID: fhwhQhoKVhk, Publish Date: 20231220, Passive Voice Count: 11, Relative Frequency (IPM): 3378.38, VVN Strings: ['made', 'killed', 'killed', 'killed', 'killed', 'circulat