In [30]:
import csv
import os
import re

import pandas as pd

# Folder scanned for input files; only entries whose name ends in ".txt" are
# processed by the loop further down.
folder_path = "data/en_DW/treetagger_output/"

# Search settings used when counting matches in each file.
column_index = 2  # 0-based position of the column to search (2 = third column)
# Term counted in that column; it is matched case-insensitively anywhere
# within a cell.
# NOTE(review): the folder name suggests TreeTagger output, where the third
# column would presumably be the lemma — confirm against the input files.
term_to_search = 'Hamas'

# Function to extract metadata (video_id and publish_date)
def extract_metadata(df):
    """Return the ``(video_id, publish_date)`` found in the first column of *df*.

    The first column is scanned top to bottom for cells containing
    ``<video_id>...</video_id>`` and ``<publish_date>...</publish_date>``.
    Scanning stops as soon as both values have been found.

    Args:
        df: DataFrame whose first column may contain the metadata tag lines.

    Returns:
        A ``(video_id, publish_date)`` tuple. Each element is the text between
        the corresponding tags, or ``None`` when the tag is absent or
        malformed (e.g. missing its closing tag).
    """
    video_id = None
    publish_date = None

    # A frame without any columns has nothing to scan.
    if df.shape[1] == 0:
        return video_id, publish_date

    # Only the first column is inspected, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for cell in df.iloc[:, 0]:
        # Blank fields are loaded as NaN/None; only strings can hold a tag.
        if not isinstance(cell, str):
            continue

        # Require the complete open/close pair so an unclosed tag is ignored
        # rather than crashing on ``None.group(1)``.
        id_match = re.search(r'<video_id>(.*?)</video_id>', cell)
        if id_match:
            video_id = id_match.group(1)

        date_match = re.search(r'<publish_date>(.*?)</publish_date>', cell)
        if date_match:
            publish_date = date_match.group(1)

        # Break out once both are found
        if video_id and publish_date:
            break

    return video_id, publish_date

# List collecting one result record (a dict) per processed file
results = []

# Variables to accumulate totals across all files
total_term_count = 0
total_rows = 0

# Loop through all .txt files in the folder. The listing is sorted because
# os.listdir() returns entries in arbitrary order, which would make the
# report order differ between runs and machines.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):
        continue

    # Construct the full file path
    file_path = os.path.join(folder_path, file_name)

    # Read the tab-delimited file with no header row and assign column names.
    # - quoting=csv.QUOTE_NONE: quote characters are ordinary tokens in tagged
    #   text; with the default quoting a lone '"' opens a quoted field and
    #   swallows the following lines, corrupting row counts and matches.
    # - dtype=str: keep every column textual so the .str accessor below also
    #   works when a column is entirely empty (otherwise inferred as float).
    try:
        df = pd.read_csv(
            file_path,
            delimiter="\t",
            header=None,
            names=["col1", "col2", "col3"],
            dtype=str,
            quoting=csv.QUOTE_NONE,
        )
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to count; skip it instead of aborting
        # the whole run.
        print(f"Skipping empty file: {file_name}")
        continue

    # Extract metadata (video_id, publish_date)
    video_id, publish_date = extract_metadata(df)

    # Total number of rows in the file.
    # NOTE(review): this counts every row read from the file, including any
    # metadata tag lines, so they are part of the frequency denominator.
    file_total_rows = len(df)

    # Count the rows whose searched column contains the term. regex=False
    # treats the term as literal text, so terms containing characters such as
    # '.' or '(' are matched verbatim; missing cells never match (na=False).
    term_matches = df.iloc[:, column_index].str.contains(
        term_to_search, case=False, na=False, regex=False
    )
    file_term_count = int(term_matches.sum())

    # Calculate relative frequency as a percentage
    file_relative_frequency = (file_term_count / file_total_rows) * 100 if file_total_rows > 0 else 0

    # Accumulate totals
    total_term_count += file_term_count
    total_rows += file_total_rows

    # Store the result in the list
    results.append({
        "video_id": video_id,
        "publish_date": publish_date,
        "term_count": file_term_count,
        "relative_frequency": file_relative_frequency,
    })

# Calculate total relative frequency over all rows of all files
total_relative_frequency = (total_term_count / total_rows) * 100 if total_rows > 0 else 0

# Print out the results for each file
for result in results:
    print(f"Video ID: {result['video_id']}, "
          f"Publish Date: {result['publish_date']}, {term_to_search} Count: {result['term_count']}, "
          f"Relative Frequency: {result['relative_frequency']:.2f}%")

# Print total counts and frequencies
print(f"\nTotal {term_to_search} Count: {total_term_count}")
print(f"Total Relative Frequency: {total_relative_frequency:.2f}%")

Video ID: _7NksAcG_l8, Publish Date: 20240714, Hamas Count: 3, Relative Frequency: 0.29%
Video ID: RDLZkqqOaLM, Publish Date: 20240726, Hamas Count: 3, Relative Frequency: 0.15%
Video ID: cSMPo4XCSYU, Publish Date: 20231117, Hamas Count: 17, Relative Frequency: 0.50%
Video ID: fhwhQhoKVhk, Publish Date: 20231220, Hamas Count: 15, Relative Frequency: 0.46%
Video ID: QG9QSHEgX6E, Publish Date: 20231107, Hamas Count: 11, Relative Frequency: 0.41%
Video ID: l5pL2DhnjEE, Publish Date: 20231121, Hamas Count: 8, Relative Frequency: 0.38%
Video ID: o0arAFe0WmE, Publish Date: 20231103, Hamas Count: 21, Relative Frequency: 0.67%
Video ID: FYck1QG52I0, Publish Date: 20231019, Hamas Count: 7, Relative Frequency: 0.73%
Video ID: xSgxVYaNfXg, Publish Date: 20231120, Hamas Count: 11, Relative Frequency: 0.47%
Video ID: 7qxY2fjvGhE, Publish Date: 20231027, Hamas Count: 12, Relative Frequency: 1.16%
Video ID: lH3jvaO1wXc, Publish Date: 20240221, Hamas Count: 1, Relative Frequency: 0.13%
Video ID: we2YB