# Frequency script
This notebook is used to extract the frequency for a list of search terms.

## Setting up

### import packages

In [None]:
import os
import pandas as pd
import re

### define search terms
The search terms for the frequency analysis are located in a search mask.
This is currently located in a CSV file with the following structure:

| Token | Tag | Lemma |
|-------|-----|-------|

To search for a specific token (e.g. "Palestinians"), place the search term in the Token column; to search for a specific tag (e.g. "VVN"), place it in the Tag column; and to search for a lemma, place it in the Lemma column.
Currently, filling in more than one column for the same search does not work.
Searching for several words in one cell does not seem to be properly implemented yet.

In [None]:
# Path to the CSV file containing search terms
input_csv_path = "freqency_en_input.csv"

# Read every cell as text: without dtype=str pandas turns numeric-looking
# terms (e.g. "2023") into numbers, which the main loop cannot treat as
# strings. Empty cells are still read as NaN.
search_terms_df = pd.read_csv(input_csv_path, dtype=str)

# The main loop reads exactly these three columns from every row, so fail
# here with a clear message instead of with a KeyError in the middle of a run.
missing_columns = [name for name in ("Token", "Tag", "Lemma") if name not in search_terms_df.columns]
if missing_columns:
    raise ValueError(f"{input_csv_path} is missing required column(s): {', '.join(missing_columns)}")

### define the folders
The folders listed in `folder_paths` will be searched for the search terms.

In [None]:
# Folders to process: one TreeTagger output directory per channel.
# Add or remove a channel name in the tuple to change what gets searched.
folder_paths = [
    f"data/{channel}/treetagger_output/"
    for channel in ("en_BBCNews", "en_CNN", "en_DW", "en_AJ")
]

### create output folder
A .csv file with the frequency results will be saved here.

In [None]:
# Directory that receives the frequency CSV files. It is created when
# missing and left untouched when it already exists.
output_directory = "frequency_data"
os.makedirs(name=output_directory, exist_ok=True)

### Count total files
To keep track of how far along the script is, we count `processed_files` and compare it to `total_files`.

In [None]:
# Number of .txt files across all configured folders; the main loop reports
# its progress as processed_files / total_files.
total_files = sum(
    1
    for folder in folder_paths
    for name in os.listdir(folder)
    if name.endswith(".txt")
)
processed_files = 0

### Initiate cleaning function
In some cases the txt structure contains an absurd combination of different quotation marks, which will break the script. If that happens, and only then, we preprocess the offending file to deal with the quotation marks.

In [None]:
def preprocess_file(file_path):
    """Write a quote-normalised copy of a text file and return the copy's path.

    Lines containing an even number of straight double quotes are copied
    unchanged. In every other line, curly double quotes and double backticks
    are replaced with a straight double quote.

    Args:
        file_path: Path of the file to clean. It is read as UTF-8 and
            undecodable bytes are dropped.

    Returns:
        Path of the cleaned copy: same directory and extension as
        ``file_path``, with ``_cleaned`` appended to the base name
        (``a/b.txt`` becomes ``a/b_cleaned.txt``).
    """
    cleaned_lines = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            if line.count('"') % 2 == 0:
                # Balanced (or no) straight quotes: keep the line as it is.
                cleaned_lines.append(line)
            else:
                # An odd count means the line holds at least one quote, so it
                # is never blank here. Normalise the unusual quote characters.
                cleaned_line = line.replace('“', '"').replace('”', '"')
                cleaned_line = cleaned_line.replace('``', '"')
                cleaned_lines.append(cleaned_line)

    # Build the output name from the extension only. A plain
    # str.replace(".txt", ...) would also rewrite ".txt" inside directory
    # names, and would return the input path itself (overwriting the source)
    # when the path does not contain ".txt" at all.
    base_path, extension = os.path.splitext(file_path)
    cleaned_file_path = f"{base_path}_cleaned{extension}"
    with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
        cleaned_file.writelines(cleaned_lines)

    return cleaned_file_path

### initiate function for metadata and content
The metadata is located at the beginning of the txt files, while the actual content is between the `<doc>` and `</doc>` tags.
Currently, words in the titles of videos are not analyzed.
There could be a switch for that (?)

In [None]:
def extract_metadata_and_content(df):
    """Find the metadata tags and the <doc> ... </doc> span of a parsed file.

    Only the first column of ``df`` is inspected. Rows are scanned from the
    top and scanning stops at the first row that contains ``</doc>``.

    Args:
        df: DataFrame holding one parsed file. Missing values in the first
            column are treated as empty strings.

    Returns:
        Tuple ``(video_id, publish_date, content_start, content_end)``:
        ``video_id`` and ``publish_date`` are the text between the matching
        opening and closing tags, ``content_start`` is the row position right
        after the ``<doc>`` row and ``content_end`` is the row position of the
        ``</doc>`` row, so ``df.iloc[content_start:content_end]`` is the
        content. Any element is ``None`` when its tag was not found.
    """
    video_id = None
    publish_date = None
    content_start = None
    content_end = None

    # Nothing to scan in a frame without columns.
    if df.shape[1] == 0:
        return video_id, publish_date, content_start, content_end

    # Only column 0 is read, so iterate over it directly instead of building
    # a Series per row with iterrows(). NaN becomes '' so the `in` checks work.
    first_column = df.iloc[:, 0].fillna('').astype(str)

    # Row positions (not index labels) are returned because the result is
    # meant for positional slicing.
    for position, row_content in enumerate(first_column):
        if '<video_id>' in row_content:
            # A missing closing tag leaves the previous value in place
            # instead of raising AttributeError on a failed match.
            match = re.search(r'<video_id>(.*?)</video_id>', row_content)
            if match:
                video_id = match.group(1)
        if '<publish_date>' in row_content:
            match = re.search(r'<publish_date>(.*?)</publish_date>', row_content)
            if match:
                publish_date = match.group(1)
        if '<doc>' in row_content:
            content_start = position + 1  # Start after <doc> tag
        if '</doc>' in row_content:
            content_end = position  # End before </doc> tag
            break  # Exit loop once end tag is found

    return video_id, publish_date, content_start, content_end


## Main loop
Here we iterate through the folders and their text files to count the frequency of the search terms.

In [1]:
# Positional index of the data column searched for each search-mask column.
# NOTE(review): this mapping (and the word count on "col3" below) implies the
# tagged files are laid out as lemma / tag / token - confirm against the data.
search_column_index = {'Token': 2, 'Tag': 1, 'Lemma': 0}


def _count_term_occurrences(column, term):
    """Count the rows of ``column`` at which ``term`` occurs.

    A single-word term is counted once for every row whose cell contains it
    as a whole word (case-insensitive). A term made of several
    whitespace-separated words is counted once for every row at which the
    words occur in order on consecutive rows of the same column.

    Each word is inserted into the regular expression unescaped, so regex
    metacharacters in a term (for example the "$" in a tag such as "PP$")
    are interpreted as regex syntax.
    """
    words = term.split()
    if not words:
        return 0
    matches = None
    for offset, word in enumerate(words):
        # shift(-offset) aligns the cell `offset` rows further down with the
        # current row; rows shifted in from beyond the end are NaN -> False.
        word_hits = column.shift(-offset).str.contains(fr'\b{word}\b', case=False, na=False)
        matches = word_hits if matches is None else (matches & word_hits)
    return int(matches.sum())


# Start counting from zero in this cell so that re-running it on its own does
# not carry the counter over from a previous run.
processed_files = 0
# Longest progress line printed so far; shorter lines are padded to this width
# so that '\r' overwriting leaves no residue of the previous line.
progress_line_width = 0

# Loop through all folder paths
for folder_path in folder_paths:
    # Extract the middle section of the folder path
    middle_folder_name = folder_path.split('/')[1]  # Adjust based on your folder structure
    print(f"Processing folder: {middle_folder_name}")

    # One result dictionary per processed file of the current folder
    folder_results = []

    # Loop through all files in the current folder
    for file_name in os.listdir(folder_path):
        # Only .txt files are processed
        if not file_name.endswith(".txt"):
            continue

        processed_files += 1
        progress_percentage = (processed_files / total_files) * 100 if total_files else 0.0
        progress_line = f"Processing file {processed_files}/{total_files} ({progress_percentage:.2f}%) - {file_name} in folder: {middle_folder_name}"
        progress_line_width = max(progress_line_width, len(progress_line))
        print(f"\r{progress_line.ljust(progress_line_width)}", end='', flush=True)

        # Construct the full file path
        file_path = os.path.join(folder_path, file_name)

        # Attempt to read the file using the default 'c' engine first
        try:
            df = pd.read_csv(file_path, delimiter="\t", header=None, names=["col1", "col2", "col3"], dtype=str, on_bad_lines='skip')
        except pd.errors.ParserError:
            print(f"\nParserError with default engine for file: {file_name}. Preprocessing and retrying with engine='python'.")
            # Preprocess the file to handle problematic lines
            cleaned_file_path = preprocess_file(file_path)
            try:
                # Retry with the cleaned file using the Python engine
                df = pd.read_csv(cleaned_file_path, delimiter="\t", header=None, names=["col1", "col2", "col3"], dtype=str, engine='python', on_bad_lines='skip')
            finally:
                # The cleaned copy is only a parsing aid. Left in the input
                # folder it would be picked up as an additional .txt file
                # (and counted a second time) on the next run.
                if cleaned_file_path != file_path and os.path.exists(cleaned_file_path):
                    os.remove(cleaned_file_path)

        # Extract metadata and content range
        video_id, publish_date, content_start, content_end = extract_metadata_and_content(df)

        # Skip files that don't contain the expected tags
        if content_start is None or content_end is None:
            continue

        # Rows between the <doc> and </doc> markers
        content_df = df.iloc[content_start:content_end]

        # Total number of whitespace-separated words in the third column
        total_word_count = int(content_df["col3"].fillna('').str.split().str.len().sum())

        # Per-file result: metadata first, one column per search term after it
        file_result = {
            "file_name": file_name,
            "video_id": video_id,
            "publish_date": publish_date,
            "total_word_count": total_word_count
        }

        # Every non-empty cell of the search mask is searched on its own
        for _, row in search_terms_df.iterrows():
            for col_name, column_index in search_column_index.items():
                raw_term = row[col_name]
                if pd.isna(raw_term):
                    continue
                # Convert before stripping: a cell parsed as a number has no .strip()
                term_to_search = str(raw_term).strip()
                if not term_to_search:
                    continue

                term_column_name = f"{col_name}_{term_to_search}"
                file_result[term_column_name] = _count_term_occurrences(
                    content_df.iloc[:, column_index], term_to_search
                )

        # Add the result dictionary to the list for this folder
        folder_results.append(file_result)

    # Convert the folder results into a DataFrame
    folder_df = pd.DataFrame(folder_results)

    # Save to a CSV file in the 'frequency_data' directory using the middle folder name
    output_csv_path = os.path.join(output_directory, f"{middle_folder_name}_frequency.csv")
    folder_df.to_csv(output_csv_path, index=False)

    print(f"\nFinished processing all files in folder: {middle_folder_name}, results saved to {output_csv_path}")

print("\nProcessing complete for all folders.")


Processing folder: en_BBCNews
Processing file 533/4743 (11.24%) - “90 killed and 300 injured” in israeli strike on gaza “humanitarian area”  bbc news_treetagger_output.txt in folder: en_BBCNewsewsen_BBCNewsws
Finished processing all files in folder: en_BBCNews, results saved to frequency_data\en_BBCNews_frequency.csv
Processing folder: en_CNN
Processing file 839/4743 (17.69%) - ‘you decided to still drop a bomb’ wolf presses idf spokesman on israeli airstrike on refugee camp_treetagger_output.txt in folder: en_CNNNN
Finished processing all files in folder: en_CNN, results saved to frequency_data\en_CNN_frequency.csv
Processing folder: en_DW
Processing file 1654/4743 (34.87%) - ‘it’s time for this war to end’_treetagger_output.txt in folder: en_DWews_treetagger_output.txt in folder: en_DW: en_DWn folder: en_DWDW_DW
Finished processing all files in folder: en_DW, results saved to frequency_data\en_DW_frequency.csv
Processing folder: en_AJ
Processing file 4403/4743 (92.83%) - war on gaza 