# Frequency script
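This notebook extracts frequencies for a list of search terms: for every JSON file of TreeTagger output, it counts how often each search term from the input CSV occurs directly after a token tagged `VBD` or `VBN`, and writes one CSV of results per folder.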

## Setting up

In [6]:
import os
import pandas as pd
import re
import json

# CSV file listing the search terms. The part of its name before the first
# underscore is reused later when naming the per-folder output files.
input_csv_path = "killed-murdered_input.csv"
file_name_addition = input_csv_path.partition("_")[0]


def _strip_text_column(column):
    """Trim leading/trailing whitespace in object-dtype columns; pass others through."""
    if column.dtype == "object":
        return column.str.strip()
    return column


# Load the search terms and remove stray whitespace around every text cell
search_terms_df = pd.read_csv(input_csv_path).apply(_strip_text_column)

# Folders whose JSON files will be processed
folder_paths = [
    "data/en_BBCNews/treetagger_output/",
    "data/en_CNN/treetagger_output/",
    "data/en_DW/treetagger_output/",
    "data/en_AJ/treetagger_output/",
]

# Directory that receives the result CSVs; created if it does not exist yet
output_directory = "killed_data"
os.makedirs(output_directory, exist_ok=True)

def search_file_ci(search_term, filedf):
    """Return the index labels of the rows in *filedf* that match *search_term*.

    The comparison is case-insensitive and ignores surrounding whitespace.

    Args:
        search_term: Mapping with the keys 'Token', 'Tag' and 'Lemma'. A
            missing value (NaN / pd.NA / None) means "no constraint on this
            column"; every other value must be a string.
        filedf: DataFrame holding string columns for every key of
            *search_term* that carries a value.

    Returns:
        A pandas Index with the labels of the rows that satisfy every given
        criterion, in frame order. An empty Index is returned when no row
        matches or when all three criteria are missing.

    Raises:
        KeyError: If *search_term* lacks one of the three keys, or *filedf*
            lacks a column for which a criterion is given.
    """
    combined_mask = None

    # Each criterion is checked against its own column only. In particular
    # the Token criterion is no longer matched against every column, so a
    # Token of "kill" cannot hit a row merely because its Lemma is "kill".
    for column in ('Token', 'Tag', 'Lemma'):
        wanted = search_term[column]
        if pd.isna(wanted):
            continue

        wanted = wanted.lower().strip()
        # Non-string cells become NaN under .str and compare as False
        column_mask = filedf[column].str.lower().str.strip() == wanted
        combined_mask = column_mask if combined_mask is None else combined_mask & column_mask

    if combined_mask is None:
        # No criterion at all: report no rows rather than every row
        return pd.Index([])
    return filedf.index[combined_mask]

def search_term_iterate(search_terms_df, df, file_result):
    """Add per-term counts of hits that directly follow a VBD/VBN row.

    For every row of *search_terms_df* the rows of *df* matching the term are
    located with ``search_file_ci``. A hit is counted only when the row
    immediately before it is tagged "VBD" or "VBN". The count is stored in
    *file_result* under the key "<Token>.<Tag>.<Lemma>" (missing parts are
    empty strings). When at least one hit exists, a second key, the same name
    followed by "subject", receives a space-joined string of Token values:
    the token two rows before each VBD-preceded hit, followed by the token
    three rows before each VBN-preceded hit.

    Args:
        search_terms_df: DataFrame with the columns 'Token', 'Tag', 'Lemma'.
        df: DataFrame of one tagged file with the columns 'Token', 'Tag' and
            'Lemma'; its index must consist of consecutive integers because
            neighbouring rows are addressed by index arithmetic.
        file_result: Dict that is updated in place.

    Returns:
        The same *file_result* dict, for convenience.
    """
    # String switch kept from the notebook: any value other than 'TRUE'
    # disables the analysis and leaves file_result untouched.
    preceding_terms = 'TRUE'
    if preceding_terms != 'TRUE' or len(search_terms_df) == 0:
        return file_result

    preceding_term1 = {'Token': pd.NA, 'Tag': "VBD", 'Lemma': pd.NA}
    preceding_term2 = {'Token': pd.NA, 'Tag': "VBN", 'Lemma': pd.NA}

    # The positions of the preceding tags do not depend on the search term,
    # so they are located once per file instead of once per term. Adding one
    # turns "row tagged VBD/VBN" into "row directly after it".
    rows_after_term1 = {idx + 1 for idx in search_file_ci(preceding_term1, df)}
    rows_after_term2 = {idx + 1 for idx in search_file_ci(preceding_term2, df)}

    for _, row in search_terms_df.iterrows():
        search_term = {
            'Token': row['Token'],
            'Tag': row['Tag'],
            'Lemma': row['Lemma']
        }
        term_column_name = ".".join(str(value) if pd.notna(value) else "" for value in search_term.values())

        term_rows = set(search_file_ci(search_term, df))
        common_indices1 = term_rows & rows_after_term1
        common_indices2 = term_rows & rows_after_term2
        common_indices = common_indices1 | common_indices2

        file_result[term_column_name] = len(common_indices)
        if not common_indices:
            continue

        # Sorting keeps the collected tokens in row order within each group
        # instead of relying on set iteration order. Offsets that fall
        # outside the frame are dropped.
        subject_rows = [idx - 2 for idx in sorted(common_indices1) if (idx - 2) in df.index]
        subject_rows += [idx - 3 for idx in sorted(common_indices2) if (idx - 3) in df.index]

        # Skip missing tokens: str.join would raise TypeError on NaN/None
        subject_tokens = [
            str(token)
            for token in df.loc[subject_rows, 'Token'].tolist()
            if pd.notna(token)
        ]
        file_result[term_column_name + "subject"] = " ".join(subject_tokens).strip()

    return file_result

# List the JSON files of every folder once; the same listing drives both the
# overall progress total and the processing loop below.
json_files_by_folder = {
    folder: [name for name in os.listdir(folder) if name.endswith(".json")]
    for folder in folder_paths
}
total_files = sum(len(json_files_by_folder[folder]) for folder in folder_paths)
processed_files = 0

for folder_path in folder_paths:
    # e.g. "data/en_CNN/treetagger_output/" -> "en_CNN"
    middle_folder_name = folder_path.split('/')[1]
    print(f"Processing folder: {middle_folder_name}")

    folder_results = []
    # Width of the progress line currently on screen (0 = cursor on a fresh line)
    shown_width = 0

    for file_name in json_files_by_folder[folder_path]:
        processed_files += 1
        progress_percentage = (processed_files / total_files) * 100
        progress_line = f"Processing file {processed_files}/{total_files} ({progress_percentage:.2f}%) - {file_name} in folder: {middle_folder_name}"
        # '\r' only moves the cursor back; pad to the previous width so the
        # tail of a longer earlier line is overwritten instead of left behind.
        print("\r" + progress_line.ljust(shown_width), end='', flush=True)
        shown_width = len(progress_line)

        file_path = os.path.join(folder_path, file_name)

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        video_id = data['video_id']
        publish_date = data['publish_date']

        content_df = pd.DataFrame(data['treetagger_output'])
        try:
            # Number of whitespace-separated words over all Token cells
            total_word_count = content_df["Token"].fillna('').str.split().str.len().sum()
            total_word_count = int(total_word_count)
        except KeyError as e:
            # Leading newline: do not glue the message onto the progress line
            print(f"\nKeyError: {e}. The column 'Token' does not exist in the DataFrame for File {file_path}")
            shown_width = 0
            continue
        except Exception as e:
            # Best effort: report the file-level problem and move on
            print(f"\nAn unexpected error occurred: {e}")
            shown_width = 0
            continue

        file_result = {
            "file_name": file_name,
            "video_id": video_id,
            "publish_date": publish_date,
            "total_word_count": total_word_count
        }
        file_result = search_term_iterate(search_terms_df, content_df, file_result)
        folder_results.append(file_result)

    # One row per successfully processed file, one CSV per folder
    folder_df = pd.DataFrame(folder_results)

    output_csv_path = os.path.join(output_directory, f"{middle_folder_name}_{file_name_addition}.csv")
    folder_df.to_csv(output_csv_path, index=False)

    print(f"\nFinished processing all files in folder: {middle_folder_name}, results saved to {output_csv_path}")

print("\nProcessing complete for all folders.")

Processing folder: en_BBCNews
Processing file 532/4744 (11.21%) - “90 killed and 300 injured” in israeli strike on gaza “humanitarian area”  bbc news_output.json in folder: en_BBCNewsewsen_BBCNewsws
Finished processing all files in folder: en_BBCNews, results saved to killed_data\en_BBCNews_killed-murdered.csv
Processing folder: en_CNN
Processing file 838/4744 (17.66%) - ‘you decided to still drop a bomb’ wolf presses idf spokesman on israeli airstrike on refugee camp_output.json in folder: en_CNNNN
Finished processing all files in folder: en_CNN, results saved to killed_data\en_CNN_killed-murdered.csv
Processing folder: en_DW
Processing file 1056/4744 (22.26%) - german fm hamas holding entire gaza population hostage  dw news⁣_output.json in folder: en_DWolder: en_DWolder: en_DWW_DW_DWWWKeyError: 'Token'. The column 'Token' does not exist in the DataFrame for File data/en_DW/treetagger_output/german fm hamas holding entire gaza population hostage  dw news⁣_output.json
Processing file 1