# Frequency script
This notebook is used to extract the frequency for a list of search terms.

## Setting up

### import packages

In [1]:
import os
import pandas as pd
import re
import json

### define search terms
The search terms for the frequency analysis are located in a search mask.
This is currently located in a csv file, with the following structure:
| Token | Tag | Lemma |
|-------|-----|-------|
| sings |     |       |
|       | VVN |       |
|       |     | sing  |

For searching for a specific token (e.g. "Palestinians") the search term needs to be placed in the Token column, for a specific Tag (e.g. "VVN") in the Tag column and to look for a Lemma in the Lemma column.
Currently having things in more than one column, doesn't work.
Several words per cell does not properly seem implemented.

In [2]:
# Path to the CSV file containing search terms
input_csv_path = "frequency_en_input.csv"
file_name_addition = input_csv_path.split("_")[0]

# Read the input CSV file
search_terms_df = pd.read_csv(input_csv_path)

# Strip trailing (and leading) whitespaces from all string columns
search_terms_df = search_terms_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Now search_terms_df has no trailing whitespaces in any of the cells

file_name_addition is a variable that will be used so the output file reflects the input filename.

### define the folders
the folders in folder_paths will be searched for the search terms.

In [3]:
# Define the folders you want to process
folder_paths = [
    "data/en_BBCNews/treetagger_output/",
    "data/en_CNN/treetagger_output/",
    "data/en_DW/treetagger_output/",
    "data/en_AJ/treetagger_output/"
]

### create output folder
here a .csv file with the frequency results will be saved

In [4]:
# Create the 'frequency_data' directory if it doesn't exist
output_directory = "frequency_data_new"
os.makedirs(output_directory, exist_ok=True)

## Initiate functions

### initiate search function

In [5]:
def search_file(search_term, filedf):
    if 'row_indices' in locals():
          del row_indices
    if pd.isna(search_term['Token']):
        pass  # Do nothing if Token is NaN
    else:
        match = filedf.isin([search_term['Token']])
        # Stack the DataFrame to get the positions where matches occurred
        positions = match.stack()[match.stack()]
        row_indices = positions.index.get_level_values(0)
    if pd.isna(search_term['Tag']):
        pass  # Do nothing if Token is NaN
    else:
        if 'row_indices' in locals():
            tag_match = filedf.loc[row_indices, 'Tag'] == search_term['Tag']
            row_indices = row_indices[tag_match]
        else:
            match = filedf.isin([search_term['Tag']])
            # Stack the DataFrame to get the positions where matches occurred
            positions = match.stack()[match.stack()]
            row_indices = positions.index.get_level_values(0)
    if pd.isna(search_term['Lemma']):
        pass  # Do nothing if Token is NaN
    else:
        if 'row_indices' in locals():
            lemma_match = filedf.loc[row_indices, 'Lemma'] == search_term['Lemma']
            row_indices = row_indices[lemma_match]
        else:
            match = filedf.isin([search_term['Lemma']])
            # Stack the DataFrame to get the positions where matches occurred
            positions = match.stack()[match.stack()]
            row_indices = positions.index.get_level_values(0)
    return(row_indices)

Below the same function but case insensitive

In [6]:
def search_file_ci(search_term, filedf):
    # Initialize row_indices to None
    row_indices = None

    if pd.isna(search_term['Token']):
        pass  # Do nothing if Token is NaN
    else:
        # Convert search term to lowercase
        token = search_term['Token'].lower()
        # Create a boolean mask for matches in a case-insensitive manner
        match = filedf.apply(lambda col: col.str.lower().isin([token]) if col.dtype == 'object' else col.isin([token]))
        # Stack the DataFrame to get the positions where matches occurred
        positions = match.stack()[match.stack()]
        row_indices = positions.index.get_level_values(0)

    if pd.isna(search_term['Tag']):
        pass  # Do nothing if Tag is NaN
    else:
        # Convert search term to lowercase
        tag = search_term['Tag'].lower()
        if row_indices is not None:
            tag_match = filedf.loc[row_indices, 'Tag'].str.lower() == tag
            row_indices = row_indices[tag_match]
        else:
            match = filedf['Tag'].str.lower().isin([tag])
            row_indices = filedf.index[match]

    if pd.isna(search_term['Lemma']):
        pass  # Do nothing if Lemma is NaN
    else:
        # Convert search term to lowercase
        lemma = search_term['Lemma'].lower()
        if row_indices is not None:
            lemma_match = filedf.loc[row_indices, 'Lemma'].str.lower() == lemma
            row_indices = row_indices[lemma_match]
        else:
            match = filedf['Lemma'].str.lower().isin([lemma])
            row_indices = filedf.index[match]

    return row_indices

### functions to deal with two word terms

In [7]:
def split_two_word_entries(entry):
    # Create a copy of the original dictionary for the first entry
    first_dict = entry.copy()
    second_dict = {k: pd.NA for k in entry}  # Initialize the second dictionary with pd.NA
    
    for key, value in entry.items():
        if isinstance(value, str) and len(value.split()) == 2:  # Check if value is a string and has two words
            word1, word2 = value.split()
            first_dict[key] = word1  # Replace the two-word entry with the first word
            second_dict[key] = word2  # Store the second word in the second dictionary
            break  # Stop once a two-word entry is found and split
            
    return first_dict, second_dict  # Return both dictionaries

In [8]:
def check_two_terms(search_term):
    two_terms = 'FALSE'
    for key, value in search_term.items():
        if isinstance(value, str) and len(value.split()) == 2: 
            #print("!two terms found")
            two_terms= 'TRUE'
    return two_terms

### function to iterate through search terms 

In [9]:
# Loop through each row in the search terms CSV
def search_term_iterate(search_terms_df, df, file_result):
    for _, row in search_terms_df.iterrows():
        # Process columns 'Token', 'Tag', and 'Lemma'
        search_term = {
        'Token': row['Token'],
        'Tag': row['Tag'],
        'Lemma': row['Lemma']
        }
        #print(search_terms_df)
        #print(df)
        #Here is defined what the column of the search result will look like
        term_column_name = ".".join(str(value) if pd.notna(value) else "" for value in search_term.values())
        #print(search_term)
        two_terms= check_two_terms(search_term)
        #check for two word terms:
        if two_terms=='FALSE':
            #print(search_file_ci(search_term,df))
            file_result[term_column_name] = len(search_file_ci(search_term,df))

 
        if two_terms=='TRUE':
            # Perform the split and assign to separate variables
            single_term, following_term = split_two_word_entries(search_term)
            indices_single_term = search_file_ci(single_term,df)
            indices_following_term = search_file_ci(following_term,df)
            # Keep only the indices that are present in both sets
            #print("indices_single_term: ",indices_single_term)
            #print("indices_following_term: ",indices_following_term)
            common_indices = indices_single_term.intersection(indices_following_term-1)
            # Output the result
            #print("Two term indices:", common_indices)
            file_result[term_column_name] = len(common_indices)
    return file_result

## Main loop
Here we iterate through folders, and text files to count the frequency of the search terms.

In [10]:
# Calculate the total number of files to be processed
total_files = sum([len([name for name in os.listdir(folder) if name.endswith(".json")]) for folder in folder_paths])
processed_files = 0
# Loop through all folder paths
for folder_path in folder_paths:
    # Extract the middle section of the folder path
    middle_folder_name = folder_path.split('/')[1]  # Adjust based on your folder structure
    print(f"Processing folder: {middle_folder_name}")
    
    # Create a list to store data for the current folder
    folder_results = []
    
    # Loop through all files in the current folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a .txt file
        if file_name.endswith(".json"):
            processed_files += 1
            progress_percentage = (processed_files / total_files) * 100
            print(f"\rProcessing file {processed_files}/{total_files} ({progress_percentage:.2f}%) - {file_name} in folder: {middle_folder_name}", end='', flush=True)
            
            # Construct the full file path
            file_path = os.path.join(folder_path, file_name)
            
            # Open the JSON file and load the data with utf-8 encoding
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            # Accessing various parts of the JSON data
            video_id = data['video_id']
            publish_date = data['publish_date']
            video_title = data['video_title']
              
            # Access the treetagger_output
            treetagger_output = data['treetagger_output']
              
            # Convert the treetagger_output list of dictionaries to a DataFrame
            content_df = pd.DataFrame(treetagger_output)
            try:
                total_word_count = content_df["Token"].fillna('').str.split().str.len().sum()
                total_word_count = int(total_word_count)  # Ensure it's an integer
            except KeyError as e:
                print(f"KeyError: {e}. The column 'Token' does not exist in the DataFrame for File {file_path}")
                continue  # Skip this iteration and move to the next
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                continue  # Skip this iteration for any other unexpected exceptions

            # Initialize a dictionary to store term counts for this file and iterate through the search terms
            file_result = {
                "file_name": file_name,
                "video_id": video_id,
                "publish_date": publish_date,
                "total_word_count": total_word_count
            }
            file_result = search_term_iterate(search_terms_df, content_df, file_result)
            #print(file_result)
            # Add the result dictionary to the list for this folder
            folder_results.append(file_result)

    # Convert the folder results into a DataFrame
    folder_df = pd.DataFrame(folder_results)

    # Save to a CSV file in the 'frequency_data' directory using the middle folder name
    output_csv_path = os.path.join(output_directory, f"{middle_folder_name}_{file_name_addition}.csv")
    folder_df.to_csv(output_csv_path, index=False)

    print(f"\nFinished processing all files in folder: {middle_folder_name}, results saved to {output_csv_path}")

print("\nProcessing complete for all folders.")


Processing folder: en_BBCNews
Processing file 532/4744 (11.21%) - “90 killed and 300 injured” in israeli strike on gaza “humanitarian area”  bbc news_output.json in folder: en_BBCNewsewsen_BBCNewsws
Finished processing all files in folder: en_BBCNews, results saved to frequency_data_new\en_BBCNews_frequency.csv
Processing folder: en_CNN
Processing file 838/4744 (17.66%) - ‘you decided to still drop a bomb’ wolf presses idf spokesman on israeli airstrike on refugee camp_output.json in folder: en_CNNNN
Finished processing all files in folder: en_CNN, results saved to frequency_data_new\en_CNN_frequency.csv
Processing folder: en_DW
Processing file 1056/4744 (22.26%) - german fm hamas holding entire gaza population hostage  dw news⁣_output.json in folder: en_DWolder: en_DWolder: en_DWW_DW_DWWWKeyError: 'Token'. The column 'Token' does not exist in the DataFrame for File data/en_DW/treetagger_output/german fm hamas holding entire gaza population hostage  dw news⁣_output.json
Processing file