# Frequency script
This notebook extracts the frequency of each term in a list of search terms. First, some setup:

## Setting up

### import packages

In [209]:
import os
import pandas as pd
import re
import json

In [210]:
# CSV file that lists the search terms, one term per row
input_csv_path = "test_frequency.csv"
# Load the search terms into a DataFrame
search_terms_df = pd.read_csv(filepath_or_buffer=input_csv_path)

In [211]:
# Display the loaded search terms (plain-text rendering of the whole frame)
print(search_terms_df)

    Token  Tag                Lemma
0     NaN  NaN                 kill
1     NaN  NaN                 Gaza
2     NaN  NaN                 gaza
3  killed  VVN                  NaN
4     NaN  NaN       United Nations
5     NaN  NaN  Israeli government 


In [221]:
# Tagged input file to search (JSON)
file_path = "data/en_CNN/treetagger_output/israeli military on high alert for potential imminent attack by iran_output.json"
# Parse the JSON document, decoding the file as UTF-8
with open(file_path, mode='r', encoding='utf-8') as file:
    data = json.load(file)

# Metadata fields stored alongside the tagging result
video_id, publish_date, video_title = (
    data[key] for key in ('video_id', 'publish_date', 'video_title')
)

# The tagging result itself: a list of dictionaries ...
treetagger_output = data['treetagger_output']

# ... turned into a DataFrame with one row per list entry
df_treetagger = pd.DataFrame(treetagger_output)


In [213]:
def search_file_ci(search_term, filedf):
    """Find the rows of ``filedf`` that match a search term, ignoring case.

    Every non-missing field of ``search_term`` ('Token', 'Tag', 'Lemma') is
    lowercased and compared with the lowercased column of the same name. A
    row matches only if it satisfies all of the fields that are given.

    Parameters
    ----------
    search_term : mapping
        Must provide the keys 'Token', 'Tag' and 'Lemma'. A missing value
        (NaN / pd.NA) means "no constraint on this field".
    filedf : pandas.DataFrame
        Frame to search. The 'Tag' / 'Lemma' columns are required whenever
        the corresponding field is given. A 'Token' column is used when it
        exists; if it does not, the token is looked up in every column.

    Returns
    -------
    pandas.Index
        Index labels of the matching rows, one entry per matching row, in
        row order. Empty when nothing matches or when every field is
        missing, so ``len()`` of the result is always a valid count.
    """
    # No constraint yet; each given field narrows the row mask further.
    mask = None

    for column in ('Token', 'Tag', 'Lemma'):
        wanted = search_term[column]
        if pd.isna(wanted):
            continue  # field not specified -> no constraint
        wanted = wanted.lower()

        if column == 'Token' and column not in filedf.columns:
            # Fallback for frames without a 'Token' column: look for the
            # token in any column, but reduce the hits to one flag per row
            # so a row that matches in several columns is counted once.
            cell_hits = filedf.apply(
                lambda col: col.str.lower().isin([wanted])
                if col.dtype == 'object' else col.isin([wanted])
            )
            column_mask = cell_hits.any(axis=1)
        else:
            # isin() yields a plain boolean mask (missing cells -> False)
            column_mask = filedf[column].str.lower().isin([wanted])

        mask = column_mask if mask is None else mask & column_mask

    if mask is None:
        # Every field was missing: report "no matches" instead of None.
        return filedf.index[:0]
    return filedf.index[mask.to_numpy()]

In [214]:
def search_file(search_term, filedf):
    """Find the rows of ``filedf`` that match a search term (case-sensitive).

    Every non-missing field of ``search_term`` ('Token', 'Tag', 'Lemma') is
    compared with the column of the same name. A row matches only if it
    satisfies all of the fields that are given.

    Parameters
    ----------
    search_term : mapping
        Must provide the keys 'Token', 'Tag' and 'Lemma'. A missing value
        (NaN / pd.NA) means "no constraint on this field".
    filedf : pandas.DataFrame
        Frame to search. The 'Tag' / 'Lemma' columns are required whenever
        the corresponding field is given. A 'Token' column is used when it
        exists; if it does not, the token is looked up in every column.

    Returns
    -------
    pandas.Index
        Index labels of the matching rows, one entry per matching row, in
        row order. Empty when nothing matches or when every field is
        missing.
    """
    # No constraint yet; each given field narrows the row mask further.
    mask = None

    for column in ('Token', 'Tag', 'Lemma'):
        wanted = search_term[column]
        if pd.isna(wanted):
            continue  # field not specified -> no constraint

        if column == 'Token' and column not in filedf.columns:
            # Fallback for frames without a 'Token' column: look in every
            # column, but keep a single flag per row so that a row matching
            # in several columns is not returned more than once.
            column_mask = filedf.isin([wanted]).any(axis=1)
        else:
            column_mask = filedf[column].isin([wanted])

        mask = column_mask if mask is None else mask & column_mask

    if mask is None:
        # Every field was missing: report "no matches" rather than failing
        # on an unassigned result variable.
        return filedf.index[:0]
    return filedf.index[mask.to_numpy()]

In [215]:
def split_two_word_entries(entry):
    """Split the first two-word field of a search term into two terms.

    Returns a pair ``(first, second)``:

    * ``first`` is a copy of ``entry`` in which the first field holding
      exactly two whitespace-separated words is replaced by its first word;
    * ``second`` has the same keys, all set to ``pd.NA`` except that same
      field, which holds the second word.

    Only the first such field is split; ``entry`` itself is not modified.
    If no field holds two words, ``first`` is an unchanged copy and
    ``second`` is all ``pd.NA``.
    """
    leading = entry.copy()
    trailing = dict.fromkeys(entry, pd.NA)

    for field, text in entry.items():
        if not isinstance(text, str):
            continue
        words = text.split()
        if len(words) != 2:
            continue
        # First word stays in the leading term, second goes to the trailing one
        leading[field], trailing[field] = words
        break

    return leading, trailing

In [216]:
def check_two_terms(search_term):
    """Tell whether any field of the search term consists of exactly two words.

    Returns the string 'TRUE' if at least one value is a string made of two
    whitespace-separated words, otherwise the string 'FALSE'.
    """
    has_two_word_field = any(
        isinstance(text, str) and len(text.split()) == 2
        for _, text in search_term.items()
    )
    return 'TRUE' if has_two_word_field else 'FALSE'

**Known issue:** for two-word terms, every matching index is currently returned twice (cause not yet identified). The duplicates disappear again when the two index sets are intersected, so the two-word counts are not affected.

In [223]:
# Count, for every search term in the CSV, how often it occurs in the tagged file.
# The counts are collected in `file_result`, keyed by a name built from the term.
file_result = {
    "file_name": "random_name",
}
for _, row in search_terms_df.iterrows():
    # One search term = the 'Token', 'Tag' and 'Lemma' cells of the row
    search_term = {
        'Token': row['Token'],
        'Tag': row['Tag'],
        'Lemma': row['Lemma'],
    }
    # Result key: the three fields joined by '.', missing fields left empty
    term_column_name = ".".join(str(value) if pd.notna(value) else "" for value in search_term.values())

    two_terms = check_two_terms(search_term)
    if two_terms == 'TRUE':
        # Two-word term: search each word separately ...
        single_term, following_term = split_two_word_entries(search_term)
        # (search the tagged file itself; `df` is not defined in this notebook)
        indices_single_term = search_file_ci(single_term, df_treetagger)
        indices_following_term = search_file_ci(following_term, df_treetagger)
        print("indices_single_term: ", indices_single_term)
        print("indices_following_term: ", indices_following_term)
        # ... then keep the positions where the second word sits directly
        # after the first: shifting the second word's row numbers back by
        # one lines them up with the first word's row numbers.
        common_indices = indices_single_term.intersection(indices_following_term - 1)
        print("Two term indices:", len(common_indices))
        file_result[term_column_name] = len(common_indices)
    else:
        # Single-word term: search once and reuse the result
        single_indices = search_file_ci(search_term, df_treetagger)
        print(single_indices)
        file_result[term_column_name] = len(single_indices)

print(file_result)
            

Index([], dtype='int64')
Index([1429, 1984, 1995], dtype='int64')
Index([1429, 1984, 1995], dtype='int64')
Index([], dtype='int64')
indices_single_term:  Index([928, 946, 1390, 1473], dtype='int64')
indices_following_term:  Index([], dtype='int64')
Two term indices: 0
indices_single_term:  Index([86, 271, 438, 1345], dtype='int64')
indices_following_term:  Index([272, 439, 2266], dtype='int64')
Two term indices: 2
{'file_name': 'random_name', '..kill': 0, '..Gaza': 3, '..gaza': 3, 'killed.VVN.': 0, '..United Nations': 0, '..Israeli government ': 2}
