In [4]:

import csv
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk import bigrams, FreqDist
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK 'punkt' resource before any tokenization happens.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download("punkt")

# Input directory holding the .txt files, and the destination CSV for the
# per-file results.
folder_path = "data/en_DW/treetagger_output/"
output_csv_path = "data/en_DW/keyword_frequencies_collocations.csv"

# Filler words to ignore, placed ahead of scikit-learn's English stop-word set.
custom_stop_words = ["uh", "um"]
stop_words = [*custom_stop_words, *ENGLISH_STOP_WORDS]

# Function to extract metadata (video_id and publish_date) from a DataFrame
def extract_metadata(df):
    """Return the ``(video_id, publish_date)`` found in the first column of *df*.

    The cells of the first column are scanned top to bottom for
    ``<video_id>...</video_id>`` and ``<publish_date>...</publish_date>``
    tags. Scanning stops as soon as both values are non-empty; until then a
    later occurrence of a tag overwrites an earlier one.

    Args:
        df: DataFrame whose first column holds the raw lines of one file.

    Returns:
        A ``(video_id, publish_date)`` tuple. Either element is ``None`` when
        no complete opening/closing tag pair was found on a single line.
    """
    # A frame without columns has nothing to scan.
    if df.shape[1] == 0:
        return None, None

    video_id_pattern = re.compile(r'<video_id>(.*?)</video_id>')
    publish_date_pattern = re.compile(r'<publish_date>(.*?)</publish_date>')

    video_id = None
    publish_date = None

    for cell in df.iloc[:, 0]:
        # Parsed cells can be NaN or numeric; only strings can carry a tag,
        # and a substring/regex test on a float would raise TypeError.
        if not isinstance(cell, str):
            continue

        # An opening tag without its closing tag on the same line produces no
        # match; keep the previous value instead of calling .group() on None.
        match = video_id_pattern.search(cell)
        if match:
            video_id = match.group(1)

        match = publish_date_pattern.search(cell)
        if match:
            publish_date = match.group(1)

        if video_id and publish_date:
            break

    return video_id, publish_date

# Function to extract text from a DataFrame
def extract_text(df):
    """Concatenate every cell of the first column of *df* into one string.

    Each cell is converted to its string form and the pieces are separated by
    a single space.
    """
    first_column_as_text = df.iloc[:, 0].astype(str)
    return ' '.join(first_column_as_text)

# Read every .txt file in folder_path, collecting its text and metadata.
# texts[i] and metadata[i] always describe the same file.
texts = []
metadata = []
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the results identical from run to run.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, file_name)
    # QUOTE_NONE makes the parser treat quote characters as ordinary data.
    # With the default quoting, a token line that is itself a double quote
    # opens a "quoted field" that swallows the following lines.
    df = pd.read_csv(file_path, delimiter="\t", header=None, quoting=csv.QUOTE_NONE)

    # Extract text and metadata
    text = extract_text(df)
    video_id, publish_date = extract_metadata(df)

    texts.append(text)
    metadata.append({"video_id": video_id, "publish_date": publish_date})

# Fit a TF-IDF model over all documents, dropping the stop words defined above
# and keeping at most 1000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1000, stop_words=stop_words)
X = vectorizer.fit_transform(texts)

# Corpus-level score of a term: the sum of its TF-IDF weights over all documents.
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = X.sum(axis=0).A1
tfidf_dict = {term: score for term, score in zip(feature_names, tfidf_scores)}

# Rank the terms from highest to lowest summed score.
sorted_tfidf = sorted(tfidf_dict.items(), key=lambda item: item[1], reverse=True)

# The ten best-ranked terms are the keywords analysed below
# (change the slice to keep more or fewer).
top_keywords = [term for term, _ in sorted_tfidf[:10]]

# Accumulators filled while looping over the files.
frequency_results = []                              # one result dict per file
total_frequencies = dict.fromkeys(top_keywords, 0)  # keyword -> corpus-wide count
total_word_count = 0                                # corpus-wide word count
collocation_freq = FreqDist()                       # bigram -> corpus-wide count

# Function to calculate collocations with a context window
def calculate_collocations_with_window(text, top_keywords, window=3):
    """Count adjacent-token bigrams inside the context windows around keywords.

    *text* is lower-cased and tokenized with ``nltk.word_tokenize``. For every
    token that equals one of *top_keywords*, the slice of up to *window*
    tokens on each side (clipped at the start and end of the text) is taken
    and every pair of neighbouring tokens in that slice is counted once.

    Windows of nearby keyword occurrences can overlap; a bigram that lies in
    several windows is counted once per window.

    Args:
        text: Raw text of one document.
        top_keywords: Iterable of keyword strings; they are compared against
            the lower-cased tokens.
        window: Number of context tokens taken on each side of a keyword.

    Returns:
        A ``FreqDist`` mapping ``(token, next_token)`` tuples to counts.
    """
    tokens = nltk.word_tokenize(text.lower())
    # Membership is tested once per token, so build the lookup set up front
    # instead of scanning the keyword list every time.
    keyword_set = set(top_keywords)
    keyword_collocations = FreqDist()

    for position, token in enumerate(tokens):
        if token not in keyword_set:
            continue

        # Clip the window to the bounds of the token list.
        start = max(position - window, 0)
        end = min(position + window + 1, len(tokens))

        for bigram in bigrams(tokens[start:end]):
            keyword_collocations[bigram] += 1

    return keyword_collocations

def sketch_logdice(bigram, collocation_freq, keyword_freq):
    """Return the logDice association score of *bigram*.

    The score is ``14 + log2(2 * f_xy / (f_x + f_y))`` where

    * ``f_xy`` is the count of *bigram* itself,
    * ``f_x`` is the total count of all bigrams whose first word is ``word1``,
    * ``f_y`` is the total count of all bigrams whose second word is ``word2``.

    Because *bigram* contributes to both ``f_x`` and ``f_y``, the score never
    exceeds 14.

    Args:
        bigram: ``(word1, word2)`` tuple to score.
        collocation_freq: Mapping from ``(first, second)`` tuples to counts
            (for example a ``FreqDist``).
        keyword_freq: Not used by the computation; kept so existing callers
            keep working.

    Returns:
        The score as a ``float``, or ``float('-inf')`` when *bigram* has no
        recorded occurrences.
    """
    word1, word2 = bigram

    freq_bigram = collocation_freq.get(bigram, 0)
    # A pair that was never observed has no finite score (log2 of zero).
    if freq_bigram <= 0:
        return float('-inf')

    # The keys of collocation_freq are (first, second) tuples, so the
    # marginals have to be built by unpacking each key - looking up
    # (word1, key) would never match anything.
    freq_word1_context = 0
    freq_word2_context = 0
    for (first, second), count in collocation_freq.items():
        if first == word1:
            freq_word1_context += count
        if second == word2:
            freq_word2_context += count

    # The denominator is at least 2 * freq_bigram > 0 here, so no division by zero.
    logdice_score = 14 + np.log2((2 * freq_bigram) / (freq_word1_context + freq_word2_context))

    return float(logdice_score)

# Loop through all files to count keyword frequencies and collocations.
# Tokens are lower-cased runs of two or more word characters - the pattern
# TfidfVectorizer applies by default - so the counts line up with the way the
# keywords were extracted.
token_pattern = re.compile(r"(?u)\b\w\w+\b")

for text, meta in zip(texts, metadata):
    # Count whole tokens rather than substrings: str.count() on the raw text
    # would also count a keyword inside longer words (e.g. "war" in "award").
    tokens = token_pattern.findall(text.lower())
    term_counts = {keyword: tokens.count(keyword) for keyword in top_keywords}
    word_count = len(text.split())
    total_terms = sum(term_counts.values())

    for keyword in top_keywords:
        total_frequencies[keyword] += term_counts[keyword]
    total_word_count += word_count

    # Occurrences per million whitespace-separated words of this file.
    term_relative_frequencies = {
        keyword: (term_counts[keyword] / word_count) * 1_000_000 if word_count > 0 else 0
        for keyword in top_keywords
    }

    # Fold this file's window bigram counts into the corpus-wide counts.
    file_collocations = calculate_collocations_with_window(text, top_keywords)
    for bigram, count in file_collocations.items():
        collocation_freq[bigram] += count

    # One output row per file: metadata, totals, raw counts, relative counts.
    result = {
        "video_id": meta["video_id"],
        "publish_date": meta["publish_date"],
        "total_term_count": total_terms,
        "total_word_count": word_count
    }
    result.update(term_counts)
    result.update({f"{keyword}_relative": term_relative_frequencies[keyword] for keyword in top_keywords})

    frequency_results.append(result)

# Create a DataFrame from the results
df_results = pd.DataFrame(frequency_results)

# Calculate LogDice for each collocation
logdice_scores = {bigram: sketch_logdice(bigram, collocation_freq, total_frequencies) for bigram in collocation_freq.keys()}

# Sort collocations by LogDice score
sorted_collocations = sorted(logdice_scores.items(), key=lambda x: x[1], reverse=True)

# Print out the top collocations based on LogDice
print("Top Collocations by LogDice Score:")
for bigram, score in sorted_collocations[:20]:  # Adjust the number of top collocations as needed
    print(f"  Collocation: {bigram}, LogDice Score: {score:.4f}")

# Save the DataFrame to a CSV file
df_results.to_csv(output_csv_path, index=False)

# Calculate and print total frequencies and relative frequencies across all files
print("Total Frequencies Across All Files:")
total_keyword_count = sum(total_frequencies.values())
for keyword, count in total_frequencies.items():
    # Guard the corpus-level ratio the same way as the per-file one: with no
    # files (or only empty ones) total_word_count is 0.
    relative_frequency = (count / total_word_count) * 1_000_000 if total_word_count > 0 else 0
    print(f"  {keyword} Total Count: {count}")
    print(f"  {keyword} Relative Frequency: {relative_frequency:.2f} instances per million")
print(f"  Total Word Count Across All Files: {total_word_count}")


Processing folder: en_CNN
Processing file 306/306 (100.00%) - ‘you decided to still drop a bomb’ wolf presses idf spokesman on israeli airstrike on refugee camp_treetagger_output.txt in folder: en_CNN
Finished processing all files in folder: en_CNN, results saved to frequency_data\en_CNN_frequency.csv

Processing complete for all folders.


[Relationship Extraction & Network Analysis with Spacy & NetworkX](https://youtu.be/fAHkJ_Dhr50?si=8fDlto50iqekKnI-)