In [35]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.0 MB 2.1 MB/s eta 0:00:05
   ---- ----------------------------------- 1.3/11.0 MB 2.2 MB/s eta 0:00:05
   ----- ---------------------------------- 1.6/11.0 MB 2.2 MB/s eta 0:00:05
   ------- -------------------------------- 2.1/11.0 MB 2.2 MB/s eta 0:00:05
   --------- ------------------------------ 2.6/11.0 MB 2.2 MB/s eta 0:00

In [37]:
import sklearn
print(sklearn.__version__)

1.5.2


In [6]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Path to the folder containing the .txt files
folder_path = "data/en_DW/treetagger_output/"
output_csv_path = "data/en_DW/keyword_frequencies.csv"

# Define custom stop words
custom_stop_words = ['uh', 'um']
stop_words = custom_stop_words + list(ENGLISH_STOP_WORDS)

# Function to extract metadata (video_id and publish_date) from a DataFrame
def extract_metadata(df):
    video_id = None
    publish_date = None
    
    for index, row in df.iterrows():
        if '<video_id>' in row.iloc[0]:
            video_id = re.search(r'<video_id>(.*?)</video_id>', row.iloc[0]).group(1)
        if '<publish_date>' in row.iloc[0]:
            publish_date = re.search(r'<publish_date>(.*?)</publish_date>', row.iloc[0]).group(1)
        if video_id and publish_date:
            break
            
    return video_id, publish_date

# Function to extract text from a DataFrame
def extract_text(df):
    text = ' '.join(df.iloc[:, 0].astype(str).tolist())  # Join all rows from the first column
    return text

# Read and process all text data
texts = []
metadata = []
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, delimiter="\t", header=None)
        
        # Extract text and metadata
        text = extract_text(df)
        video_id, publish_date = extract_metadata(df)
        
        texts.append(text)
        metadata.append({"video_id": video_id, "publish_date": publish_date})

# Preprocess the text data with custom stop words
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
X = vectorizer.fit_transform(texts)

# Calculate TF-IDF scores
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = X.sum(axis=0).A1
tfidf_dict = dict(zip(feature_names, tfidf_scores))

# Sort terms by their TF-IDF score
sorted_tfidf = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)

# Get top keywords
top_keywords = [term for term, score in sorted_tfidf[:10]]  # Adjust the number of top keywords as needed

# Dictionaries to store frequency results
frequency_results = []
total_frequencies = {keyword: 0 for keyword in top_keywords}
total_word_count = 0

# Loop through all files to count keyword frequencies
for text, meta in zip(texts, metadata):
    # Count occurrences of top keywords
    term_counts = {keyword: text.lower().count(keyword) for keyword in top_keywords}
    word_count = len(text.split())  # Total number of words in the file
    total_terms = sum(term_counts.values())  # Total count of top keywords

    # Update total frequencies and word count
    for keyword in top_keywords:
        total_frequencies[keyword] += term_counts[keyword]
    total_word_count += word_count

    # Calculate relative frequencies for each keyword (instances per million)
    term_relative_frequencies = {keyword: (term_counts[keyword] / word_count) * 1_000_000 if word_count > 0 else 0 for keyword in top_keywords}

    # Store the result for the file
    result = {
        "video_id": meta["video_id"],
        "publish_date": meta["publish_date"],
        "total_term_count": total_terms,
        "total_word_count": word_count
    }
    result.update(term_counts)
    result.update({f"{keyword}_relative": term_relative_frequencies[keyword] for keyword in top_keywords})
    
    frequency_results.append(result)

# Create a DataFrame from the results
df_results = pd.DataFrame(frequency_results)

# Save the DataFrame to a CSV file
df_results.to_csv(output_csv_path, index=False)

print(f"Results have been saved to {output_csv_path}")

# Calculate and print total frequencies and relative frequencies across all files
print("Total Frequencies Across All Files:")
total_keyword_count = sum(total_frequencies.values())
for keyword, count in total_frequencies.items():
    print(f"  {keyword} Total Count: {count}")
    print(f"  {keyword} Relative Frequency: {(count / total_word_count) * 1_000_000:.2f} instances per million")
print(f"  Total Word Count Across All Files: {total_word_count}")


Results have been saved to data/en_DW/keyword_frequencies.csv
Total Frequencies Across All Files:
  israel Total Count: 19697
  israel Relative Frequency: 12587.33 instances per million
  gaza Total Count: 10317
  gaza Relative Frequency: 6593.06 instances per million
  hamas Total Count: 7001
  hamas Relative Frequency: 4473.98 instances per million
  people Total Count: 5951
  people Relative Frequency: 3802.98 instances per million
  israeli Total Count: 6385
  israeli Relative Frequency: 4080.32 instances per million
  know Total Count: 5901
  know Relative Frequency: 3771.02 instances per million
  war Total Count: 6228
  war Relative Frequency: 3979.99 instances per million
  think Total Count: 3803
  think Relative Frequency: 2430.30 instances per million
  hostages Total Count: 2494
  hostages Relative Frequency: 1593.79 instances per million
  iran Total Count: 3207
  iran Relative Frequency: 2049.43 instances per million
  Total Word Count Across All Files: 1564827
