In [35]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.0 MB 2.1 MB/s eta 0:00:05
   ---- ----------------------------------- 1.3/11.0 MB 2.2 MB/s eta 0:00:05
   ----- ---------------------------------- 1.6/11.0 MB 2.2 MB/s eta 0:00:05
   ------- -------------------------------- 2.1/11.0 MB 2.2 MB/s eta 0:00:05
   --------- ------------------------------ 2.6/11.0 MB 2.2 MB/s eta 0:00

In [37]:
import sklearn
print(sklearn.__version__)

1.5.2


In [1]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Path to the folder containing the .txt files
folder_path = "data/en_DW/treetagger_output/"

# Define custom stop words
custom_stop_words = ['uh', 'um']

# Combine custom stop words with built-in English stop words
stop_words = custom_stop_words + list(ENGLISH_STOP_WORDS)

# Function to extract text from a DataFrame
def extract_text(df):
    text = ' '.join(df.iloc[:, 0].astype(str).tolist())  # Join all rows from the first column
    return text

# Read and concatenate all text data
texts = []
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, delimiter="\t", header=None)
        text = extract_text(df)
        texts.append(text)

# Preprocess the text data with custom stop words and n-gram range (1, 3) for unigrams, bigrams, and trigrams
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000, ngram_range=(1, 3))
X = vectorizer.fit_transform(texts)

# Calculate TF-IDF scores
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = X.sum(axis=0).A1
tfidf_dict = dict(zip(feature_names, tfidf_scores))

# Sort terms by their TF-IDF score
sorted_tfidf = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)

# Print top keywords
print("Top Keywords:")
for term, score in sorted_tfidf[:10]:  # Adjust the number of keywords as needed
    print(f"{term}: {score:.4f}")

# Optional: Print all keywords
print("\nAll Keywords:")
for term, score in sorted_tfidf:
    print(f"{term}: {score:.4f}")

Top Keywords:
israel: 126.5057
gaza: 105.0950
hamas: 79.3757
people: 62.6036
israeli: 60.4792
know: 59.7096
war: 45.5630
think: 43.0843
hostages: 40.6308
iran: 38.9175

All Keywords:
israel: 126.5057
gaza: 105.0950
hamas: 79.3757
people: 62.6036
israeli: 60.4792
know: 59.7096
war: 45.5630
think: 43.0843
hostages: 40.6308
iran: 38.9175
aid: 38.0763
military: 35.5769
just: 32.5654
said: 32.1039
going: 31.6783
ve: 30.4898
dw: 30.1161
humanitarian: 29.4090
say: 29.0865
palestinian: 28.9027
ceasefire: 28.7509
time: 28.3252
like: 28.0312
rafa: 27.9593
government: 27.1742
really: 27.0029
netanyahu: 26.9811
right: 26.5928
course: 26.2629
news: 26.1073
hospital: 24.8454
don: 24.8251
says: 24.5296
situation: 24.1946
need: 24.0571
children: 23.8637
minister: 23.6160
attack: 23.5518
strip: 23.4207
deal: 23.2590
palestinians: 23.0198
want: 22.7281
killed: 22.7218
dw news: 22.5059
international: 22.4812
doc: 22.0819
publish_date: 22.0500
thumbnail_path: 22.0500
video_id: 22.0500
video_title: 22.0500