In [35]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.0 MB 2.1 MB/s eta 0:00:05
   ---- ----------------------------------- 1.3/11.0 MB 2.2 MB/s eta 0:00:05
   ----- ---------------------------------- 1.6/11.0 MB 2.2 MB/s eta 0:00:05
   ------- -------------------------------- 2.1/11.0 MB 2.2 MB/s eta 0:00:05
   --------- ------------------------------ 2.6/11.0 MB 2.2 MB/s eta 0:00

In [37]:
import sklearn
print(sklearn.__version__)

1.5.2


In [42]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Path to the folder containing the .txt files
folder_path = "data/en_DW/treetagger_output/"

# Define custom stop words
custom_stop_words = ['uh', 'um']

# Combine custom stop words with built-in English stop words
stop_words = custom_stop_words + list(ENGLISH_STOP_WORDS)

# Function to extract text from a DataFrame
def extract_text(df):
    text = ' '.join(df.iloc[:, 0].astype(str).tolist())  # Join all rows from the first column
    return text

# Read and concatenate all text data
texts = []
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, delimiter="\t", header=None)
        text = extract_text(df)
        texts.append(text)

# Preprocess the text data with custom stop words and n-gram range (1, 3) for unigrams, bigrams, and trigrams
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000, ngram_range=(1, 3))
X = vectorizer.fit_transform(texts)

# Calculate TF-IDF scores
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = X.sum(axis=0).A1
tfidf_dict = dict(zip(feature_names, tfidf_scores))

# Sort terms by their TF-IDF score
sorted_tfidf = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)

# Print top keywords
print("Top Keywords:")
for term, score in sorted_tfidf[:10]:  # Adjust the number of keywords as needed
    print(f"{term}: {score:.4f}")

# Optional: Print all keywords
print("\nAll Keywords:")
for term, score in sorted_tfidf:
    print(f"{term}: {score:.4f}")

Top Keywords:
israel: 128.1921
gaza: 106.8782
hamas: 80.4601
people: 63.3375
israeli: 61.4710
know: 60.2757
war: 46.1743
think: 43.3713
hostages: 41.3447
iran: 38.9539

All Keywords:
israel: 128.1921
gaza: 106.8782
hamas: 80.4601
people: 63.3375
israeli: 61.4710
know: 60.2757
war: 46.1743
think: 43.3713
hostages: 41.3447
iran: 38.9539
aid: 38.5439
military: 35.9684
just: 32.8771
said: 32.4956
going: 32.0015
ve: 30.8551
dw: 30.4497
humanitarian: 29.9756
palestinian: 29.5751
say: 29.4316
ceasefire: 29.2572
time: 28.6735
rafa: 28.4062
like: 28.2856
netanyahu: 27.6728
government: 27.5881
really: 27.2829
right: 26.9263
course: 26.5857
news: 26.3618
hospital: 25.1457
don: 25.1090
says: 24.9768
situation: 24.5527
need: 24.3396
minister: 24.1522
children: 24.1225
strip: 23.8461
deal: 23.6751
attack: 23.6417
palestinians: 23.4564
want: 23.0249
killed: 23.0115
international: 22.9524
doc: 22.3060
publish_date: 22.2778
thumbnail_path: 22.2778
video_id: 22.2778
video_title: 22.2778
saying: 21.7644
