# ***תרגיל 4   של הפרוייקט***

# ***חלק א***

***Cell 1: Load data and basic inspection***




---



In [2]:
import pandas as pd

# Load the filtered dataset from disk (Colab path)
filtered_df = pd.read_csv("/content/train-filtered_question_level.csv")

# Remove duplicate questions to avoid biasing the model with repeated texts
filtered_df = filtered_df.drop_duplicates(subset=["question"], keep="first")

# Sanity check: show columns and first rows to verify the structure
print("Columns in DataFrame:")
print(filtered_df.columns)

print("\nFirst 5 rows:")
print(filtered_df.head())

# Show global label distribution for 'level' (if exists), to understand dataset balance
if "level" in filtered_df.columns:
    print("\nGlobal distribution of 'level':")
    print(filtered_df["level"].value_counts(normalize=True))
else:
    print("\nColumn 'level' not found in DataFrame.")


Columns in DataFrame:
Index(['question', 'level'], dtype='object')

First 5 rows:
                                            question   level
0  Which magazine was started first Arthur's Maga...  medium
1  The Oberoi family is part of a hotel company t...  medium
2  Musician and satirist Allie Goertz wrote a son...    hard
3    What nationality was James Henry Miller's wife?  medium
4  Cadmium Chloride is slightly soluble in this c...  medium

Global distribution of 'level':
level
medium    0.628149
easy      0.198688
hard      0.173162
Name: proportion, dtype: float64


***Cell 2: Text preprocessing (tokenization + lemmatization)***

In [3]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# Download required NLTK resources (run once per runtime)
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")

lemmatizer = WordNetLemmatizer()
eng_stops = set(stopwords.words("english"))

# Normalize all forms of the verb "to be" into a single token "be"
BE_FORMS = {"am", "is", "are", "was", "were", "be", "been", "being"}


def get_wordnet_pos(tag: str):
    """
    Map POS tag from nltk.pos_tag to a WordNet POS tag.
    This helps the lemmatizer pick the correct base form.
    """
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("N"):
        return wordnet.NOUN
    if tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN


# Regex patterns for cleaning
# Remove URLs, emails, @handles, #hashtags
url_email_handle_re = re.compile(r"(https?://\S+|www\.\S+|\S+@\S+|[@#]\w+)", re.IGNORECASE)

# Detect any digit inside a token
digits_re = re.compile(r"\d")

# For NON-numeric tokens: remove everything except [a-z] and spaces
non_letter_re = re.compile(r"[^a-z ]+")


def process_text_value(text: str) -> str:
    """
    Full preprocessing for a single text value:
    - Remove URLs, emails, and @handles/#hashtags
    - Tokenize
    - POS tagging
    - Lemmatization with POS
    - Normalize all 'be' verb forms to 'be'
    - Any token that contains at least one digit -> '_number' (entire token)
    - For other tokens: strip punctuation/non-letters, keep only [a-z] and spaces
    - Finally, any token that still contains the substring 'number' is collapsed to '_number'
    - (Optional) Remove stopwords [currently commented out]
    - Lowercase
    Returns a cleaned string with space-separated tokens.
    """
    # Safely handle missing or non-string values
    if not isinstance(text, str):
        return ""

    # Remove URLs, emails, handles, hashtags
    t = url_email_handle_re.sub(" ", text)

    # Tokenize and POS-tag on original (cleaned) text
    tokens = word_tokenize(t)
    tagged = pos_tag(tokens)

    lemmas = []
    for tok, pos in tagged:
        # Normalize 'be' forms early to reduce sparsity
        if tok.lower() in BE_FORMS:
            lemmas.append("be")
            continue

        # Map POS tag to WordNet POS tag and lemmatize
        wn_pos = get_wordnet_pos(pos)
        lemma = lemmatizer.lemmatize(tok, wn_pos)
        lemmas.append(lemma)

    # Lowercase all tokens
    lemmas = [w.lower() for w in lemmas]

    intermediate = []
    for w in lemmas:
        # If the token contains ANY digit, replace the entire token with '_number'
        if digits_re.search(w):
            intermediate.append("_number")
            continue

        # For non-numeric tokens: remove punctuation and non-letters
        w2 = non_letter_re.sub(" ", w).strip()
        if not w2:
            # Skip tokens that became empty after cleaning
            continue

        # If cleaning produced multiple parts (e.g. "word-word" -> "word word")
        for part in w2.split():
            if not part:
                continue
            intermediate.append(part)

    # Final pass: collapse any token that still contains 'number' into '_number'
    # This guarantees we do not get '_numbera', '_numberkm', etc.
    clean_lemmas = []
    for w in intermediate:
        if "number" in w:
            clean_lemmas.append("_number")
        else:
            clean_lemmas.append(w)

    # If you want to remove stopwords, uncomment the next line
    # clean_lemmas = [w for w in clean_lemmas if w not in eng_stops]

    # Join tokens back into a single cleaned string
    return " ".join(clean_lemmas)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Cell 3: Apply preprocessing to all questions***

In [4]:
# Ensure the 'question' column exists before applying preprocessing
if "question" not in filtered_df.columns:
    raise KeyError("The DataFrame does not contain a 'question' column.")

# Apply the preprocessing function to every question in the dataset
# This creates a new column 'question_clean' that contains the normalized text
filtered_df["clean_text"] = filtered_df["question"].apply(process_text_value)

# Inspect a few examples to verify that preprocessing works as expected
print("Original vs. cleaned examples:\n")
for i in range(5):
    print(f"--- Example {i+1} ---")
    print("Original :", filtered_df.loc[filtered_df.index[i], "question"])
    print("Cleaned  :", filtered_df.loc[filtered_df.index[i], "clean_text"])
    print()


Original vs. cleaned examples:

--- Example 1 ---
Original : Which magazine was started first Arthur's Magazine or First for Women?
Cleaned  : which magazine be start first arthur s magazine or first for women

--- Example 2 ---
Original : The Oberoi family is part of a hotel company that has a head office in what city?
Cleaned  : the oberoi family be part of a hotel company that have a head office in what city

--- Example 3 ---
Original : Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
Cleaned  : musician and satirist allie goertz write a song about the the simpsons character milhouse who matt groening name after who

--- Example 4 ---
Original :  What nationality was James Henry Miller's wife?
Cleaned  : what nationality be james henry miller s wife

--- Example 5 ---
Original : Cadmium Chloride is slightly soluble in this chemical, it is also called what?
Cleaned  : cadmium chloride be slightly soluble 

***Cell 4: TF-IDF vectorization of the preprocessed questions***

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Safety check: make sure 'clean_text' exists
if "clean_text" not in filtered_df.columns:
    raise KeyError("The DataFrame does not contain a 'clean_text' column. "
                   "Run the preprocessing cell first.")

# Define a TF-IDF vectorizer
# max_features limits vocabulary size to the most frequent terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,   # limit vocabulary size (you can tune this later)
    ngram_range=(1, 1),   # unigrams only
)

# Fit TF-IDF on the entire cleaned corpus and transform it to a sparse matrix
# Each row = one question, each column = one term from the vocabulary
X_tfidf = tfidf_vectorizer.fit_transform(filtered_df["clean_text"])

print("TF-IDF matrix shape (n_samples, n_features):", X_tfidf.shape)
print("(Num of documents, max_features)")

# Optional: extract labels if you need them later for supervised models / evaluation
if "level" in filtered_df.columns:
    y = filtered_df["level"].values
    print("Labels vector shape:", y.shape)
else:
    y = None
    print("No 'level' column found. y is set to None.")

# Show a small sample of feature names for sanity check
feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nVocabulary size (len(feature_names)):", len(feature_names))
print("First 30 features:\n", feature_names[:60])


TF-IDF matrix shape (n_samples, n_features): (90418, 10000)
(Num of documents, max_features)
Labels vector shape: (90418,)

Vocabulary size (len(feature_names)): 10000
First 30 features:
 ['_number' 'aaa' 'aaron' 'ab' 'abandon' 'abashidze' 'abba' 'abbey' 'abbot'
 'abbott' 'abbreviate' 'abbreviation' 'abc' 'abdication' 'abduct' 'abdul'
 'abe' 'abel' 'aberdeen' 'abigail' 'ability' 'able' 'aboard'
 'abolitionist' 'aboriginal' 'about' 'above' 'abraham' 'abrams' 'absent'
 'absorb' 'abstract' 'abu' 'abuse' 'ac' 'academic' 'academy' 'accept'
 'access' 'accessible' 'accessory' 'accident' 'acclaim' 'acclaimed'
 'accompany' 'accomplished' 'accomplishment' 'accord' 'according'
 'account' 'accra' 'accredit' 'accuse' 'ace' 'achieve' 'achievement'
 'acid' 'acknowledge' 'acorn' 'acoustic']


***Cell 5: Run K-Means for different K values***

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Choose several K values
k_values = [2, 3, 7, 15]
# k_values = [2]
inertia_scores = []
silhouette_scores = []

print("Running K-Means on TF-IDF matrix... (may take a bit)")

for k in k_values:
    print(f"\n--- K = {k} ---")

    # KMeans (using smart initialization k-means++)
    kmeans = KMeans(
        n_clusters=k,
        init="k-means++",
        max_iter=300,
        random_state=42,
        n_init=10
    )

    # Fit on full TF-IDF matrix
    kmeans.fit(X_tfidf)

    # Inertia (Elbow)
    inertia = kmeans.inertia_
    inertia_scores.append(inertia)

    # Silhouette score (requires >1 cluster)
    sil_score = silhouette_score(X_tfidf, kmeans.labels_, metric='euclidean')
    silhouette_scores.append(sil_score)

    print(f"Inertia: {inertia}")
    print(f"Silhouette Score: {sil_score}")


Running K-Means on TF-IDF matrix... (may take a bit)

--- K = 2 ---
Inertia: 86997.84242688587
Silhouette Score: 0.006657060134126678

--- K = 3 ---
Inertia: 86574.06021907461
Silhouette Score: 0.004180111159384997

--- K = 7 ---
Inertia: 85277.72843629566
Silhouette Score: 0.006383348812444887

--- K = 15 ---
Inertia: 83893.26229810694
Silhouette Score: 0.009616516930486471


**Cell 6: Dimensionality reduction for clustering (TruncatedSVD)**

In [6]:
from sklearn.decomposition import TruncatedSVD

# We reduce dimensionality because TF-IDF has many features and is sparse.
# TruncatedSVD is PCA-like but works directly on sparse matrices.
svd = TruncatedSVD(
    n_components=50,   # number of latent dimensions (you can tune this)
    random_state=42
)

# Fit SVD on the TF-IDF matrix and transform it to a dense lower-dimensional space
X_svd = svd.fit_transform(X_tfidf)

print("Original TF-IDF shape :", X_tfidf.shape)
print("Reduced SVD shape     :", X_svd.shape)

# Sum of explained variance ratio gives an idea how much information we kept
explained = svd.explained_variance_ratio_.sum()
print(f"Total explained variance (approx): {explained:.3f}")


Original TF-IDF shape : (90418, 10000)
Reduced SVD shape     : (90418, 50)
Total explained variance (approx): 0.156


***Cell 7: DBSCAN clustering on reduced space and comparison***

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd

# הגדרת גודל המדגם
SAMPLE_SIZE = 20000

print(f"Original dataset size: {X_svd.shape[0]}")

# 1. דגימה (אם צריך)
if X_svd.shape[0] > SAMPLE_SIZE:
    print(f"Dataset is too large. Sampling {SAMPLE_SIZE} random points...")
    np.random.seed(42)
    indices = np.random.choice(X_svd.shape[0], SAMPLE_SIZE, replace=False)
    X_subset = X_svd[indices]
    df_subset = filtered_df.iloc[indices].copy()
else:
    X_subset = X_svd
    df_subset = filtered_df.copy()

print(f"Running DBSCAN on {X_subset.shape[0]} samples...")

# 2. הרצת DBSCAN
# הערה: אם את מקבלת רק אשכול אחד, כדאי לנסות לשנות את eps (למשל ל-0.3 או 0.7)
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean', n_jobs=-1)
dbscan_labels = dbscan.fit_predict(X_subset)
print("DBSCAN finished!")

# 3. שמירת תוצאות
df_subset['dbscan_cluster'] = dbscan_labels

# 4. ניתוח בסיסי
unique_labels, counts = np.unique(dbscan_labels, return_counts=True)
print("\nCluster counts (Label -1 represents 'Noise'):")
print(dict(zip(unique_labels, counts)))

# 5. חישוב Silhouette Score (בטוח)
# בדיקה כמה אשכולות יש שאינם רעש
n_clusters_real = len(set(dbscan_labels) - {-1})

if n_clusters_real >= 2:
    non_noise_mask = dbscan_labels != -1
    sil = silhouette_score(X_subset[non_noise_mask], dbscan_labels[non_noise_mask])
    print(f"\nDBSCAN Silhouette Score (excluding noise): {sil:.4f}")
else:
    print(f"\nCould not calculate Silhouette Score: Found {n_clusters_real} real clusters.")
    print("Silhouette Score requires at least 2 distinct clusters.")
    print("Try adjusting 'eps' (lower it to split clusters) or 'min_samples'.")

Step 1: Reducing dimensions with SVD...
Reduced shape: (90418, 50)

Step 2: Running DBSCAN...


***Cell 8: Analyze DBSCAN Clusters (Top Keywords & Sample Questions)***

In [None]:
# === Cell 8: Semantic Analysis for DBSCAN Sample ===

import pandas as pd
import numpy as np

print("Analyzing DBSCAN Clusters (on Sample)...")

# 1. וידוא שמשתני המדגם קיימים
if 'df_subset' not in locals() or 'indices' not in locals():
    # אם לא נעשתה דגימה (כי הדאטה היה קטן), נשתמש במקוריים
    df_subset = filtered_df
    indices = np.arange(X_tfidf.shape[0])

# 2. הוספת התוויות לדאטה-פריים של המדגם (ולא למקורי!)
df_subset['dbscan_cluster'] = dbscan_labels

# 3. יצירת מטריצת TF-IDF שמתאימה רק לשורות שבמדגם
# זה קריטי כדי שנוכל למצוא את המילים הנכונות
X_tfidf_subset = X_tfidf[indices]

unique_labels = np.unique(dbscan_labels)
feature_names = tfidf_vectorizer.get_feature_names_out()

# פונקציה מותאמת למדגם
def get_top_keywords_for_subset(cluster_mask, tfidf_sub, features, n_top=10):
    # חישוב ממוצע ה-TF-IDF עבור כל המילים באשכול הזה
    # cluster_mask הוא בוליאני (True/False) ביחס למדגם
    cluster_mean = np.array(tfidf_sub[cluster_mask].mean(axis=0)).flatten()
    sorted_indices = cluster_mean.argsort()[::-1][:n_top]
    return [features[i] for i in sorted_indices]

# מעבר על כל אשכול והצגת הנתונים
for label in unique_labels:
    print(f"\n{'='*40}")

    # טיפול באשכול רעש (-1)
    if label == -1:
        print(f"Cluster {label} (NOISE / OUTLIERS)")
        print(f"{'='*40}")
        n_noise = np.sum(dbscan_labels == -1)
        print(f"Contains {n_noise} documents considered as noise.")

        # דוגמאות לרעש
        print("\nSample Noise Questions:")
        sample = df_subset[df_subset['dbscan_cluster'] == label]['question'].sample(min(5, n_noise), random_state=42).values
        for i, q in enumerate(sample):
            print(f"  {i+1}. {q}")
        continue

    # טיפול באשכולות אמיתיים
    print(f"Cluster {label}")
    print(f"{'='*40}")

    # יצירת מסכה לאשכול הנוכחי
    cluster_mask = (df_subset['dbscan_cluster'] == label).values
    cluster_size = np.sum(cluster_mask)
    print(f"Size: {cluster_size} documents")

    # שליפת המילים המובילות (מתוך המטריצה החתוכה)
    top_words = get_top_keywords_for_subset(cluster_mask, X_tfidf_subset, feature_names, n_top=15)
    print(f"Top Keywords: {', '.join(top_words)}")

    # שליפת דוגמאות למשפטים (מתוך ה-subset)
    sample_size = min(5, cluster_size)
    sample_questions = df_subset[df_subset['dbscan_cluster'] == label]['question'].sample(sample_size, random_state=42).values

    print("\nSample Questions:")
    for i, q in enumerate(sample_questions):
        print(f"  {i+1}. {q}")