# ***תרגיל 4   של הפרוייקט***

# ***א-1***

***Load data and basic inspection***




---



In [2]:
# === Cell 1: Load data and basic inspection ===

import pandas as pd

# Read the filtered question/level dataset (Colab path) and keep only the
# first occurrence of each question text, so repeated questions do not get
# extra weight in later steps.
filtered_df = pd.read_csv(
    "/content/train-filtered_question_level.csv"
).drop_duplicates(subset=["question"], keep="first")

# Structural sanity check: column names followed by a short preview.
print("Columns in DataFrame:")
print(filtered_df.columns)

print("\nFirst 5 rows:")
print(filtered_df.head())

# Report the relative frequency of every 'level' label, when that column exists,
# to see how balanced the dataset is.
if "level" not in filtered_df.columns:
    print("\nColumn 'level' not found in DataFrame.")
else:
    print("\nGlobal distribution of 'level':")
    print(filtered_df["level"].value_counts(normalize=True))


Columns in DataFrame:
Index(['question', 'level'], dtype='object')

First 5 rows:
                                            question   level
0  Which magazine was started first Arthur's Maga...  medium
1  The Oberoi family is part of a hotel company t...  medium
2  Musician and satirist Allie Goertz wrote a son...    hard
3    What nationality was James Henry Miller's wife?  medium
4  Cadmium Chloride is slightly soluble in this c...  medium

Global distribution of 'level':
level
medium    0.628149
easy      0.198688
hard      0.173162
Name: proportion, dtype: float64


***חלוקה מאוזנת ל־train / validation / test (עם stratify)***

In [3]:
# # Define split proportions
# TEST_SIZE = 0.15      # 15% of total data for test
# VAL_SIZE = 0.15       # 15% of total data for validation
# RANDOM_STATE = 42     # For reproducibility

# # Compute validation size relative to the remaining data after test split
# val_size_relative = VAL_SIZE / (1 - TEST_SIZE)  # e.g., 0.15 / 0.85

# print("Relative validation size (from train_val):", val_size_relative)

# # NOTE: before uncommenting, add `from sklearn.model_selection import train_test_split`
# # Step 1: Split into train_val and test with stratification on 'level'
# train_val_df, test_df = train_test_split(
#     filtered_df,
#     test_size=TEST_SIZE,
#     stratify=filtered_df["level"],
#     random_state=RANDOM_STATE
# )

# # Step 2: Split train_val into train and validation with stratification on 'level'
# train_df, val_df = train_test_split(
#     train_val_df,
#     test_size=val_size_relative,
#     stratify=train_val_df["level"],
#     random_state=RANDOM_STATE
# )

# print("Finished stratified split into train / validation / test.")


***בדיקה שהחלוקה מאוזנת (stratified) ושיש לנו את היחסים הרצויים***

In [4]:
# def print_split_info(df, name):
#     print(f"\n{name}:")
#     print("Number of rows:", len(df))
#     print("Label distribution for 'level':")
#     print(df["level"].value_counts(normalize=True))

# print("Total rows in original filtered_df:", len(filtered_df))

# print_split_info(train_df, "Train set")
# print_split_info(val_df, "Validation set")
# print_split_info(test_df, "Test set")


***Text preprocessing (tokenization + lemmatization)***

In [5]:
# === Cell 2 (HARD FIX): NLTK setup and robust preprocessing ===

import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# NLTK resources requested by this notebook (run once per runtime).
# They are downloaded one by one, in the order listed.
for _resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger_eng",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger",
    "stopwords",
):
    nltk.download(_resource)

# Shared lemmatizer instance and the English stopword list as a set
lemmatizer = WordNetLemmatizer()
eng_stops = set(stopwords.words("english"))

# Every surface form of the verb "to be"; all of them are mapped to the
# single token "be" during preprocessing
BE_FORMS = {
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
}


def get_wordnet_pos(tag: str):
    """
    Translate a tag produced by nltk.pos_tag into a WordNet POS constant.

    Only the first character of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    The result is passed to the lemmatizer as its POS argument.
    """
    first_letter_to_pos = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)


# Cleaning patterns, compiled once so they can be reused for every question.

# Matches http(s) URLs, www.* addresses, e-mail-like tokens, @handles and #hashtags
_URL_EMAIL_HANDLE_PATTERN = r"(https?://\S+|www\.\S+|\S+@\S+|[@#]\w+)"
url_email_handle_re = re.compile(_URL_EMAIL_HANDLE_PATTERN, flags=re.IGNORECASE)

# Matches a single digit anywhere inside a token
digits_re = re.compile(r"\d")

# Matches each run of characters other than lowercase a-z and the space;
# applied to NON-numeric tokens only
non_letter_re = re.compile(r"[^a-z ]+")


def process_text_value(text: str) -> str:
    """
    Normalize one question into a string of space-separated tokens.

    Steps, in order:
    1. Non-string input (e.g. a missing value) yields "".
    2. URLs, e-mails, @handles and #hashtags are replaced by a space.
    3. The text is tokenized and POS-tagged.
    4. Any form of "to be" becomes "be"; every other token is lemmatized
       with the WordNet POS derived from its tag.
    5. All tokens are lowercased.
    6. A token containing at least one digit becomes "_number" as a whole.
    7. Other tokens keep only [a-z]; anything else acts as a separator, so
       "word-word" yields "word" and "word". Empty leftovers are dropped.
    8. Any token containing the substring "number" becomes "_number".

    Stopword removal is available but disabled (see the comment near the end).
    """
    # Missing or non-string values produce an empty document
    if not isinstance(text, str):
        return ""

    without_links = url_email_handle_re.sub(" ", text)

    # Tagging runs on the original-case tokens
    tagged_tokens = pos_tag(word_tokenize(without_links))

    # NOTE(review): the lemmatizer receives the original-case token and the
    # result is lowercased afterwards; confirm that capitalized words are
    # lemmatized as intended.
    lowered = [
        "be"
        if token.lower() in BE_FORMS
        else lemmatizer.lemmatize(token, get_wordnet_pos(tag)).lower()
        for token, tag in tagged_tokens
    ]

    pieces = []
    for word in lowered:
        if digits_re.search(word):
            # Any digit inside the token replaces the whole token
            pieces.append("_number")
        else:
            # Non-letters become spaces; split() drops the empty leftovers
            pieces.extend(non_letter_re.sub(" ", word).split())

    # Collapse anything that contains 'number' into the single '_number' token
    result = ["_number" if "number" in piece else piece for piece in pieces]

    # To remove stopwords, uncomment the next line
    # result = [piece for piece in result if piece not in eng_stops]

    return " ".join(result)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


***Apply preprocessing to all questions***

In [6]:
# === Cell 3: Apply preprocessing to all questions ===

# Fail early with a clear message if the text column is missing
if "question" not in filtered_df.columns:
    raise KeyError("The DataFrame does not contain a 'question' column.")

# Run the preprocessing function on every question and store the normalized
# text in a new column named 'question_clean'
filtered_df["question_clean"] = filtered_df["question"].apply(process_text_value)

# Show up to 5 original/cleaned pairs for a visual check.
# head(5) is used instead of indexing positions 0..4 so that a DataFrame
# with fewer than 5 rows prints what it has rather than raising IndexError.
print("Original vs. cleaned examples:\n")
preview = filtered_df[["question", "question_clean"]].head(5)
for example_no, (original, cleaned) in enumerate(
    zip(preview["question"], preview["question_clean"]), start=1
):
    print(f"--- Example {example_no} ---")
    print("Original :", original)
    print("Cleaned  :", cleaned)
    print()


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


***TF-IDF vectorization of the preprocessed questions***

In [None]:
# === Cell 4: TF-IDF vectorization for ALL questions (no train/val/test split) ===

from sklearn.feature_extraction.text import TfidfVectorizer

# Safety check: the preprocessing cell writes its output to 'question_clean',
# so that is the column that must exist here (the previous check looked for
# 'clean_text', which no cell creates, and therefore always raised).
if "question_clean" not in filtered_df.columns:
    raise KeyError("The DataFrame does not contain a 'question_clean' column. "
                   "Run the preprocessing cell first.")

# Define a TF-IDF vectorizer
# max_features caps the vocabulary size
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,   # limit vocabulary size (you can tune this later)
    ngram_range=(1, 1),   # unigrams only
)

# Fit TF-IDF on the entire cleaned corpus and transform it to a sparse matrix
# Each row = one question, each column = one term from the vocabulary
X_tfidf = tfidf_vectorizer.fit_transform(filtered_df["question_clean"])

print("TF-IDF matrix shape (n_samples, n_features):", X_tfidf.shape)
print("(Num of documents, max_features)")

# Optional: extract labels if you need them later for supervised models / evaluation
if "level" in filtered_df.columns:
    y = filtered_df["level"].values
    print("Labels vector shape:", y.shape)
else:
    y = None
    print("No 'level' column found. y is set to None.")

# Show a small sample of feature names for sanity check.
# The slice length and the printed label share one variable so they cannot
# drift apart (the label used to say 30 while 60 items were printed).
feature_names = tfidf_vectorizer.get_feature_names_out()
n_preview = 30
print("\nVocabulary size (len(feature_names)):", len(feature_names))
print(f"First {n_preview} features:\n", feature_names[:n_preview])


***Run K-Means for different K values***

In [None]:
# === Cell 5: Run K-Means for several K values ===

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Cluster counts to evaluate
k_values = [2, 3, 7, 15]
# k_values = [2]

# One entry per K, appended in the same order as k_values
inertia_scores, silhouette_scores = [], []

# Settings shared by every run; only n_clusters changes between runs
kmeans_settings = dict(
    init="k-means++",   # smart centroid initialization
    max_iter=300,
    random_state=42,
    n_init=10,
)

print("Running K-Means on TF-IDF matrix... (may take a bit)")

for k in k_values:
    print(f"\n--- K = {k} ---")

    # Fit on the full TF-IDF matrix
    kmeans = KMeans(n_clusters=k, **kmeans_settings)
    kmeans.fit(X_tfidf)

    # Inertia, recorded for the elbow method
    inertia = kmeans.inertia_
    inertia_scores.append(inertia)

    # Silhouette score over all samples (needs at least 2 clusters)
    sil_score = silhouette_score(X_tfidf, kmeans.labels_, metric='euclidean')
    silhouette_scores.append(sil_score)

    print(f"Inertia: {inertia}")
    print(f"Silhouette Score: {sil_score}")


***Dimensionality reduction for clustering***

In [None]:
# # === Cell 5: Dimensionality reduction for clustering (TruncatedSVD) ===

# from sklearn.decomposition import TruncatedSVD

# # We reduce dimensionality because TF-IDF has many features and is sparse.
# # TruncatedSVD is PCA-like but works directly on sparse matrices.
# svd = TruncatedSVD(
#     n_components=50,   # number of latent dimensions (you can tune this)
#     random_state=42
# )

# # Fit SVD on the TF-IDF matrix and transform it to a dense lower-dimensional space
# X_svd = svd.fit_transform(X_tfidf)

# print("Original TF-IDF shape :", X_tfidf.shape)
# print("Reduced SVD shape     :", X_svd.shape)

# # Sum of explained variance ratio gives an idea how much information we kept
# explained = svd.explained_variance_ratio_.sum()
# print(f"Total explained variance (approx): {explained:.3f}")


***DBSCAN clustering on reduced space and comparison***

In [None]:
# # === Cell 6: DBSCAN clustering on SVD-reduced data ===

# from sklearn.cluster import DBSCAN
# from sklearn.metrics import silhouette_score
# import numpy as np

# # Define DBSCAN hyperparameters
# # eps controls neighborhood radius; min_samples controls how many neighbors define a "dense" region
# dbscan = DBSCAN(
#     eps=1.0,          # you can tune this (e.g., 0.5, 0.7, 1.2, ...)
#     min_samples=5,    # minimum number of points to form a dense region
#     metric="euclidean",
#     n_jobs=-1         # use all available cores for distance computations
# )

# print("Fitting DBSCAN on SVD-reduced data (this may take some time)...")
# dbscan_labels = dbscan.fit_predict(X_svd)

# # Count how many points fell into each cluster (including noise = -1)
# unique_labels, counts = np.unique(dbscan_labels, return_counts=True)
# label_counts = dict(zip(unique_labels, counts))

# print("\nCluster label counts (including noise label = -1):")
# print(label_counts)

# # Filter out noise points (-1) before computing Silhouette score
# mask = dbscan_labels != -1
# num_clusters = len(set(dbscan_labels[mask]))

# if num_clusters < 2:
#     # Silhouette score is not defined if there is fewer than 2 clusters
#     print("\nDBSCAN found fewer than 2 clusters (after removing noise).")
#     print("Silhouette score is not defined in this case.")
# else:
#     # Silhouette score on the non-noise points only
#     dbscan_sil = silhouette_score(X_svd[mask], dbscan_labels[mask])
#     print(f"\nDBSCAN Silhouette Score (on non-noise points): {dbscan_sil:.4f}")
#     print("\nYou can compare this value to the Silhouette scores you got from K-Means.")


# ***==========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================***