# ***תרגיל 4   של הפרוייקט***

# ***א-1***

***Load data and basic inspection***




---



In [2]:
# === Cell 1: Load data and basic inspection ===

import pandas as pd

# Read the filtered question/level dataset from the Colab filesystem and keep
# only the first occurrence of every question text, so repeated questions do
# not bias later modelling steps.
filtered_df = pd.read_csv("/content/train-filtered_question_level.csv").drop_duplicates(
    subset=["question"], keep="first"
)

# Structural sanity check: column names followed by a preview of the rows.
print("Columns in DataFrame:")
print(filtered_df.columns)

print("\nFirst 5 rows:")
print(filtered_df.head())

# Report the relative frequency of every 'level' label when the column is present.
if "level" not in filtered_df.columns:
    print("\nColumn 'level' not found in DataFrame.")
else:
    print("\nGlobal distribution of 'level':")
    print(filtered_df["level"].value_counts(normalize=True))


Columns in DataFrame:
Index(['question', 'level'], dtype='object')

First 5 rows:
                                            question   level
0  Which magazine was started first Arthur's Maga...  medium
1  The Oberoi family is part of a hotel company t...  medium
2  Musician and satirist Allie Goertz wrote a son...    hard
3    What nationality was James Henry Miller's wife?  medium
4  Cadmium Chloride is slightly soluble in this c...  medium

Global distribution of 'level':
level
medium    0.628149
easy      0.198688
hard      0.173162
Name: proportion, dtype: float64


***חלוקה מאוזנת ל־train / validation / test (עם stratify)***

In [3]:
# # Define split proportions
# TEST_SIZE = 0.15      # 15% of total data for test
# VAL_SIZE = 0.15       # 15% of total data for validation
# RANDOM_STATE = 42     # For reproducibility

# # Compute validation size relative to the remaining data after test split
# val_size_relative = VAL_SIZE / (1 - TEST_SIZE)  # e.g., 0.15 / 0.85

# print("Relative validation size (from train_val):", val_size_relative)

# # Step 1: Split into train_val and test with stratification on 'level'
# train_val_df, test_df = train_test_split(
#     filtered_df,
#     test_size=TEST_SIZE,
#     stratify=filtered_df["level"],
#     random_state=RANDOM_STATE
# )

# # Step 2: Split train_val into train and validation with stratification on 'level'
# train_df, val_df = train_test_split(
#     train_val_df,
#     test_size=val_size_relative,
#     stratify=train_val_df["level"],
#     random_state=RANDOM_STATE
# )

# print("Finished stratified split into train / validation / test.")


***בדיקה שהחלוקה מאוזנת (stratified) ושיש לנו את היחסים הרצויים***

In [4]:
# def print_split_info(df, name):
#     print(f"\n{name}:")
#     print("Number of rows:", len(df))
#     print("Label distribution for 'level':")
#     print(df["level"].value_counts(normalize=True))

# print("Total rows in original filtered_df:", len(filtered_df))

# print_split_info(train_df, "Train set")
# print_split_info(val_df, "Validation set")
# print_split_info(test_df, "Test set")


***Text preprocessing (tokenization + lemmatization)***

In [5]:
# === Cell 2 (HARD FIX): NLTK setup and robust preprocessing ===

import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# Download required NLTK resources (run once per runtime).
# The tagger is requested under both its legacy name and its "_eng" name so the
# cell works across NLTK versions; the Punkt tokenizer data is requested the
# same way ("punkt" and "punkt_tab"), because newer NLTK releases load the
# tokenizer model used by word_tokenize from "punkt_tab".
for _resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger_eng",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger",
    "stopwords",
):
    nltk.download(_resource)

# Shared lemmatizer instance and English stopword set used by the preprocessing code.
lemmatizer = WordNetLemmatizer()
eng_stops = set(stopwords.words("english"))

# Normalize all forms of the verb "to be" into a single token "be"
BE_FORMS = {"am", "is", "are", "was", "were", "be", "been", "being"}


def get_wordnet_pos(tag: str):
    """
    Translate a POS tag (as produced by nltk.pos_tag) into a WordNet POS constant.

    Only the first character of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Every other tag, including an empty string, falls back to noun.
    """
    prefix_to_pos = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)


# Compiled regex patterns used by the text-cleaning step.

# Matches every run of characters that are NOT lowercase ASCII letters or
# spaces; applied to digit-free tokens to strip punctuation and symbols.
non_letter_re = re.compile(r"[^a-z ]+")

# Matches a single digit anywhere in a token (used to spot numeric tokens).
digits_re = re.compile(r"\d")

# Matches URLs (http/https or www.), e-mail-like strings, @handles and #hashtags.
url_email_handle_re = re.compile(
    r"(https?://\S+|www\.\S+|\S+@\S+|[@#]\w+)",
    flags=re.IGNORECASE,
)


def process_text_value(text: str) -> str:
    """
    Normalize a single text value into a space-separated string of tokens.

    Steps, in the order they are applied:
    1. Non-string input (e.g. None / NaN) yields an empty string.
    2. URLs, e-mails, @handles and #hashtags are replaced by a space.
    3. The text is tokenized and POS-tagged on its original casing.
    4. Every token is lowercased, then:
       - any form of "to be" becomes the single token 'be';
       - proper nouns (tags NNP / NNPS) are kept as-is, so names are not
         mangled by the lemmatizer;
       - all other tokens are lemmatized with their WordNet POS.
       Lowercasing happens BEFORE lemmatization because the WordNet
       lemmatizer is case-sensitive and would otherwise leave capitalized
       words (e.g. a sentence-initial plural) unlemmatized.
    5. A token containing at least one digit becomes '_number' (whole token).
    6. Other tokens keep only [a-z]; a token split by removed characters
       (e.g. "word-word") contributes each remaining part separately.
    7. Any token that contains the substring 'number' is collapsed to
       '_number' (this includes the plain word "number").
    8. Stopword removal is available but currently disabled.

    Returns the cleaned tokens joined by single spaces.
    """
    # Safely handle missing or non-string values
    if not isinstance(text, str):
        return ""

    # Remove URLs, emails, handles, hashtags, then tokenize and POS-tag
    stripped = url_email_handle_re.sub(" ", text)
    tagged = pos_tag(word_tokenize(stripped))

    lemmas = []
    for tok, pos in tagged:
        lowered = tok.lower()
        if lowered in BE_FORMS:
            # Normalize 'be' forms early to reduce sparsity
            lemmas.append("be")
        elif pos in ("NNP", "NNPS"):
            # Proper nouns: keep the surface form (lowercased), do not lemmatize
            lemmas.append(lowered)
        else:
            # Lemmatize the lowercased token so capitalized words are handled too
            lemmas.append(lemmatizer.lemmatize(lowered, get_wordnet_pos(pos)))

    intermediate = []
    for w in lemmas:
        if digits_re.search(w):
            # Any digit anywhere -> the entire token becomes '_number'
            intermediate.append("_number")
            continue
        # Strip non-letters; split() drops empties and yields every remaining part
        intermediate.extend(non_letter_re.sub(" ", w).split())

    # Final pass: collapse any token that contains 'number' into '_number'
    clean_lemmas = ["_number" if "number" in w else w for w in intermediate]

    # If you want to remove stopwords, uncomment the next line
    # clean_lemmas = [w for w in clean_lemmas if w not in eng_stops]

    # Join tokens back into a single cleaned string
    return " ".join(clean_lemmas)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Apply preprocessing to all questions***

In [None]:
# === Cell 3: Apply preprocessing to all questions ===

# Ensure the 'question' column exists before applying preprocessing
if "question" not in filtered_df.columns:
    raise KeyError("The DataFrame does not contain a 'question' column.")

# Apply the preprocessing function to every question in the dataset.
# The normalized text is stored in 'question_clean'.
filtered_df["question_clean"] = filtered_df["question"].apply(process_text_value)

# The TF-IDF cell and the later classification cells read the cleaned text
# from a column named 'clean_text', so expose the same values under that name.
filtered_df["clean_text"] = filtered_df["question_clean"]

# Inspect up to five examples to verify that preprocessing works as expected
# (head() copes with a DataFrame that has fewer than five rows).
print("Original vs. cleaned examples:\n")
preview = filtered_df[["question", "question_clean"]].head(5)
for i, (original, cleaned) in enumerate(preview.itertuples(index=False, name=None), start=1):
    print(f"--- Example {i} ---")
    print("Original :", original)
    print("Cleaned  :", cleaned)
    print()


***TF-IDF vectorization of the preprocessed questions***

In [None]:
# === Cell 4: TF-IDF vectorization for ALL questions (no train/val/test split) ===

from sklearn.feature_extraction.text import TfidfVectorizer

# Locate the column that holds the preprocessed text.
# 'clean_text' is the name used by the rest of the notebook; the preprocessing
# cell stores its output in 'question_clean', so that name is accepted as well.
if "clean_text" in filtered_df.columns:
    text_column = "clean_text"
elif "question_clean" in filtered_df.columns:
    text_column = "question_clean"
else:
    raise KeyError("The DataFrame does not contain a 'clean_text' column. "
                   "Run the preprocessing cell first.")

# Define a TF-IDF vectorizer
# max_features limits vocabulary size to the most frequent terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,   # limit vocabulary size (you can tune this later)
    ngram_range=(1, 1),   # unigrams only
)

# Fit TF-IDF on the entire cleaned corpus and transform it to a sparse matrix
# Each row = one question, each column = one term from the vocabulary
X_tfidf = tfidf_vectorizer.fit_transform(filtered_df[text_column])

print("TF-IDF matrix shape (n_samples, n_features):", X_tfidf.shape)
print("(Num of documents, max_features)")

# Optional: extract labels if you need them later for supervised models / evaluation
if "level" in filtered_df.columns:
    y = filtered_df["level"].values
    print("Labels vector shape:", y.shape)
else:
    y = None
    print("No 'level' column found. y is set to None.")

# Show a small sample of feature names for sanity check.
# The printed count is derived from the slice so the label cannot drift from
# the number of features actually shown.
feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nVocabulary size (len(feature_names)):", len(feature_names))
preview_count = min(60, len(feature_names))
print(f"First {preview_count} features:\n", feature_names[:preview_count])


***Run K-Means for different K values***

In [None]:
# # === Cell 5: Run K-Means for several K values ===

# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score

# # Choose several K values (must include 2 and 3 as required)
# # k_values = [2, 3, 5, 7, 10, 15]
# k_values = [2]
# inertia_scores = []
# silhouette_scores = []

# print("Running K-Means on TF-IDF matrix... (may take a bit)")

# for k in k_values:
#     print(f"\n--- K = {k} ---")

#     # KMeans (using smart initialization k-means++)
#     kmeans = KMeans(
#         n_clusters=k,
#         init="k-means++",
#         max_iter=300,
#         random_state=42,
#         n_init=10
#     )

#     # Fit on full TF-IDF matrix
#     kmeans.fit(X_tfidf)

#     # Inertia (Elbow)
#     inertia = kmeans.inertia_
#     inertia_scores.append(inertia)

#     # Silhouette score (requires >1 cluster)
#     sil_score = silhouette_score(X_tfidf, kmeans.labels_, metric='euclidean')
#     silhouette_scores.append(sil_score)

#     print(f"Inertia: {inertia}")
#     print(f"Silhouette Score: {sil_score}")


***Dimensionality reduction for clustering***

In [None]:
# === Cell 5: Dimensionality reduction for clustering (TruncatedSVD) ===

from sklearn.decomposition import TruncatedSVD

# The TF-IDF matrix is sparse and has many features, so it is projected onto a
# small number of latent dimensions before clustering. TruncatedSVD is PCA-like
# but accepts the sparse matrix directly.
svd = TruncatedSVD(n_components=50, random_state=42)  # n_components can be tuned

# Learn the projection and obtain the reduced representation in one step
X_svd = svd.fit_transform(X_tfidf)

print("Original TF-IDF shape :", X_tfidf.shape)
print("Reduced SVD shape     :", X_svd.shape)

# The summed explained-variance ratio indicates how much information was kept
explained = svd.explained_variance_ratio_.sum()
print(f"Total explained variance (approx): {explained:.3f}")


***DBSCAN clustering on reduced space and comparison***

In [None]:
# === Cell 6: DBSCAN clustering on SVD-reduced data ===

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np

# DBSCAN hyperparameters:
#   eps         - neighborhood radius (tune, e.g. 0.5, 0.7, 1.2, ...)
#   min_samples - how many neighbors are needed for a "dense" region
#   n_jobs=-1   - use all available cores for the distance computations
dbscan = DBSCAN(eps=1.0, min_samples=5, metric="euclidean", n_jobs=-1)

print("Fitting DBSCAN on SVD-reduced data (this may take some time)...")
dbscan_labels = dbscan.fit_predict(X_svd)

# Size of every cluster; the label -1 marks noise points
unique_labels, counts = np.unique(dbscan_labels, return_counts=True)
label_counts = {label: count for label, count in zip(unique_labels, counts)}

print("\nCluster label counts (including noise label = -1):")
print(label_counts)

# Noise points are excluded before the Silhouette score is computed
mask = dbscan_labels != -1
num_clusters = len(set(dbscan_labels[mask]))

if num_clusters >= 2:
    # Silhouette score on the non-noise points only
    dbscan_sil = silhouette_score(X_svd[mask], dbscan_labels[mask])
    print(f"\nDBSCAN Silhouette Score (on non-noise points): {dbscan_sil:.4f}")
    print("\nYou can compare this value to the Silhouette scores you got from K-Means.")
else:
    # With fewer than 2 clusters the Silhouette score is not defined
    print("\nDBSCAN found fewer than 2 clusters (after removing noise).")
    print("Silhouette score is not defined in this case.")


# ***==========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================***

***Build document vectors from Word2Vec (TF-IDF weighted average)***

In [None]:
import numpy as np

# Map every term of the fitted TF-IDF vocabulary to its IDF value
idf_scores = {
    term: idf
    for term, idf in zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_)
}

def document_vector(tokens, use_tfidf_weight=True):
    """
    Collapse the word vectors of a document into one weighted-average vector.

    - tokens: iterable of preprocessed tokens; tokens that have no vector in
      w2v_model.wv are ignored.
    - use_tfidf_weight: when True, every token occurrence is weighted by its
      entry in idf_scores (1.0 for tokens missing from that dictionary);
      when False, every occurrence has weight 1.0.

    Returns sum(w_i * v_i) / sum(w_i), or a float32 zero vector of length
    w2v_model.vector_size when no token has a vector.
    """
    # w2v_model is defined outside this cell (presumably a trained Word2Vec model)
    word_vectors = w2v_model.wv
    scaled_vectors = []
    token_weights = []

    for token in tokens:
        if token not in word_vectors:
            continue
        token_weight = idf_scores.get(token, 1.0) if use_tfidf_weight else 1.0
        scaled_vectors.append(word_vectors[token] * token_weight)
        token_weights.append(token_weight)

    if len(scaled_vectors) == 0:
        # No token could be embedded: fall back to an all-zero document vector
        return np.zeros(w2v_model.vector_size, dtype=np.float32)

    weight_total = np.array(token_weights, dtype=np.float32).sum()
    return np.vstack(scaled_vectors).sum(axis=0) / weight_total

# Stack one IDF-weighted document vector per row for each split.
# NOTE(review): train_df / val_df / test_df and their 'tokens' column are not
# created in the visible cells (the split cell is commented out) - confirm they
# are defined before running this cell.
X_train_w2v = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in train_df["tokens"]]
)
X_val_w2v = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in val_df["tokens"]]
)
X_test_w2v = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in test_df["tokens"]]
)

print("Word2Vec document matrices shapes:")
for _name, _matrix in (
    ("X_train_w2v:", X_train_w2v),
    ("X_val_w2v:  ", X_val_w2v),
    ("X_test_w2v: ", X_test_w2v),
):
    print(_name, _matrix.shape)


# ***ב-1-סיווג בינארי***

***Filter to binary classes (easy, hard)***

In [None]:
# Restrict every split to the two classes of the binary task ('easy' vs 'hard').
# .copy() detaches the subsets from the original DataFrames.
binary_train = train_df.loc[train_df["level"].isin(["easy", "hard"])].copy()
binary_val = val_df.loc[val_df["level"].isin(["easy", "hard"])].copy()
binary_test = test_df.loc[test_df["level"].isin(["easy", "hard"])].copy()

for _title, _subset in (
    ("Train size:", binary_train),
    ("Validation size:", binary_val),
    ("Test size:", binary_test),
):
    print(_title, len(_subset))

print("\nTrain label distribution:")
print(binary_train["level"].value_counts(normalize=True))


***Encode labels (easy=0, hard=1)***

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping on the training labels only, then apply
# that same mapping to every split.
le = LabelEncoder().fit(binary_train["level"])

y_train = le.transform(binary_train["level"])
y_val = le.transform(binary_val["level"])
y_test = le.transform(binary_test["level"])

print("Label classes:", le.classes_)  # ['easy' 'hard']


***Build TF-IDF for the binary subsets***

In [None]:
# Reuse the already-fitted TF-IDF vectorizer; it is only applied here, not refitted.
# NOTE(review): in the visible cells the vectorizer is fitted on the whole
# filtered_df, not on train_df only - confirm this is intended for evaluation.
X_train_tfidf_bin = tfidf_vectorizer.transform(binary_train["clean_text"])
X_val_tfidf_bin = tfidf_vectorizer.transform(binary_val["clean_text"])
X_test_tfidf_bin = tfidf_vectorizer.transform(binary_test["clean_text"])

print("Binary TF-IDF shapes:")
for _name, _matrix in (
    ("X_train_tfidf_bin:", X_train_tfidf_bin),
    ("X_val_tfidf_bin:  ", X_val_tfidf_bin),
    ("X_test_tfidf_bin: ", X_test_tfidf_bin),
):
    print(_name, _matrix.shape)


***Build Word2Vec document vectors for the binary subsets***

In [None]:
import numpy as np

# Requires w2v_model and document_vector() to be defined by earlier cells.
# Each split becomes a matrix with one IDF-weighted document vector per row.
X_train_w2v_bin = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in binary_train["tokens"]]
)
X_val_w2v_bin = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in binary_val["tokens"]]
)
X_test_w2v_bin = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in binary_test["tokens"]]
)

print("Binary Word2Vec shapes:")
for _name, _matrix in (
    ("X_train_w2v_bin:", X_train_w2v_bin),
    ("X_val_w2v_bin:  ", X_val_w2v_bin),
    ("X_test_w2v_bin: ", X_test_w2v_bin),
):
    print(_name, _matrix.shape)


***Utility: evaluate model***

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

def evaluate_model(model_name, representation_name, y_true, y_pred):
    """
    Print a short evaluation report for one model/representation pair.

    The report contains a header line, the accuracy, the F1 score (computed
    with f1_score's default settings) and the confusion matrix of y_true vs
    y_pred. Nothing is returned.
    """
    print(f"\n=== {model_name} + {representation_name} ===")

    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)

    f1 = f1_score(y_true, y_pred)
    print("F1 Score:", f1)

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)


***TF-IDF + Naive Bayes***

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train Multinomial Naive Bayes on the binary TF-IDF features and score it on
# the validation split.
nb_tfidf = MultinomialNB().fit(X_train_tfidf_bin, y_train)
pred_val = nb_tfidf.predict(X_val_tfidf_bin)

evaluate_model(
    model_name="Naive Bayes",
    representation_name="TF-IDF",
    y_true=y_val,
    y_pred=pred_val,
)


***TF-IDF + Logistic Regression***

In [None]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression (up to 2000 solver iterations) on the binary
# TF-IDF features and score it on the validation split.
lr_tfidf = LogisticRegression(max_iter=2000).fit(X_train_tfidf_bin, y_train)
pred_val = lr_tfidf.predict(X_val_tfidf_bin)

evaluate_model(
    model_name="Logistic Regression",
    representation_name="TF-IDF",
    y_true=y_val,
    y_pred=pred_val,
)


***Word2Vec + Naive Bayes***

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train Gaussian Naive Bayes on the dense Word2Vec document vectors and score
# it on the validation split.
nb_w2v = GaussianNB().fit(X_train_w2v_bin, y_train)
pred_val = nb_w2v.predict(X_val_w2v_bin)

evaluate_model(
    model_name="Naive Bayes (Gaussian)",
    representation_name="Word2Vec",
    y_true=y_val,
    y_pred=pred_val,
)


***Word2Vec + Logistic Regression***

In [None]:
# Train Logistic Regression (up to 2000 solver iterations) on the Word2Vec
# document vectors and score it on the validation split.
lr_w2v = LogisticRegression(max_iter=2000).fit(X_train_w2v_bin, y_train)
pred_val = lr_w2v.predict(X_val_w2v_bin)

evaluate_model(
    model_name="Logistic Regression",
    representation_name="Word2Vec",
    y_true=y_val,
    y_pred=pred_val,
)


# ***ב-1- סיווג רב מחלקתי כלומר 3***

***Build multi-class subsets (easy, medium, hard)***

In [None]:
# The multi-class task uses exactly these three difficulty levels
target_levels = ["easy", "medium", "hard"]

# Restrict every split to the target levels; .copy() detaches the subsets
multi_train = train_df.loc[train_df["level"].isin(target_levels)].copy()
multi_val = val_df.loc[val_df["level"].isin(target_levels)].copy()
multi_test = test_df.loc[test_df["level"].isin(target_levels)].copy()

for _title, _subset in (
    ("Train size:", multi_train),
    ("Validation size:", multi_val),
    ("Test size:", multi_test),
):
    print(_title, len(_subset))

print("\nTrain label distribution:")
print(multi_train["level"].value_counts(normalize=True))

print("\nUnique levels in all splits:")
for _title, _subset in (
    ("Train:", multi_train),
    ("Val:  ", multi_val),
    ("Test: ", multi_test),
):
    print(_title, _subset["level"].unique())


***Encode labels (3 classes)***

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping on the training labels only, then apply
# that same mapping to every split.
le_multi = LabelEncoder().fit(multi_train["level"])

y_train_multi = le_multi.transform(multi_train["level"])
y_val_multi = le_multi.transform(multi_val["level"])
y_test_multi = le_multi.transform(multi_test["level"])

print("Label classes (order):", le_multi.classes_)  # expects ['easy' 'hard' 'medium'] or similar


***TF-IDF representation for multi-class***

In [None]:
# Vectorize the 'clean_text' column of every split with the existing fitted
# TF-IDF vectorizer (transform only, no refit).
X_train_tfidf_multi = tfidf_vectorizer.transform(multi_train["clean_text"])
X_val_tfidf_multi = tfidf_vectorizer.transform(multi_val["clean_text"])
X_test_tfidf_multi = tfidf_vectorizer.transform(multi_test["clean_text"])

print("TF-IDF shapes (multi-class):")
for _name, _matrix in (
    ("X_train_tfidf_multi:", X_train_tfidf_multi),
    ("X_val_tfidf_multi:  ", X_val_tfidf_multi),
    ("X_test_tfidf_multi: ", X_test_tfidf_multi),
):
    print(_name, _matrix.shape)


***Word2Vec document vectors for multi-class***

In [None]:
import numpy as np

# One IDF-weighted Word2Vec document vector per row, for every split
# (uses the existing w2v_model through document_vector()).
X_train_w2v_multi = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in multi_train["tokens"]]
)
X_val_w2v_multi = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in multi_val["tokens"]]
)
X_test_w2v_multi = np.vstack(
    [document_vector(toks, use_tfidf_weight=True) for toks in multi_test["tokens"]]
)

print("Word2Vec document shapes (multi-class):")
for _name, _matrix in (
    ("X_train_w2v_multi:", X_train_w2v_multi),
    ("X_val_w2v_multi:  ", X_val_w2v_multi),
    ("X_test_w2v_multi: ", X_test_w2v_multi),
):
    print(_name, _matrix.shape)


***Evaluation helper (Accuracy, macro-F1, confusion matrix)***

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

def evaluate_multi(model_name, representation_name, y_true, y_pred, label_encoder):
    """
    Print a multi-class evaluation report for one model/representation pair.

    The report contains a header line, the accuracy, the macro-averaged F1,
    the confusion matrix (rows = true labels, columns = predictions) and the
    class order stored in label_encoder.classes_. Nothing is returned.
    """
    print(f"\n=== {model_name} + {representation_name} ===")

    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)

    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print("Macro F1:", macro_f1)

    print("\nConfusion Matrix (rows=true, cols=pred):")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    # The class order explains which row/column belongs to which label
    print("Label order:", label_encoder.classes_)


***TF-IDF + Multinomial Naive Bayes (3 classes)***

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train Multinomial Naive Bayes on the 3-class TF-IDF features and score it on
# the validation split.
nb_tfidf_multi = MultinomialNB().fit(X_train_tfidf_multi, y_train_multi)
pred_val_nb_tfidf = nb_tfidf_multi.predict(X_val_tfidf_multi)

evaluate_multi(
    model_name="Naive Bayes (Multinomial)",
    representation_name="TF-IDF",
    y_true=y_val_multi,
    y_pred=pred_val_nb_tfidf,
    label_encoder=le_multi,
)


***TF-IDF + Logistic Regression (3 classes)***

In [None]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression (up to 2000 solver iterations) on the 3-class
# TF-IDF features and score it on the validation split.
lr_tfidf_multi = LogisticRegression(max_iter=2000).fit(X_train_tfidf_multi, y_train_multi)
pred_val_lr_tfidf = lr_tfidf_multi.predict(X_val_tfidf_multi)

evaluate_multi(
    model_name="Logistic Regression",
    representation_name="TF-IDF",
    y_true=y_val_multi,
    y_pred=pred_val_lr_tfidf,
    label_encoder=le_multi,
)


***Word2Vec + Gaussian Naive Bayes (3 classes)***

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train Gaussian Naive Bayes on the 3-class Word2Vec document vectors and score
# it on the validation split.
nb_w2v_multi = GaussianNB().fit(X_train_w2v_multi, y_train_multi)
pred_val_nb_w2v = nb_w2v_multi.predict(X_val_w2v_multi)

evaluate_multi(
    model_name="Naive Bayes (Gaussian)",
    representation_name="Word2Vec",
    y_true=y_val_multi,
    y_pred=pred_val_nb_w2v,
    label_encoder=le_multi,
)


***Word2Vec + Logistic Regression (3 classes)***

In [None]:
# Train Logistic Regression (up to 2000 solver iterations) on the 3-class
# Word2Vec document vectors and score it on the validation split.
lr_w2v_multi = LogisticRegression(max_iter=2000).fit(X_train_w2v_multi, y_train_multi)
pred_val_lr_w2v = lr_w2v_multi.predict(X_val_w2v_multi)

evaluate_multi(
    model_name="Logistic Regression",
    representation_name="Word2Vec",
    y_true=y_val_multi,
    y_pred=pred_val_lr_w2v,
    label_encoder=le_multi,
)


# ***Since our experiments clearly showed that TF-IDF consistently outperforms Word2Vec across all models and evaluation metrics, we decided to discontinue the use of Word2Vec and proceed exclusively with TF-IDF representations in the following stages***

***הגדרת פונקצייה לניסויים בהיפר-פרמטרים***

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def evaluate_scores(y_true, y_pred):
    """
    Return the pair (accuracy, macro-averaged F1) for the given predictions.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
    )


def tune_nb_tfidf(X_train, y_train, X_val, y_val, alphas, representation_name="TF-IDF"):
    """
    Sweep the smoothing parameter ``alpha`` of Multinomial Naive Bayes.

    For every value in ``alphas`` a fresh MultinomialNB is fitted on
    (X_train, y_train) and evaluated on (X_val, y_val); one line with the
    accuracy and macro F1 is printed per value, followed by the value with
    the highest macro F1.

    Parameters:
    - X_train, y_train: training features and labels.
    - X_val, y_val: validation features and labels.
    - alphas: iterable of alpha values to try; may be empty.
    - representation_name: label used only in the printed header.

    Returns a list of dicts with keys "alpha", "accuracy" and "f1_macro",
    one per evaluated value, in sweep order (empty when no value was given).
    """
    results = []
    print(f"\n=== Naive Bayes (Multinomial) + {representation_name} — alpha sweep ===")
    for a in alphas:
        model = MultinomialNB(alpha=a)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc, f1 = evaluate_scores(y_val, y_pred)
        results.append({"alpha": a, "accuracy": acc, "f1_macro": f1})
        print(f"alpha = {a:>4}  ->  Accuracy = {acc:.4f},  Macro F1 = {f1:.4f}")

    if not results:
        # max() over an empty list raises ValueError; report and return instead
        print("\nNo alpha values were supplied; nothing to evaluate.")
        return results

    # Print best by F1
    best = max(results, key=lambda r: r["f1_macro"])
    print(f"\nBest alpha by macro F1: {best['alpha']} (Accuracy={best['accuracy']:.4f}, F1={best['f1_macro']:.4f})")
    return results


def tune_logistic(
    X_train,
    y_train,
    X_val,
    y_val,
    Cs,
    max_iter=1000,
    representation_name="TF-IDF",
):
    """
    Sweep the regularization parameter ``C`` of Logistic Regression.

    For every value in ``Cs`` a fresh LogisticRegression(C=..., max_iter=...)
    is fitted on (X_train, y_train) and evaluated on (X_val, y_val); one line
    with the accuracy and macro F1 is printed per value, followed by the
    value with the highest macro F1.

    Parameters:
    - X_train, y_train: training features and labels.
    - X_val, y_val: validation features and labels.
    - Cs: iterable of C values to try; may be empty.
    - max_iter: iteration limit passed to every LogisticRegression.
    - representation_name: label used only in the printed header.

    Returns a list of dicts with keys "C", "accuracy" and "f1_macro",
    one per evaluated value, in sweep order (empty when no value was given).
    """
    results = []
    print(f"\n=== Logistic Regression + {representation_name} — C sweep (max_iter={max_iter}) ===")
    for c in Cs:
        clf = LogisticRegression(C=c, max_iter=max_iter)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        acc, f1 = evaluate_scores(y_val, y_pred)
        results.append({"C": c, "accuracy": acc, "f1_macro": f1})
        print(f"C = {c:>5}  ->  Accuracy = {acc:.4f},  Macro F1 = {f1:.4f}")

    if not results:
        # max() over an empty list raises ValueError; report and return instead
        print("\nNo C values were supplied; nothing to evaluate.")
        return results

    # Print best by F1
    best = max(results, key=lambda r: r["f1_macro"])
    print(f"\nBest C by macro F1: {best['C']} (Accuracy={best['accuracy']:.4f}, F1={best['f1_macro']:.4f})")
    return results


# **תזכורת:**

### ✔ Accuracy (דיוק)
כמה אחוז מהניבויים של המודל היו נכונים מתוך כלל הדוגמאות.

**איך להבין את זה?**  
אם המודל ניחש נכון 70% מהפעמים → Accuracy = 0.70

**מתי זה טוב?**  
כאשר הדאטה מאוזן
(כל המחלקות מופיעות בערך באותה כמות).

**החיסרון:**  
אם מחלקה אחת מופיעה הרבה יותר – המדד עלול להיות מטעה.

---

### ✔ F1 Score (מדד F1)
מדד שמחבר בין
 Precision ו־Recall
  למדד אחד מאוזן.

**איך להבין את זה?**  
 גבוה = המודל גם מוצא נכון דוגמאות של המחלקה וגם לא טועה הרבה.  
 נמוך = או שהמודל מפספס הרבה דוגמאות, או שהוא טועה הרבה.

**מתי משתמשים בו?**  
כאשר חשוב לזהות כל מחלקה בצורה טובה במיוחד,
או כאשר יש אי־איזון בין המחלקות.

---

### ✔ Macro F1 (מדד F1 מאקרו)
מחשב את ה
F1
 לכל מחלקה בנפרד, ואז עושה ממוצע פשוט ביניהן.

**איך להבין את זה?**  
כל מחלקה מקבלת משקל שווה — גם אם יש ממנה מעט דוגמאות.

**למה זה חשוב?**  
כי בבעיות שבהן חלק מהמחלקות מופיעות מעט,  
Accuracy
 יכול להטעות,
אבל
Macro F1
מוודא שהמודל מצליח גם על המחלקות הקטנות.


---


***ניסויים בהיפר פרמטרים***

In [None]:
# ============================================
# Additional Hyperparameter Experiments (TF-IDF only)
# ============================================

# Every sweep below uses the same multi-class TF-IDF train/validation data,
# passed positionally as (X_train, y_train, X_val, y_val).
_tfidf_multi_data = (
    X_train_tfidf_multi,
    y_train_multi,
    X_val_tfidf_multi,
    y_val_multi,
)

# ----------------------------------------------------------
# 1) Naive Bayes + TF-IDF with more alpha values
# ----------------------------------------------------------
nb_alphas_extended = [0.01, 0.1, 0.5, 1.0, 2.0]
nb_tfidf_results_extended = tune_nb_tfidf(
    *_tfidf_multi_data,
    alphas=nb_alphas_extended,
    representation_name="TF-IDF (multi-class) — extended alpha",
)

# ----------------------------------------------------------
# 2) Logistic Regression + TF-IDF with extended C values
# ----------------------------------------------------------
lr_C_extended = [0.01, 0.1, 1.0, 10.0, 50.0, 100.0]
lr_tfidf_results_extended = tune_logistic(
    *_tfidf_multi_data,
    Cs=lr_C_extended,
    max_iter=3000,  # higher iteration budget to help convergence
    representation_name="TF-IDF (multi-class) — extended C",
)

# ----------------------------------------------------------
# 3) Logistic Regression + TF-IDF — small max_iter test
# ----------------------------------------------------------
lr_tfidf_small_iter = tune_logistic(
    *_tfidf_multi_data,
    Cs=[1.0],
    max_iter=200,  # deliberately low budget to check convergence behavior
    representation_name="TF-IDF (multi-class) — small max_iter",
)

# ----------------------------------------------------------
# 4) Logistic Regression + TF-IDF — large max_iter test
# ----------------------------------------------------------
lr_tfidf_large_iter = tune_logistic(
    *_tfidf_multi_data,
    Cs=[1.0],
    max_iter=5000,  # generous budget intended to let the solver converge
    representation_name="TF-IDF (multi-class) — large max_iter",
)


# ***עד לפה זה החדש =========================================================================================================================================================================================================***