# ***תרגיל 5 של הפרוייקט***

**Load Data and Basic Setup**

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("/content/train-filtered_question_level.csv")

# Remove duplicate questions
df = df.drop_duplicates(subset=["question"], keep="first")

# Extract text and difficulty levels
texts = df["question"].astype(str).tolist()
levels = df["level"].tolist()


**Balancing Dataset (Undersampling to Minority Class)**

In [4]:
import pandas as pd

# 1. נגדיר את גודל היעד לפי המחלקה הקטנה ביותר (Hard)
target_size = 15657

# 2. נבצע דגימה מכל מחלקה בנפרד
df_hard = df[df['level'] == 'hard']
# כאן אנחנו לא עושים sample כי זה כבר הגודל שאנחנו רוצים

df_medium_downsampled = df[df['level'] == 'medium'].sample(n=target_size, random_state=42)
df_easy_downsampled = df[df['level'] == 'easy'].sample(n=target_size, random_state=42)

# 3. נחבר את שלושתן יחד
df_balanced = pd.concat([df_hard, df_medium_downsampled, df_easy_downsampled])

# 4. נערבב את הדאטה (חשוב מאוד!)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# בדיקת תוצאה
print("התפלגות חדשה:")
print(df_balanced['level'].value_counts())

התפלגות חדשה:
level
easy      15657
hard      15657
medium    15657
Name: count, dtype: int64


# **שלב 1**

# **א**

**Choosing Maximum Sequence Length (Documentation)**

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# 1. הגדרת פרמטרים
VOCAB_SIZE = 20000

# 2. אתחול הטוקנייזר (יצירת האובייקט שהיה חסר)
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")

# 3. חילוץ הטקסטים מהדאטה המאוזן (ודא ששם העמודה נכון, נניח 'text')
texts = df_balanced['question'].astype(str).tolist()

# 4. התאמת הטוקנייזר על הטקסטים (שלב קריטי!)
tokenizer.fit_on_texts(texts)

# 5. המרה לרצפים של מספרים
sequences = tokenizer.texts_to_sequences(texts)

# עכשיו הסטטיסטיקות שלך יעבדו:
sequence_lengths = [len(seq) for seq in sequences]
avg_len = np.mean(sequence_lengths)
percentile_95 = np.percentile(sequence_lengths, 95)

print("Average sequence length:", round(avg_len, 2))
print("95th percentile length:", percentile_95)
print("Vocabulary size (actual):", len(tokenizer.word_index))

Average sequence length: 19.32
95th percentile length: 44.0
Vocabulary size (actual): 58011


**Padding and Truncation**

In [19]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. הגדרת המילון (למקרה שלא הוגדר בתא הזה)
label_dict = {'easy': 0, 'medium': 1, 'hard': 2}

# 2. קביעת אורך הרצף
MAX_SEQUENCE_LENGTH = int(percentile_95)

# 3. יצירת X (Padding)
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
    truncating="post"
)

# 4. יצירת y (הפיכת הקטגוריות למטריצה של 0 ו-1)
y_integers = df_balanced['level'].map(label_dict).values
y = to_categorical(y_integers, num_classes=3)

print(f"Data is ready! X shape: {X.shape}, y shape: {y.shape}")

Data is ready! X shape: (46971, 44), y shape: (46971, 3)


# **ב**

**ניסוי 1**


**Embedding Layer מאומן מאפס**

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# --- 1. הגדרת ארכיטקטורת המודל (בדיוק כמו ה-Keras שהיה לנו) ---
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        # ב-LSTM אנחנו לוקחים את ה-hidden state האחרון
        _, (hidden, _) = self.lstm(embedded)
        out = self.dropout(hidden[-1])
        return self.fc(out)

# --- 2. הכנת הנתונים (הפיכה מ-NumPy ל-PyTorch Tensors) ---
# נשתמש ב-y_integers (מספרים 0,1,2) ולא ב-y (מטריצה), כי PyTorch מעדיף ככה
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y_integers, dtype=torch.long)

dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_data, val_data = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

# --- 3. יצירת המודל, האופטימייזר וה-Loss ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(VOCAB_SIZE, 100, 64, 3).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# --- 4. לולאת האימון (עם ה-print שביקשת) ---
epochs = 10
print("Starting training (PyTorch version)...")

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    # חישוב דיוק על ה-Validation (בדיוק כמו ב-Keras)
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    print(f"Epoch {epoch+1}/{epochs} "
          f"━━━━━━━━━━━━━━━━━━━━ "
          f"loss: {running_loss/len(train_loader):.4f} - "
          f"acc: {100.*correct/total:.2f}% - "
          f"val_acc: {100.*val_correct/val_total:.2f}%")

Starting training (PyTorch version)...
Epoch 1/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.9719 - acc: 46.42% - val_acc: 49.49%
Epoch 2/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.8989 - acc: 52.15% - val_acc: 52.57%
Epoch 3/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.8479 - acc: 54.84% - val_acc: 53.24%
Epoch 4/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.7938 - acc: 58.29% - val_acc: 52.38%
Epoch 5/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.7364 - acc: 62.12% - val_acc: 52.85%
Epoch 6/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.6753 - acc: 65.73% - val_acc: 52.45%
Epoch 7/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.6144 - acc: 69.94% - val_acc: 51.81%
Epoch 8/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.5542 - acc: 74.33% - val_acc: 52.02%
Epoch 9/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.4931 - acc: 78.35% - val_acc: 51.74%
Epoch 10/10 ━━━━━━━━━━━━━━━━━━━━ loss: 0.4269 - acc: 82.53% - val_acc: 51.53%


**dounload GloVe**

In [21]:
# הורדת קובץ ה-GloVe (זה עשוי לקחת דקה-שתיים)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2026-01-01 20:47:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-01-01 20:47:00--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-01-01 20:47:01--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

**Creating the Embedding Matrix from GloVe**

In [26]:
import numpy as np
import torch

def create_embedding_matrix(word_index, vocab_size, embedding_dim=100):
    # 1. טעינת הוקטורים מהקובץ
    embeddings_index = {}
    with open('glove.6B.100d.txt', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # 2. בניית המטריצה עבור ה-Vocab שלנו
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
            else:
                # מילים שלא נמצאו יקבלו וקטור רנדומלי (או אפסים)
                embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

    return torch.tensor(embedding_matrix, dtype=torch.float32)

# יצירת המטריצה
embedding_weights = create_embedding_matrix(tokenizer.word_index, VOCAB_SIZE)

**Building the model in PyTorch with Frozen/Fine-tuned support**

In [27]:
import torch.nn as nn

class GloVeLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, weights, freeze=True):
        super(GloVeLSTMModel, self).__init__()

        # טעינת המשקולות של GloVe
        # ה-parameter 'freeze' קובע אם המודל יעדכן את הוקטורים (Fine-tune) או לא (Frozen)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=freeze)

        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        out = self.dropout(hidden[-1])
        return self.fc(out)

**The Frozen model**

In [28]:
model_frozen = GloVeLSTMModel(VOCAB_SIZE, 100, 64, 3, embedding_weights, freeze=True).to(device)
# כאן תבוא לולאת האימון (אותה לולאה שכתבנו קודם)
print("Training Frozen GloVe Model...")
# train_model(model_frozen, train_loader)

Training Frozen GloVe Model...


**Fine-tuned**

In [29]:
model_finetune = GloVeLSTMModel(VOCAB_SIZE, 100, 64, 3, embedding_weights, freeze=False).to(device)
print("Training Fine-tuned GloVe Model...")
# train_model(model_finetune, train_loader)

Training Fine-tuned GloVe Model...
