# ***תרגיל 5 של הפרוייקט***

**Load Data and Basic Setup**

In [1]:
import pandas as pd
import numpy as np

# Read the question-level training CSV and keep one row per distinct
# question text (the first occurrence wins).
df = pd.read_csv("/content/train-filtered_question_level.csv").drop_duplicates(
    subset=["question"], keep="first"
)

# Model inputs (question text, cast to str) and targets (difficulty level)
# as plain Python lists, row-aligned with `df`.
texts = df["question"].astype(str).to_list()
levels = df["level"].to_list()


**Balancing Dataset (Undersampling to Minority Class)**

In [4]:
import pandas as pd

# 1. Target size per class = size of the smallest class in `df`.
#    Derived from the data rather than hard-coded, so the cell stays correct
#    if the CSV or the de-duplication step changes.
target_size = int(df['level'].value_counts().min())

# 2. Build each class subset separately.
df_hard = df[df['level'] == 'hard']
# 'hard' is expected to be the minority class, in which case it is kept as-is;
# it is downsampled only if it turns out to be larger than the target.
if len(df_hard) > target_size:
    df_hard = df_hard.sample(n=target_size, random_state=42)

df_medium_downsampled = df[df['level'] == 'medium'].sample(n=target_size, random_state=42)
df_easy_downsampled = df[df['level'] == 'easy'].sample(n=target_size, random_state=42)

# 3. Concatenate the three subsets.
df_balanced = pd.concat([df_hard, df_medium_downsampled, df_easy_downsampled])

# 4. Shuffle the rows (important: after the concat the frame is grouped by class).
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# NOTE(review): `texts` / `levels` were extracted from the unbalanced `df` in the
# loading cell and are not refreshed here — confirm that downstream cells read
# from `df_balanced` where balanced data is intended.

# Check the resulting class distribution.
print("התפלגות חדשה:")
print(df_balanced['level'].value_counts())

התפלגות חדשה:
level
easy      15657
hard      15657
medium    15657
Name: count, dtype: int64


Choosing Maximum Sequence Length (Documentation)

In [None]:
# Length of every sequence (one entry per element of `sequences`).
# NOTE(review): `sequences` is not defined in the cells visible here —
# presumably the tokenized questions; confirm it exists on a fresh kernel.
sequence_lengths = list(map(len, sequences))

# Summary statistics used to pick the maximum sequence length:
# the mean and the 95th percentile of the lengths.
avg_len, percentile_95 = (
    np.mean(sequence_lengths),
    np.percentile(sequence_lengths, 95),
)

print("Average sequence length:", round(avg_len, 2))
print("95th percentile length:", percentile_95)


Padding and Truncation

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length that every sequence is brought to.
MAX_SEQUENCE_LENGTH = 32

# "post" for both options: longer sequences are cut at the end,
# shorter ones are padded at the end.
X_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

print("Final input shape:", X_padded.shape)


ב': יצירה והשוואת Embedding

Creating and Comparing Word Embedding

Encode Difficulty Levels (level → integers)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the difficulty labels in `levels` to integer ids.
# The recorded output lists the classes as easy, hard, medium, so the ids
# follow that order rather than increasing difficulty.
label_encoder = LabelEncoder().fit(levels)
y_encoded = label_encoder.transform(levels)

print("Encoded classes:", [cls for cls in label_encoder.classes_])




Encoded classes: [np.str_('easy'), np.str_('hard'), np.str_('medium')]
