Adatok előfeldolgozása




In [11]:
import requests
import zipfile
import os
import json
from collections import Counter
import re
import pandas as pd

In [3]:


url = "https://bmeedu-my.sharepoint.com/:u:/g/personal/gyires-toth_balint_vik_bme_hu/IQDYwXUJcB_jQYr0bDfNT5RKARYgfKoH97zho3rxZ46KA1I?e=iFp3iz&download=1"

response = requests.get(url)

print(response.status_code)        # should be 200
print(response.headers.get('content-type'))  # should be 'application/zip' or similar

# Optional: save a small part to check
with open("/content/legaltextdecoder.zip", "wb") as f:
    f.write(response.content)  # first 1 KB
print("Downloaded 1 KB to /content/legaltextdecoder.zip")

200
application/x-zip-compressed
Downloaded 1 KB to /content/legaltextdecoder.zip


In [4]:
zip_path = "legaltextdecoder.zip"
extract_path = "data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [5]:
base_dir = "data"
all_data = []

json_file_count = 0
record_count = 0

for root, dirs, files in os.walk(base_dir):
    dirs[:] = [d for d in dirs if d != "consensus"]

    for file in files:
        if file.endswith(".json"):
            json_file_count += 1
            file_path = os.path.join(root, file)

            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

                if isinstance(data, list):
                    all_data.extend(data)
                    record_count += len(data)
                else:
                    all_data.append(data)
                    record_count += 1

print(f"JSON files loaded: {json_file_count}")
print(f"Total data records: {record_count}")

JSON files loaded: 34
Total data records: 3897


In [6]:
texts = []
labels = []

for item in all_data:
    text = item.get("data", {}).get("text")

    if not text:
        continue

    annotations = item.get("annotations", [])
    if not annotations:
        continue

    ann = annotations[0]
    results = ann.get("result", [])
    if not results:
        continue

    choices = results[0].get("value", {}).get("choices", [])
    if not choices:
        continue

    label = choices[0]

    texts.append(text)
    labels.append(label)

print("Loaded data records:", len(texts))

Loaded data records: 3747


In [7]:
label_counts = Counter(labels)

print("Label distribution:")
for label, count in label_counts.items():
    print(f"{label}: {count}")


Label distribution:
4-Érthető: 1122
3-Többé/kevésbé megértem: 816
5-Könnyen érthető: 1159
2-Nehezen érthető: 450
1-Nagyon nehezen érthető: 200


In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

clean_texts = [clean_text(t) for t in texts]

df = pd.DataFrame({
    "text": clean_texts,
    "label": labels
})

print(f"Samples after cleaning: {len(df)}")

duplicate_count = df.duplicated(subset="text").sum()
print(f"Duplicate texts detected: {duplicate_count}")

df = df.drop_duplicates(subset="text")
print(f"Samples after removing duplicates: {len(df)}")

print("Label distribution:")
print(f"\n{df['label'].value_counts()}")

df["text_length"] = df["text"].apply(lambda x: len(x.split()))

print("Text length statistics:")
print("\n" + df["text_length"].describe().to_string())

label_counts = df["label"].value_counts()
imbalance_ratio = label_counts.max() / label_counts.min()
print("Class imbalance analysis:")
print(f"\n{label_counts}")
print(f"Imbalance ratio (max/min): {imbalance_ratio:.2f}")

short_texts = (df["text_length"] < 5).sum()
print(f"Texts with fewer than 5 words: {short_texts}")

Samples after cleaning: 3747
Duplicate texts detected: 354
Samples after removing duplicates: 3393
Label distribution:

label
5-Könnyen érthető           1094
4-Érthető                   1021
3-Többé/kevésbé megértem     722
2-Nehezen érthető            389
1-Nagyon nehezen érthető     167
Name: count, dtype: int64
Text length statistics:

count    3393.000000
mean       50.390510
std        56.480726
min         1.000000
25%        21.000000
50%        38.000000
75%        62.000000
max      1186.000000
Class imbalance analysis:

label
5-Könnyen érthető           1094
4-Érthető                   1021
3-Többé/kevésbé megértem     722
2-Nehezen érthető            389
1-Nagyon nehezen érthető     167
Name: count, dtype: int64
Imbalance ratio (max/min): 6.55
Texts with fewer than 5 words: 93


In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [None]:
#Baseline modell

#Túltanítás
train_texts = texts[:8]
train_labels = labels[:8]

test_texts = texts[8:]
test_labels = labels[8:]

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train = label_encoder.transform(train_labels)
y_test = label_encoder.transform(test_labels)

NUM_CLASSES = 5
y_train = np.clip(y_train, 0, NUM_CLASSES - 1)
y_test = np.clip(y_test, 0, NUM_CLASSES - 1)

y_train = to_categorical(y_train, num_classes=NUM_CLASSES)

MAX_FEATURES = 2000

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=1
)

X_train = tfidf.fit_transform(train_texts).toarray()
X_test = tfidf.transform(test_texts).toarray()

print("TF-IDF feature size:", X_train.shape[1])

# -----------------------------
# Neural Network (MLP)
# -----------------------------
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=2,
    verbose=1
)


TF-IDF feature size: 86


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.3167 - loss: 1.5985
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2333 - loss: 1.5454    
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9500 - loss: 1.4953 
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7667 - loss: 1.4741 
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9500 - loss: 1.3850 
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8667 - loss: 1.3597
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8667 - loss: 1.3214 
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9500 - loss: 1.2699 
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7985650805c0>

Az overfitt megtörtént.

Baseline modell betanítása

In [14]:
from sklearn.model_selection import train_test_split

# Train vs temp (val + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    texts,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42
)

# Val vs test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.40,
    stratify=y_temp,
    random_state=42
)

print("Train size:", len(X_train))
print("Val size:  ", len(X_val))
print("Test size: ", len(X_test))


Train size: 2810
Val size:   562
Test size:  375


In [17]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Fit on all labels

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

# One-hot encoding for MLP
NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# TF-IDF Vectorization
# -----------------------------
MAX_FEATURES = 2000

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=1
)

X_train_vec = tfidf.fit_transform(X_train).toarray()
X_val_vec   = tfidf.transform(X_val).toarray()
X_test_vec  = tfidf.transform(X_test).toarray()

print("TF-IDF feature size:", X_train_vec.shape[1])

# -----------------------------
# Neural Network (MLP)
# -----------------------------
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_vec.shape[1],)),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print(f"Hyperparameters:\nEpochs: 20\nBatch size: 16\nLearning rate: {model.optimizer.learning_rate.numpy()}")
model.summary()

# -----------------------------
# Train (with validation)
# -----------------------------
model.fit(
    X_train_vec,
    y_train_enc,
    validation_data=(X_val_vec, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_vec)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))

TF-IDF feature size: 2000
Hyperparameters:
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.2844 - loss: 1.5587 - val_accuracy: 0.3932 - val_loss: 1.4119
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4470 - loss: 1.3411 - val_accuracy: 0.4484 - val_loss: 1.2863
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5852 - loss: 1.0885 - val_accuracy: 0.4609 - val_loss: 1.2836
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6986 - loss: 0.8571 - val_accuracy: 0.4555 - val_loss: 1.3235
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7899 - loss: 0.6526 - val_accuracy: 0.4502 - val_loss: 1.4295
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8514 - loss: 0.5148 - val_accuracy: 0.4573 - val_loss: 1.5925
Epoch 7/20
[1m176/176[0m 

Early stopping hozzáadása


In [18]:
from tensorflow.keras.callbacks import EarlyStopping


# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Fit on all labels

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# TF-IDF Vectorization
# -----------------------------
MAX_FEATURES = 2000

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=1
)

X_train_vec = tfidf.fit_transform(X_train).toarray()
X_val_vec   = tfidf.transform(X_val).toarray()
X_test_vec  = tfidf.transform(X_test).toarray()

print("TF-IDF feature size:", X_train_vec.shape[1])

# -----------------------------
# Neural Network (MLP)
# -----------------------------
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_vec.shape[1],)),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
print(f"Hyperparameters:\nEpochs: 100\nBatch size: 16\nLearning rate: {model.optimizer.learning_rate.numpy()}")
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',   # Stop when validation loss stops improving
    patience=5,           # Wait 5 epochs before stopping
    restore_best_weights=True
)

# -----------------------------
# Train with EarlyStopping
# -----------------------------
model.fit(
    X_train_vec,
    y_train_enc,
    validation_data=(X_val_vec, y_val_enc),
    epochs=100,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_vec)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))

TF-IDF feature size: 2000
Hyperparameters:
Epochs: 100
Batch size: 16
Learning rate: 0.0010000000474974513


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3019 - loss: 1.5410 - val_accuracy: 0.3808 - val_loss: 1.4134
Epoch 2/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4558 - loss: 1.3204 - val_accuracy: 0.4270 - val_loss: 1.2844
Epoch 3/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5699 - loss: 1.0815 - val_accuracy: 0.4609 - val_loss: 1.2583
Epoch 4/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.6878 - loss: 0.8506 - val_accuracy: 0.4520 - val_loss: 1.3151
Epoch 5/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7990 - loss: 0.6435 - val_accuracy: 0.4520 - val_loss: 1.4087
Epoch 6/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8666 - loss: 0.4843 - val_accuracy: 0.4413 - val_loss: 1.5354
Epoch 7/100
[1m176/1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TFIDF csere Embeddingre

In [20]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100
print(f"Encoder info: num_words: {MAX_VOCAB}\n max_length: {MAX_LEN}")

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
print(f"Hyperparameters:\nEpochs: 20\nBatch size: 16\nLearning rate: {model.optimizer.learning_rate.numpy()}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))

Encoder info: num_words: 5000
 max_length: 100
Hyperparameters:
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.3763 - loss: 1.4165 - val_accuracy: 0.4466 - val_loss: 1.3044
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5365 - loss: 1.1967 - val_accuracy: 0.4698 - val_loss: 1.2603
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7536 - loss: 0.7585 - val_accuracy: 0.4662 - val_loss: 1.3624
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8993 - loss: 0.3483 - val_accuracy: 0.4431 - val_loss: 1.5703
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9407 - loss: 0.2095 - val_accuracy: 0.4377 - val_loss: 1.7571
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9476 - loss: 0.1516 - val_accuracy: 0.4324 - val_loss: 1.8484
Epoch 7/20
[1m176/176[

LSTM kipróbálása

In [22]:
from tensorflow.keras.layers import LSTM
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → LSTM → Dense)
# -----------------------------
EMBED_DIM = 32
LSTM_UNITS = 16

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    LSTM(LSTM_UNITS),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"LSTM units: {LSTM_UNITS}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
LSTM units: 16
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 37ms/step - accuracy: 0.2918 - loss: 1.5050 - val_accuracy: 0.3612 - val_loss: 1.3648
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 52ms/step - accuracy: 0.3557 - loss: 1.3920 - val_accuracy: 0.3826 - val_loss: 1.3437
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 97ms/step - accuracy: 0.3998 - loss: 1.3443 - val_accuracy: 0.4181 - val_loss: 1.3115
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 74ms/step - accuracy: 0.4496 - loss: 1.2817 - val_accuracy: 0.4395 - val_loss: 1.2789
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 66ms/step - accuracy: 0.5222 - loss: 1.1558 - val_accuracy: 0.4822 - val_loss: 1.2330
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 84ms/step - accuracy: 0.5740 - loss: 1.0671 - val_accuracy: 0.4680 - val_loss: 1.2839
Epoch 7/20
[1m17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


GRU kipróbálása (elvileg jó kis adathalmazokon)

In [23]:
from tensorflow.keras.layers import GRU
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → LSTM → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(32),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 121ms/step - accuracy: 0.3099 - loss: 1.4993 - val_accuracy: 0.3221 - val_loss: 1.4191
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 50ms/step - accuracy: 0.3626 - loss: 1.4122 - val_accuracy: 0.4288 - val_loss: 1.3379
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.4400 - loss: 1.3035 - val_accuracy: 0.4502 - val_loss: 1.2443
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - accuracy: 0.5723 - loss: 1.0582 - val_accuracy: 0.4413 - val_loss: 1.2680
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 50ms/step - accuracy: 0.6216 - loss: 0.9610 - val_accuracy: 0.4555 - val_loss: 1.3576
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 54ms/step - accuracy: 0.6842 - loss: 0.8351 - val_accuracy: 0.4359 - val_loss: 1.4656
Epoch 7/20
[1m1

Dropout bevezetése

In [24]:
from tensorflow.keras.layers import Dropout

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"GRU units: {GRU_UNITS}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Dropout rate: {DROPOUT_RATE}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
GRU units: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.3




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 60ms/step - accuracy: 0.2987 - loss: 1.5121 - val_accuracy: 0.3345 - val_loss: 1.4260
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.3283 - loss: 1.4328 - val_accuracy: 0.3452 - val_loss: 1.4053
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 88ms/step - accuracy: 0.3494 - loss: 1.3963 - val_accuracy: 0.4466 - val_loss: 1.2759
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 59ms/step - accuracy: 0.4760 - loss: 1.2482 - val_accuracy: 0.4075 - val_loss: 1.2796
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - accuracy: 0.5438 - loss: 1.1018 - val_accuracy: 0.4466 - val_loss: 1.2638
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.5933 - loss: 0.9925 - val_accuracy: 0.4377 - val_loss: 1.3444
Epoch 7/20
[1m1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Regularizáció bevezetése

In [25]:
from tensorflow.keras.regularizers import l2

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout
L2_REG = 1e-4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=l2(L2_REG))
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"GRU units: {GRU_UNITS}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Dropout rate: {DROPOUT_RATE}")
print(f"L2 regularization: {L2_REG}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))




===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
GRU units: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.3
L2 regularization: 0.0001


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 153ms/step - accuracy: 0.2823 - loss: 1.5383 - val_accuracy: 0.3470 - val_loss: 1.4246
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 94ms/step - accuracy: 0.3041 - loss: 1.4674 - val_accuracy: 0.3719 - val_loss: 1.3668
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 89ms/step - accuracy: 0.3471 - loss: 1.3950 - val_accuracy: 0.4395 - val_loss: 1.3026
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 101ms/step - accuracy: 0.4532 - loss: 1.2711 - val_accuracy: 0.4324 - val_loss: 1.2929
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 98ms/step - accuracy: 0.5520 - loss: 1.1060 - val_accuracy: 0.4377 - val_loss: 1.2731
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 107ms/step - accuracy: 0.6031 - loss: 1.0102 - val_accuracy: 0.4537 - val_loss: 1.3599
Epoch 7/20
[

In [26]:
from tensorflow.keras.layers import Dropout

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"GRU units: {GRU_UNITS}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Dropout rate: {DROPOUT_RATE}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
GRU units: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.3




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 103ms/step - accuracy: 0.2911 - loss: 1.5232 - val_accuracy: 0.3185 - val_loss: 1.4296
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 68ms/step - accuracy: 0.3436 - loss: 1.4389 - val_accuracy: 0.3683 - val_loss: 1.3637
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 52ms/step - accuracy: 0.3782 - loss: 1.3639 - val_accuracy: 0.4270 - val_loss: 1.2875
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 110ms/step - accuracy: 0.5197 - loss: 1.1718 - val_accuracy: 0.4573 - val_loss: 1.3345
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 108ms/step - accuracy: 0.5911 - loss: 1.0146 - val_accuracy: 0.4395 - val_loss: 1.4202
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 121ms/step - accuracy: 0.6868 - loss: 0.8579 - val_accuracy: 0.4680 - val_loss: 1.5410
Epoch 7/20
[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
from tensorflow.keras.regularizers import l2

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout
L2_REG = 1e-4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=l2(L2_REG))
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"GRU units: {GRU_UNITS}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Dropout rate: {DROPOUT_RATE}")
print(f"L2 regularization: {L2_REG}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
GRU units: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.3
L2 regularization: 0.0001




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 132ms/step - accuracy: 0.3020 - loss: 1.5362 - val_accuracy: 0.3470 - val_loss: 1.4416
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.3021 - loss: 1.4648 - val_accuracy: 0.3470 - val_loss: 1.4030
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 57ms/step - accuracy: 0.3295 - loss: 1.4093 - val_accuracy: 0.4324 - val_loss: 1.3113
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 61ms/step - accuracy: 0.4597 - loss: 1.2451 - val_accuracy: 0.4324 - val_loss: 1.2541
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 121ms/step - accuracy: 0.5577 - loss: 1.0883 - val_accuracy: 0.4484 - val_loss: 1.2709
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 105ms/step - accuracy: 0.6576 - loss: 0.9279 - val_accuracy: 0.4555 - val_loss: 1.3635
Epoch 7/20
[

Visszatértem sima neurális hálókhoz, az LSTM és a GRU nem teljesítettek jól a tesztjeim alatt.

Különböző perceptron számok és több layer tesztelése


In [28]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3987 - loss: 1.3998 - val_accuracy: 0.4413 - val_loss: 1.2945
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5390 - loss: 1.1522 - val_accuracy: 0.4875 - val_loss: 1.2316
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7910 - loss: 0.6844 - val_accuracy: 0.4769 - val_loss: 1.3687
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.9249 - loss: 0.2909 - val_accuracy: 0.4520 - val_loss: 1.6247
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.9495 - loss: 0.1646 - val_accuracy: 0.4164 - val_loss: 1.7422
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.9629 - loss: 0.1255 - val_accuracy: 0.4288 - val_loss: 1.7879
Epoch 7/20
[1m176/176[

In [29]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.3922 - loss: 1.3978 - val_accuracy: 0.4466 - val_loss: 1.2913
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5230 - loss: 1.1796 - val_accuracy: 0.4626 - val_loss: 1.2776
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7564 - loss: 0.6916 - val_accuracy: 0.4573 - val_loss: 1.4576
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9190 - loss: 0.2871 - val_accuracy: 0.4769 - val_loss: 1.6948
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9458 - loss: 0.1764 - val_accuracy: 0.4431 - val_loss: 1.8357
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9595 - loss: 0.1376 - val_accuracy: 0.4626 - val_loss: 2.0276
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Innentől rengeteg kombinációban kipróbáltam különböző hálókat különböző dropout paraméterekkel, regularizácioval, embedding mérettel és vocabulary mérettel


In [30]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
DROPOUT_RATE = 0.2
for DROPOUT_RATE in [0.2, 0.3, 0.4]:
  model = Sequential([
      Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
      Flatten(),
      Dense(64, activation='relu'),
      Dropout(DROPOUT_RATE),
      Dense(32, activation='relu'),
      Dropout(DROPOUT_RATE),
      Dense(32, activation='relu'),
      Dropout(DROPOUT_RATE),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  print("\n===== TRAINING CONFIGURATION =====")
  print(f"MAX_VOCAB: {MAX_VOCAB}")
  print(f"MAX_LEN: {MAX_LEN}")
  print(f"Embedding dim: {EMBED_DIM}")
  print(f"Epochs: 20")
  print(f"Batch size: 16")
  print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
  print(f"Number of classes: {NUM_CLASSES}")
  print(f"Dropout rate: {DROPOUT_RATE}")
  model.build(input_shape=(None, MAX_LEN))
  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_pad,
      y_train_enc,
      validation_data=(X_val_pad, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_pad)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.2




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3452 - loss: 1.4503 - val_accuracy: 0.4288 - val_loss: 1.3291
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.4427 - loss: 1.3216 - val_accuracy: 0.4609 - val_loss: 1.2642
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6076 - loss: 1.0362 - val_accuracy: 0.4698 - val_loss: 1.2690
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8016 - loss: 0.5946 - val_accuracy: 0.4609 - val_loss: 1.5646
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8787 - loss: 0.3751 - val_accuracy: 0.4377 - val_loss: 1.7804
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9226 - loss: 0.2369 - val_accuracy: 0.4270 - val_loss: 1.8735
Epoch 7/20
[1m176/176[0m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.3363 - loss: 1.4955 - val_accuracy: 0.4128 - val_loss: 1.3263
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4110 - loss: 1.3415 - val_accuracy: 0.4662 - val_loss: 1.2997
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5227 - loss: 1.1799 - val_accuracy: 0.4502 - val_loss: 1.2984
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6793 - loss: 0.8489 - val_accuracy: 0.4502 - val_loss: 1.4051
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7870 - loss: 0.5445 - val_accuracy: 0.4698 - val_loss: 1.5870
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8804 - loss: 0.3650 - val_accuracy: 0.4644 - val_loss: 1.9563
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.3136 - loss: 1.4771 - val_accuracy: 0.4164 - val_loss: 1.4205
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.3437 - loss: 1.4004 - val_accuracy: 0.4359 - val_loss: 1.3074
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4608 - loss: 1.2638 - val_accuracy: 0.4609 - val_loss: 1.2654
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5612 - loss: 1.0893 - val_accuracy: 0.3968 - val_loss: 1.4065
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6966 - loss: 0.8371 - val_accuracy: 0.4484 - val_loss: 1.3651
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8009 - loss: 0.5727 - val_accuracy: 0.4217 - val_loss: 1.4551
Epoch 7/20
[1m176/176

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Bag of Word kipróbálása

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Bag-of-Words vectorization
# -----------------------------
MAX_FEATURES = 5000
vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))

X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_val_bow   = vectorizer.transform(X_val).toarray()
X_test_bow  = vectorizer.transform(X_test).toarray()

print("BoW feature size:", X_train_bow.shape[1])

# -----------------------------
# Neural Network (Dense + Dropout)
# -----------------------------
for DROPOUT_RATE in [0.2, 0.3, 0.4]:
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_bow.shape[1],)),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    print("\n===== TRAINING CONFIGURATION =====")
    print(f"Max Features: {MAX_FEATURES}")
    print(f"Embedding dim: {EMBED_DIM}")
    print(f"Epochs: 20")
    print(f"Batch size: 16")
    print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
    print(f"Number of classes: {NUM_CLASSES}")
    print(f"Dropout rate: {DROPOUT_RATE}")
    model.summary()

    # -----------------------------
    # EarlyStopping callback
    # -----------------------------
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.fit(
        X_train_bow,
        y_train_enc,
        validation_data=(X_val_bow, y_val_enc),
        epochs=20,
        batch_size=16,
        verbose=1,
        callbacks=[early_stop]
    )

    # -----------------------------
    # Evaluate on test set
    # -----------------------------
    test_preds = model.predict(X_test_bow)
    test_preds = np.argmax(test_preds, axis=1)
    decoded_preds = label_encoder.inverse_transform(test_preds)

    print("\nTest Classification Report (Dropout={:.1f}):".format(DROPOUT_RATE))
    print(classification_report(y_test, decoded_preds))



BoW feature size: 5000

===== TRAINING CONFIGURATION =====
Max Features: 5000
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.3189 - loss: 1.5585 - val_accuracy: 0.4288 - val_loss: 1.3839
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.4416 - loss: 1.3250 - val_accuracy: 0.4591 - val_loss: 1.2707
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.5944 - loss: 1.0262 - val_accuracy: 0.4733 - val_loss: 1.2865
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.7514 - loss: 0.7307 - val_accuracy: 0.4893 - val_loss: 1.3491
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8201 - loss: 0.5276 - val_accuracy: 0.4822 - val_loss: 1.5576
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.8831 - loss: 0.4066 - val_accuracy: 0.4591 - val_loss: 1.6420
Epoch 7/20
[1m176/176

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.3112 - loss: 1.5683 - val_accuracy: 0.4110 - val_loss: 1.3954
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4148 - loss: 1.3652 - val_accuracy: 0.4359 - val_loss: 1.2730
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5374 - loss: 1.1364 - val_accuracy: 0.4520 - val_loss: 1.2624
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.6012 - loss: 0.9617 - val_accuracy: 0.4893 - val_loss: 1.2928
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6842 - loss: 0.7897 - val_accuracy: 0.4555 - val_loss: 1.3888
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7790 - loss: 0.6451 - val_accuracy: 0.4448 - val_loss: 1.4853
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.2722 - loss: 1.5841 - val_accuracy: 0.3025 - val_loss: 1.4898
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.3555 - loss: 1.4827 - val_accuracy: 0.4004 - val_loss: 1.3780
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3913 - loss: 1.3665 - val_accuracy: 0.4253 - val_loss: 1.2821
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4810 - loss: 1.2325 - val_accuracy: 0.4573 - val_loss: 1.2382
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5624 - loss: 1.0860 - val_accuracy: 0.4502 - val_loss: 1.2386
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6192 - loss: 0.9835 - val_accuracy: 0.4466 - val_loss: 1.2748
Epoch 7/20
[1m176/176[0m 

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Bag-of-Words vectorization
# -----------------------------
MAX_FEATURES = 10000
vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))

X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_val_bow   = vectorizer.transform(X_val).toarray()
X_test_bow  = vectorizer.transform(X_test).toarray()

print("BoW feature size:", X_train_bow.shape[1])

# -----------------------------
# Neural Network (Dense + Dropout)
# -----------------------------
for DROPOUT_RATE in [0.2, 0.3, 0.4]:
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_bow.shape[1],)),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    print("\n===== TRAINING CONFIGURATION =====")
    print(f"Max Features: {MAX_FEATURES}")
    print(f"Embedding dim: {EMBED_DIM}")
    print(f"Epochs: 20")
    print(f"Batch size: 16")
    print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
    print(f"Number of classes: {NUM_CLASSES}")
    model.summary()

    # -----------------------------
    # EarlyStopping callback
    # -----------------------------
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.fit(
        X_train_bow,
        y_train_enc,
        validation_data=(X_val_bow, y_val_enc),
        epochs=20,
        batch_size=16,
        verbose=1,
        callbacks=[early_stop]
    )

    # -----------------------------
    # Evaluate on test set
    # -----------------------------
    test_preds = model.predict(X_test_bow)
    test_preds = np.argmax(test_preds, axis=1)
    decoded_preds = label_encoder.inverse_transform(test_preds)

    print("\nTest Classification Report (Dropout={:.1f}):".format(DROPOUT_RATE))
    print(classification_report(y_test, decoded_preds))


BoW feature size: 10000

===== TRAINING CONFIGURATION =====
Max Features: 10000
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.2949 - loss: 1.5689 - val_accuracy: 0.4359 - val_loss: 1.3158
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.5379 - loss: 1.1780 - val_accuracy: 0.4537 - val_loss: 1.2519
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.6737 - loss: 0.9013 - val_accuracy: 0.4733 - val_loss: 1.3351
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.7860 - loss: 0.6147 - val_accuracy: 0.4537 - val_loss: 1.5548
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.8613 - loss: 0.4453 - val_accuracy: 0.4466 - val_loss: 1.6419
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.9114 - loss: 0.2961 - val_accuracy: 0.4359 - val_loss: 1.8296
Epoch 7/20
[1m176/176

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 31ms/step - accuracy: 0.3003 - loss: 1.5712 - val_accuracy: 0.3968 - val_loss: 1.4335
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.4342 - loss: 1.3634 - val_accuracy: 0.4431 - val_loss: 1.3046
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.5652 - loss: 1.1099 - val_accuracy: 0.4680 - val_loss: 1.2501
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.6786 - loss: 0.8918 - val_accuracy: 0.4484 - val_loss: 1.3105
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7368 - loss: 0.7139 - val_accuracy: 0.4502 - val_loss: 1.4551
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8194 - loss: 0.5677 - val_accuracy: 0.4573 - val_loss: 1.6238
Epoch 7/20
[1m176/17

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.2773 - loss: 1.5910 - val_accuracy: 0.3737 - val_loss: 1.4770
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.3559 - loss: 1.4677 - val_accuracy: 0.4004 - val_loss: 1.3798
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.4336 - loss: 1.3677 - val_accuracy: 0.4075 - val_loss: 1.3082
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 33ms/step - accuracy: 0.5252 - loss: 1.2138 - val_accuracy: 0.4359 - val_loss: 1.2708
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.6015 - loss: 1.0426 - val_accuracy: 0.4520 - val_loss: 1.2647
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.6811 - loss: 0.8928 - val_accuracy: 0.4537 - val_loss: 1.3182
Epoch 7/20
[1m176/176

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Bag-of-Words vectorization
# -----------------------------
for MAX_FEATURES in [5000, 10000, 20000]:
  vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))

  X_train_bow = vectorizer.fit_transform(X_train).toarray()
  X_val_bow   = vectorizer.transform(X_val).toarray()
  X_test_bow  = vectorizer.transform(X_test).toarray()

  print("BoW feature size:", X_train_bow.shape[1])

  # -----------------------------
  # Neural Network (Dense, no Dropout)
  # -----------------------------
  model = Sequential([
      Dense(64, activation='relu', input_shape=(X_train_bow.shape[1],)),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )
  print("\n===== TRAINING CONFIGURATION =====")
  print(f"Max Features: {MAX_FEATURES}")
  print(f"Embedding dim: {EMBED_DIM}")
  print(f"Epochs: 20")
  print(f"Batch size: 16")
  print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
  print(f"Number of classes: {NUM_CLASSES}")
  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_bow,
      y_train_enc,
      validation_data=(X_val_bow, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_bow)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))


BoW feature size: 5000

===== TRAINING CONFIGURATION =====
Max Features: 5000
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.2915 - loss: 1.5504 - val_accuracy: 0.4324 - val_loss: 1.3059
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.5981 - loss: 1.0547 - val_accuracy: 0.4858 - val_loss: 1.2291
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.8182 - loss: 0.5773 - val_accuracy: 0.4662 - val_loss: 1.4963
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9049 - loss: 0.3165 - val_accuracy: 0.4591 - val_loss: 1.8368
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.9351 - loss: 0.2012 - val_accuracy: 0.4786 - val_loss: 2.1181
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9474 - loss: 0.1474 - val_accuracy: 0.4698 - val_loss: 2.1674
Epoch 7/20
[1m176/176

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 42ms/step - accuracy: 0.3227 - loss: 1.5505 - val_accuracy: 0.4359 - val_loss: 1.2924
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.6065 - loss: 1.0143 - val_accuracy: 0.4715 - val_loss: 1.2704
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - accuracy: 0.8626 - loss: 0.4941 - val_accuracy: 0.4609 - val_loss: 1.5424
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.9130 - loss: 0.2717 - val_accuracy: 0.4573 - val_loss: 1.8961
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.9399 - loss: 0.1797 - val_accuracy: 0.4359 - val_loss: 2.0025
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.9425 - loss: 0.1423 - val_accuracy: 0.4431 - val_loss: 2.2491
Epoch 7/20
[1m176/1

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - accuracy: 0.3138 - loss: 1.5233 - val_accuracy: 0.4395 - val_loss: 1.2850
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.6843 - loss: 0.8965 - val_accuracy: 0.4733 - val_loss: 1.2860
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.8934 - loss: 0.3883 - val_accuracy: 0.4662 - val_loss: 1.6112
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 33ms/step - accuracy: 0.9367 - loss: 0.2307 - val_accuracy: 0.4893 - val_loss: 1.7728
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - accuracy: 0.9356 - loss: 0.1544 - val_accuracy: 0.4804 - val_loss: 1.9702
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.9455 - loss: 0.1313 - val_accuracy: 0.4448 - val_loss: 1.9681
[1m12/12[0m [32m━━━

Visszatérés embeddingre

In [35]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
for EMBED_DIM in [32, 64, 128]:

  model = Sequential([
      Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
      Flatten(),
      Dense(64, activation='relu'),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  print("\n===== TRAINING CONFIGURATION =====")
  print(f"MAX_VOCAB: {MAX_VOCAB}")
  print(f"MAX_LEN: {MAX_LEN}")
  print(f"Embedding dim: {EMBED_DIM}")
  print(f"Epochs: 20")
  print(f"Batch size: 16")
  print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
  print(f"Number of classes: {NUM_CLASSES}")
  model.build(input_shape=(None, MAX_LEN))
  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_pad,
      y_train_enc,
      validation_data=(X_val_pad, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_pad)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.3721 - loss: 1.4339 - val_accuracy: 0.4039 - val_loss: 1.2997
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.5029 - loss: 1.1975 - val_accuracy: 0.4715 - val_loss: 1.2435
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.7339 - loss: 0.7553 - val_accuracy: 0.4698 - val_loss: 1.4389
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.8937 - loss: 0.3580 - val_accuracy: 0.4520 - val_loss: 1.7839
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9465 - loss: 0.1924 - val_accuracy: 0.4573 - val_loss: 2.0027
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9391 - loss: 0.1664 - val_accuracy: 0.4359 - val_loss: 2.2083
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.3586 - loss: 1.3995 - val_accuracy: 0.4466 - val_loss: 1.2891
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.5160 - loss: 1.1649 - val_accuracy: 0.4662 - val_loss: 1.2962
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8064 - loss: 0.5770 - val_accuracy: 0.4662 - val_loss: 1.4372
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9387 - loss: 0.2180 - val_accuracy: 0.4698 - val_loss: 1.6257
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9568 - loss: 0.1419 - val_accuracy: 0.4626 - val_loss: 1.8884
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9535 - loss: 0.1040 - val_accuracy: 0.4431 - val_loss: 2.1285
[1m12/12[0m [32m━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.3856 - loss: 1.3956 - val_accuracy: 0.4431 - val_loss: 1.2828
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.5726 - loss: 1.1092 - val_accuracy: 0.4573 - val_loss: 1.2881
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.8468 - loss: 0.4811 - val_accuracy: 0.4359 - val_loss: 1.4822
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.9445 - loss: 0.1912 - val_accuracy: 0.3950 - val_loss: 1.9215
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.9601 - loss: 0.1276 - val_accuracy: 0.4680 - val_loss: 1.7893
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9601 - loss: 0.1023 - val_accuracy: 0.4573 - val_loss: 1.9939
[1m12/12[0m [32m━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
for EMBED_DIM in [32, 64, 128]:

  model = Sequential([
      Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
      Flatten(),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  print("\n===== TRAINING CONFIGURATION =====")
  print(f"MAX_VOCAB: {MAX_VOCAB}")
  print(f"MAX_LEN: {MAX_LEN}")
  print(f"Embedding dim: {EMBED_DIM}")
  print(f"Epochs: 20")
  print(f"Batch size: 16")
  print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
  print(f"Number of classes: {NUM_CLASSES}")
  model.build(input_shape=(None, MAX_LEN))
  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_pad,
      y_train_enc,
      validation_data=(X_val_pad, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_pad)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3561 - loss: 1.4147 - val_accuracy: 0.4306 - val_loss: 1.2960
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5248 - loss: 1.2058 - val_accuracy: 0.4840 - val_loss: 1.2243
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7334 - loss: 0.7882 - val_accuracy: 0.4609 - val_loss: 1.3514
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8974 - loss: 0.3810 - val_accuracy: 0.4769 - val_loss: 1.4589
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9473 - loss: 0.1790 - val_accuracy: 0.4662 - val_loss: 1.6543
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9617 - loss: 0.1321 - val_accuracy: 0.4591 - val_loss: 1.7023
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.3724 - loss: 1.4114 - val_accuracy: 0.4004 - val_loss: 1.3114
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5222 - loss: 1.1621 - val_accuracy: 0.4840 - val_loss: 1.2341
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7983 - loss: 0.6267 - val_accuracy: 0.5000 - val_loss: 1.3715
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9311 - loss: 0.2629 - val_accuracy: 0.4555 - val_loss: 1.5263
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9562 - loss: 0.1558 - val_accuracy: 0.4715 - val_loss: 1.6259
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9597 - loss: 0.1349 - val_accuracy: 0.4466 - val_loss: 1.7004
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.3728 - loss: 1.4101 - val_accuracy: 0.4217 - val_loss: 1.2928
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.6070 - loss: 1.0620 - val_accuracy: 0.4929 - val_loss: 1.2470
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.8592 - loss: 0.4744 - val_accuracy: 0.4484 - val_loss: 1.4762
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9356 - loss: 0.2162 - val_accuracy: 0.4626 - val_loss: 1.5804
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9597 - loss: 0.1310 - val_accuracy: 0.4662 - val_loss: 1.6734
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9692 - loss: 0.0941 - val_accuracy: 0.4520 - val_loss: 1.8112
Epoch 7/20
[1m176/176

In [37]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
for EMBED_DIM in [32, 64, 128]:
    for DROPOUT_RATE in [0.2, 0.3, 0.4]:

      model = Sequential([
          Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
          Flatten(),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(NUM_CLASSES, activation='softmax')
      ])

      model.compile(
          optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['accuracy']
      )

      print("\n===== TRAINING CONFIGURATION =====")
      print(f"MAX_VOCAB: {MAX_VOCAB}")
      print(f"MAX_LEN: {MAX_LEN}")
      print(f"Embedding dim: {EMBED_DIM}")
      print(f"Epochs: 20")
      print(f"Batch size: 16")
      print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
      print(f"Number of classes: {NUM_CLASSES}")
      print(f"Dropout rate: {DROPOUT_RATE}")
      model.build(input_shape=(None, MAX_LEN))
      model.summary()

      # -----------------------------
      # EarlyStopping callback
      # -----------------------------
      early_stop = EarlyStopping(
          monitor='val_loss',
          patience=5,
          restore_best_weights=True
      )

      # -----------------------------
      # Train
      # -----------------------------
      model.fit(
          X_train_pad,
          y_train_enc,
          validation_data=(X_val_pad, y_val_enc),
          epochs=20,
          batch_size=16,
          verbose=1,
          callbacks=[early_stop]
      )

      # -----------------------------
      # Evaluate on test set
      # -----------------------------
      test_preds = model.predict(X_test_pad)
      test_preds = np.argmax(test_preds, axis=1)
      decoded_preds = label_encoder.inverse_transform(test_preds)

      print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
      print(classification_report(y_test, decoded_preds))



===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.2




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3347 - loss: 1.4784 - val_accuracy: 0.4217 - val_loss: 1.3162
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4455 - loss: 1.3030 - val_accuracy: 0.4306 - val_loss: 1.2865
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5452 - loss: 1.1235 - val_accuracy: 0.4644 - val_loss: 1.2523
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7319 - loss: 0.7691 - val_accuracy: 0.4555 - val_loss: 1.4093
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8638 - loss: 0.4534 - val_accuracy: 0.4324 - val_loss: 1.5977
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9231 - loss: 0.2508 - val_accuracy: 0.4164 - val_loss: 1.7730
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3354 - loss: 1.4528 - val_accuracy: 0.4057 - val_loss: 1.3191
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4502 - loss: 1.3074 - val_accuracy: 0.4502 - val_loss: 1.2600
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5570 - loss: 1.0867 - val_accuracy: 0.4591 - val_loss: 1.2797
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7410 - loss: 0.7209 - val_accuracy: 0.4715 - val_loss: 1.3475
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8529 - loss: 0.4583 - val_accuracy: 0.4698 - val_loss: 1.5609
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9122 - loss: 0.2917 - val_accuracy: 0.4769 - val_loss: 1.6664
Epoch 7/20
[1m176/176[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.2803 - loss: 1.5266 - val_accuracy: 0.4146 - val_loss: 1.3469
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3981 - loss: 1.3689 - val_accuracy: 0.4093 - val_loss: 1.3486
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4909 - loss: 1.2550 - val_accuracy: 0.4662 - val_loss: 1.2917
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6354 - loss: 0.9454 - val_accuracy: 0.4680 - val_loss: 1.2830
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7627 - loss: 0.6878 - val_accuracy: 0.4591 - val_loss: 1.4716
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8503 - loss: 0.4667 - val_accuracy: 0.4644 - val_loss: 1.7367
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.3601 - loss: 1.4365 - val_accuracy: 0.4270 - val_loss: 1.3253
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.4583 - loss: 1.2924 - val_accuracy: 0.4769 - val_loss: 1.2415
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.6725 - loss: 0.8993 - val_accuracy: 0.4680 - val_loss: 1.3263
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8792 - loss: 0.4080 - val_accuracy: 0.4591 - val_loss: 1.5419
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9201 - loss: 0.2558 - val_accuracy: 0.4520 - val_loss: 1.7770
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9494 - loss: 0.1777 - val_accuracy: 0.4342 - val_loss: 1.7761
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.3126 - loss: 1.5095 - val_accuracy: 0.4217 - val_loss: 1.3403
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4449 - loss: 1.3228 - val_accuracy: 0.4324 - val_loss: 1.2764
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5912 - loss: 1.0568 - val_accuracy: 0.4591 - val_loss: 1.2490
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7718 - loss: 0.6635 - val_accuracy: 0.4288 - val_loss: 1.4115
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8845 - loss: 0.4029 - val_accuracy: 0.4342 - val_loss: 1.5816
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9227 - loss: 0.2590 - val_accuracy: 0.4270 - val_loss: 1.6910
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3136 - loss: 1.4983 - val_accuracy: 0.4093 - val_loss: 1.3569
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4370 - loss: 1.3362 - val_accuracy: 0.4502 - val_loss: 1.2989
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.5862 - loss: 1.1044 - val_accuracy: 0.4626 - val_loss: 1.2769
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7277 - loss: 0.7741 - val_accuracy: 0.4502 - val_loss: 1.3933
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8658 - loss: 0.4149 - val_accuracy: 0.4288 - val_loss: 1.5247
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9152 - loss: 0.3092 - val_accuracy: 0.4324 - val_loss: 1.7143
Epoch 7/20
[1m176/176

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.3715 - loss: 1.4480 - val_accuracy: 0.3986 - val_loss: 1.3409
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.5180 - loss: 1.2347 - val_accuracy: 0.4715 - val_loss: 1.2528
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.7632 - loss: 0.7074 - val_accuracy: 0.4698 - val_loss: 1.3481
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8911 - loss: 0.3698 - val_accuracy: 0.4377 - val_loss: 1.6570
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9387 - loss: 0.2201 - val_accuracy: 0.4502 - val_loss: 1.7427
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9434 - loss: 0.1838 - val_accuracy: 0.4662 - val_loss: 1.9986
Epoch 7/20
[1m176/176

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.3483 - loss: 1.4718 - val_accuracy: 0.4395 - val_loss: 1.3220
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.4816 - loss: 1.2530 - val_accuracy: 0.4982 - val_loss: 1.2418
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.6616 - loss: 0.8585 - val_accuracy: 0.4822 - val_loss: 1.3279
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8536 - loss: 0.4573 - val_accuracy: 0.4573 - val_loss: 1.5669
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9163 - loss: 0.2756 - val_accuracy: 0.4128 - val_loss: 1.7635
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9353 - loss: 0.2281 - val_accuracy: 0.4555 - val_loss: 1.7334
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.3408 - loss: 1.5009 - val_accuracy: 0.4377 - val_loss: 1.3259
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.4406 - loss: 1.3532 - val_accuracy: 0.4484 - val_loss: 1.2657
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.5443 - loss: 1.1393 - val_accuracy: 0.4591 - val_loss: 1.2380
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.7168 - loss: 0.7885 - val_accuracy: 0.4626 - val_loss: 1.3012
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8499 - loss: 0.4763 - val_accuracy: 0.4164 - val_loss: 1.4801
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8856 - loss: 0.3736 - val_accuracy: 0.4609 - val_loss: 1.6198
Epoch 7/20
[1m176/176

In [38]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
for EMBED_DIM in [32]:
    for DROPOUT_RATE in [0.0, 0.3]:

      model = Sequential([
          Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
          Flatten(),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(NUM_CLASSES, activation='softmax')
      ])

      model.compile(
          optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['accuracy']
      )

      print("\n===== TRAINING CONFIGURATION =====")
      print(f"MAX_VOCAB: {MAX_VOCAB}")
      print(f"MAX_LEN: {MAX_LEN}")
      print(f"Embedding dim: {EMBED_DIM}")
      print(f"Epochs: 20")
      print(f"Batch size: 16")
      print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
      print(f"Number of classes: {NUM_CLASSES}")
      print(f"Dropout rate: {DROPOUT_RATE}")
      model.build(input_shape=(None, MAX_LEN))
      model.summary()

      # -----------------------------
      # EarlyStopping callback
      # -----------------------------
      early_stop = EarlyStopping(
          monitor='val_loss',
          patience=5,
          restore_best_weights=True
      )

      # -----------------------------
      # Train
      # -----------------------------
      model.fit(
          X_train_pad,
          y_train_enc,
          validation_data=(X_val_pad, y_val_enc),
          epochs=20,
          batch_size=16,
          verbose=1,
          callbacks=[early_stop]
      )

      # -----------------------------
      # Evaluate on test set
      # -----------------------------
      test_preds = model.predict(X_test_pad)
      test_preds = np.argmax(test_preds, axis=1)
      decoded_preds = label_encoder.inverse_transform(test_preds)

      print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
      print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.0




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3621 - loss: 1.4120 - val_accuracy: 0.4306 - val_loss: 1.2907
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4999 - loss: 1.2099 - val_accuracy: 0.4893 - val_loss: 1.2221
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7386 - loss: 0.7565 - val_accuracy: 0.4751 - val_loss: 1.3835
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9033 - loss: 0.3391 - val_accuracy: 0.4769 - val_loss: 1.5912
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9460 - loss: 0.1894 - val_accuracy: 0.4591 - val_loss: 1.7047
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9529 - loss: 0.1383 - val_accuracy: 0.4662 - val_loss: 1.8241
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.3284 - loss: 1.4869 - val_accuracy: 0.4235 - val_loss: 1.3241
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4358 - loss: 1.3069 - val_accuracy: 0.4484 - val_loss: 1.2703
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5661 - loss: 1.1024 - val_accuracy: 0.4698 - val_loss: 1.2426
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6990 - loss: 0.8034 - val_accuracy: 0.4840 - val_loss: 1.3553
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8621 - loss: 0.4387 - val_accuracy: 0.4217 - val_loss: 1.6273
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8982 - loss: 0.3278 - val_accuracy: 0.4502 - val_loss: 1.7719
Epoch 7/20
[1m176/176[0m 

In [40]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [5000, 10000, 20000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense)
  # -----------------------------
  for EMBED_DIM in [32]:

    model = Sequential([
        Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
        Flatten(),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    print("\n===== TRAINING CONFIGURATION =====")
    print(f"MAX_VOCAB: {MAX_VOCAB}")
    print(f"MAX_LEN: {MAX_LEN}")
    print(f"Embedding dim: {EMBED_DIM}")
    print(f"Epochs: 20")
    print(f"Batch size: 16")
    print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
    print(f"Number of classes: {NUM_CLASSES}")
    model.build(input_shape=(None, MAX_LEN))
    model.summary()

    # -----------------------------
    # EarlyStopping callback
    # -----------------------------
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.fit(
        X_train_pad,
        y_train_enc,
        validation_data=(X_val_pad, y_val_enc),
        epochs=20,
        batch_size=16,
        verbose=1,
        callbacks=[early_stop]
    )

    # -----------------------------
    # Evaluate on test set
    # -----------------------------
    test_preds = model.predict(X_test_pad)
    test_preds = np.argmax(test_preds, axis=1)
    decoded_preds = label_encoder.inverse_transform(test_preds)

    print("\nTest Classification Report:")
    print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3921 - loss: 1.4030 - val_accuracy: 0.4110 - val_loss: 1.2970
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5033 - loss: 1.2351 - val_accuracy: 0.4751 - val_loss: 1.2257
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7162 - loss: 0.8326 - val_accuracy: 0.4875 - val_loss: 1.3265
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8948 - loss: 0.3647 - val_accuracy: 0.4520 - val_loss: 1.5751
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9379 - loss: 0.2133 - val_accuracy: 0.4502 - val_loss: 1.6591
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9541 - loss: 0.1492 - val_accuracy: 0.4466 - val_loss: 1.8765
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.3360 - loss: 1.4259 - val_accuracy: 0.4181 - val_loss: 1.3144
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5189 - loss: 1.2162 - val_accuracy: 0.4520 - val_loss: 1.2696
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7945 - loss: 0.6911 - val_accuracy: 0.4715 - val_loss: 1.4056
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9180 - loss: 0.2616 - val_accuracy: 0.4342 - val_loss: 1.5612
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9527 - loss: 0.1471 - val_accuracy: 0.4502 - val_loss: 1.6771
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9592 - loss: 0.1086 - val_accuracy: 0.4270 - val_loss: 1.7699
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



===== TRAINING CONFIGURATION =====
MAX_VOCAB: 20000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.3472 - loss: 1.4515 - val_accuracy: 0.4146 - val_loss: 1.3165
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5226 - loss: 1.1864 - val_accuracy: 0.4662 - val_loss: 1.2553
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8716 - loss: 0.5012 - val_accuracy: 0.4395 - val_loss: 1.4763
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9398 - loss: 0.2127 - val_accuracy: 0.4555 - val_loss: 1.6125
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9530 - loss: 0.1498 - val_accuracy: 0.4573 - val_loss: 1.6798
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9588 - loss: 0.1258 - val_accuracy: 0.4502 - val_loss: 1.7469
Epoch 7/20
[1m176/176

In [41]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [5000, 10000, 20000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense with Dropout)
  # -----------------------------
  for EMBED_DIM in [32]:
      for DROPOUT_RATE in [0.2, 0.3, 0.4]:

        model = Sequential([
            Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(NUM_CLASSES, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        print("\n===== TRAINING CONFIGURATION =====")
        print(f"MAX_VOCAB: {MAX_VOCAB}")
        print(f"MAX_LEN: {MAX_LEN}")
        print(f"Embedding dim: {EMBED_DIM}")
        print(f"Epochs: 20")
        print(f"Batch size: 16")
        print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
        print(f"Number of classes: {NUM_CLASSES}")
        print(f"Dropout rate: {DROPOUT_RATE}")
        model.build(input_shape=(None, MAX_LEN))
        model.summary()

        # -----------------------------
        # EarlyStopping callback
        # -----------------------------
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # -----------------------------
        # Train
        # -----------------------------
        model.fit(
            X_train_pad,
            y_train_enc,
            validation_data=(X_val_pad, y_val_enc),
            epochs=20,
            batch_size=16,
            verbose=1,
            callbacks=[early_stop]
        )

        # -----------------------------
        # Evaluate on test set
        # -----------------------------
        test_preds = model.predict(X_test_pad)
        test_preds = np.argmax(test_preds, axis=1)
        decoded_preds = label_encoder.inverse_transform(test_preds)
        print(f"Max vocab:{MAX_VOCAB}")
        print(f"Dropout rate:{DROPOUT_RATE}")
        print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
        print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 5000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.2




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3627 - loss: 1.4312 - val_accuracy: 0.4253 - val_loss: 1.3220
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4623 - loss: 1.2943 - val_accuracy: 0.4537 - val_loss: 1.2740
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6075 - loss: 1.0427 - val_accuracy: 0.4804 - val_loss: 1.2249
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7775 - loss: 0.6512 - val_accuracy: 0.4840 - val_loss: 1.3741
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8900 - loss: 0.3443 - val_accuracy: 0.4591 - val_loss: 1.5411
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9269 - loss: 0.2484 - val_accuracy: 0.4591 - val_loss: 1.7625
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3365 - loss: 1.4479 - val_accuracy: 0.3932 - val_loss: 1.3607
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4443 - loss: 1.3069 - val_accuracy: 0.4662 - val_loss: 1.2577
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5788 - loss: 1.0973 - val_accuracy: 0.4733 - val_loss: 1.2270
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7086 - loss: 0.7942 - val_accuracy: 0.4751 - val_loss: 1.3410
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8480 - loss: 0.4808 - val_accuracy: 0.4502 - val_loss: 1.6382
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8995 - loss: 0.3339 - val_accuracy: 0.4484 - val_loss: 1.7891
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.2737 - loss: 1.5432 - val_accuracy: 0.4181 - val_loss: 1.3468
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4005 - loss: 1.3744 - val_accuracy: 0.4342 - val_loss: 1.3353
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4855 - loss: 1.2431 - val_accuracy: 0.4662 - val_loss: 1.2484
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6240 - loss: 0.9546 - val_accuracy: 0.4591 - val_loss: 1.2952
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7526 - loss: 0.6999 - val_accuracy: 0.4253 - val_loss: 1.4528
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8257 - loss: 0.5093 - val_accuracy: 0.4377 - val_loss: 1.5387
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.2




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.3259 - loss: 1.4814 - val_accuracy: 0.4324 - val_loss: 1.3473
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4712 - loss: 1.2765 - val_accuracy: 0.4698 - val_loss: 1.2695
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6584 - loss: 0.9420 - val_accuracy: 0.4786 - val_loss: 1.2562
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8488 - loss: 0.5203 - val_accuracy: 0.4822 - val_loss: 1.4257
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9275 - loss: 0.2621 - val_accuracy: 0.4591 - val_loss: 1.6351
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9503 - loss: 0.1722 - val_accuracy: 0.4342 - val_loss: 1.7608
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.2622 - loss: 1.5003 - val_accuracy: 0.4359 - val_loss: 1.3277
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4235 - loss: 1.3287 - val_accuracy: 0.4644 - val_loss: 1.2817
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5934 - loss: 1.0938 - val_accuracy: 0.4520 - val_loss: 1.3000
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7834 - loss: 0.6402 - val_accuracy: 0.4715 - val_loss: 1.4583
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8937 - loss: 0.3708 - val_accuracy: 0.4217 - val_loss: 1.7504
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9351 - loss: 0.2367 - val_accuracy: 0.4680 - val_loss: 1.7466
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.3136 - loss: 1.4981 - val_accuracy: 0.4235 - val_loss: 1.3270
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.3905 - loss: 1.3717 - val_accuracy: 0.4555 - val_loss: 1.2991
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5364 - loss: 1.1695 - val_accuracy: 0.4822 - val_loss: 1.2740
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6901 - loss: 0.8017 - val_accuracy: 0.4662 - val_loss: 1.3627
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8134 - loss: 0.5261 - val_accuracy: 0.4751 - val_loss: 1.5448
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8889 - loss: 0.3461 - val_accuracy: 0.4698 - val_loss: 1.7305
Epoch 7/20
[1m176/176[0



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3498 - loss: 1.4378 - val_accuracy: 0.4377 - val_loss: 1.3124
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.5135 - loss: 1.2130 - val_accuracy: 0.4698 - val_loss: 1.2505
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7312 - loss: 0.7632 - val_accuracy: 0.4448 - val_loss: 1.3578
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9130 - loss: 0.3238 - val_accuracy: 0.4502 - val_loss: 1.5370
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9470 - loss: 0.2013 - val_accuracy: 0.4537 - val_loss: 1.7269
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9592 - loss: 0.1603 - val_accuracy: 0.4377 - val_loss: 1.6832
Epoch 7/20
[1m176/176

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.3654 - loss: 1.4335 - val_accuracy: 0.4253 - val_loss: 1.3365
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4790 - loss: 1.2842 - val_accuracy: 0.4875 - val_loss: 1.2536
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.6711 - loss: 0.9139 - val_accuracy: 0.4875 - val_loss: 1.3118
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8738 - loss: 0.4282 - val_accuracy: 0.4217 - val_loss: 1.5241
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9299 - loss: 0.2718 - val_accuracy: 0.4893 - val_loss: 1.6508
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9451 - loss: 0.1963 - val_accuracy: 0.4413 - val_loss: 1.7219
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.3035 - loss: 1.5002 - val_accuracy: 0.4306 - val_loss: 1.3342
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.4139 - loss: 1.3340 - val_accuracy: 0.4555 - val_loss: 1.2760
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5679 - loss: 1.0998 - val_accuracy: 0.4733 - val_loss: 1.2779
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.7053 - loss: 0.7745 - val_accuracy: 0.4822 - val_loss: 1.3051
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8337 - loss: 0.4770 - val_accuracy: 0.4804 - val_loss: 1.4723
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9098 - loss: 0.3138 - val_accuracy: 0.4626 - val_loss: 1.6294
Epoch 7/20
[1m176/176

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [10000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense with Dropout)
  # -----------------------------
  for EMBED_DIM in [32]:
      for DROPOUT_RATE in [0.4]:

        model = Sequential([
            Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(NUM_CLASSES, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        print("\n===== TRAINING CONFIGURATION =====")
        print(f"MAX_VOCAB: {MAX_VOCAB}")
        print(f"MAX_LEN: {MAX_LEN}")
        print(f"Embedding dim: {EMBED_DIM}")
        print(f"Epochs: 20")
        print(f"Batch size: 16")
        print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
        print(f"Number of classes: {NUM_CLASSES}")
        print(f"Dropout rate: {DROPOUT_RATE}")
        model.build(input_shape=(None, MAX_LEN))
        model.summary()

        # -----------------------------
        # EarlyStopping callback
        # -----------------------------
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # -----------------------------
        # Train
        # -----------------------------
        model.fit(
            X_train_pad,
            y_train_enc,
            validation_data=(X_val_pad, y_val_enc),
            epochs=20,
            batch_size=16,
            verbose=1,
            callbacks=[early_stop]
        )

        # -----------------------------
        # Evaluate on test set
        # -----------------------------
        test_preds = model.predict(X_test_pad)
        test_preds = np.argmax(test_preds, axis=1)
        decoded_preds = label_encoder.inverse_transform(test_preds)
        print(f"Max vocab:{MAX_VOCAB}")
        print(f"Dropout rate:{DROPOUT_RATE}")
        print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
        print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.4




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.3418 - loss: 1.4620 - val_accuracy: 0.4146 - val_loss: 1.3445
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4175 - loss: 1.3425 - val_accuracy: 0.4591 - val_loss: 1.2699
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5391 - loss: 1.1467 - val_accuracy: 0.4573 - val_loss: 1.2215
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6790 - loss: 0.8439 - val_accuracy: 0.4698 - val_loss: 1.2671
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7910 - loss: 0.6182 - val_accuracy: 0.4484 - val_loss: 1.4022
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8628 - loss: 0.4357 - val_accuracy: 0.4644 - val_loss: 1.6156
Epoch 7/20
[1m176/176[0

In [43]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [10000]:
    MAX_LEN = 100

    tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_val_seq   = tokenizer.texts_to_sequences(X_val)
    X_test_seq  = tokenizer.texts_to_sequences(X_test)

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
    X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
    X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

    # -----------------------------
    # Neural Network (Embedding → Flatten → Dense with Dropout + L2)
    # -----------------------------
    for EMBED_DIM in [32]:
        for DROPOUT_RATE in [0.4]:
            L2_REG = 1e-4  # L2 regularization factor

            model = Sequential([
                Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
                Flatten(),
                Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
                Dropout(DROPOUT_RATE),
                Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
                Dropout(DROPOUT_RATE),
                Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=l2(L2_REG))
            ])

            model.compile(
                optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )

            print("\n===== TRAINING CONFIGURATION =====")
            print(f"MAX_VOCAB: {MAX_VOCAB}")
            print(f"MAX_LEN: {MAX_LEN}")
            print(f"Embedding dim: {EMBED_DIM}")
            print(f"Epochs: 20")
            print(f"Batch size: 16")
            print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
            print(f"Number of classes: {NUM_CLASSES}")
            print(f"Dropout rate: {DROPOUT_RATE}")
            print(f"L2 regularization factor: {L2_REG}")
            model.build(input_shape=(None, MAX_LEN))
            model.summary()

            # -----------------------------
            # EarlyStopping callback
            # -----------------------------
            early_stop = EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            )

            # -----------------------------
            # Train
            # -----------------------------
            model.fit(
                X_train_pad,
                y_train_enc,
                validation_data=(X_val_pad, y_val_enc),
                epochs=20,
                batch_size=16,
                verbose=1,
                callbacks=[early_stop]
            )

            # -----------------------------
            # Evaluate on test set
            # -----------------------------
            test_preds = model.predict(X_test_pad)
            test_preds = np.argmax(test_preds, axis=1)
            decoded_preds = label_encoder.inverse_transform(test_preds)

            print(f"Max vocab: {MAX_VOCAB}")
            print(f"Dropout rate: {DROPOUT_RATE}")
            print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
            print(classification_report(y_test, decoded_preds))



===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.4
L2 regularization factor: 0.0001




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.3226 - loss: 1.4820 - val_accuracy: 0.4181 - val_loss: 1.3521
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4356 - loss: 1.3352 - val_accuracy: 0.4431 - val_loss: 1.2941
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.5372 - loss: 1.1667 - val_accuracy: 0.4626 - val_loss: 1.2624
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7106 - loss: 0.8167 - val_accuracy: 0.4698 - val_loss: 1.3431
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8429 - loss: 0.5214 - val_accuracy: 0.4858 - val_loss: 1.5719
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9109 - loss: 0.3442 - val_accuracy: 0.4680 - val_loss: 1.6466
Epoch 7/20
[1m176/176[0

In [44]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [10000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense with Dropout)
  # -----------------------------
  for EMBED_DIM in [32]:
      for DROPOUT_RATE in [0.4]:

        model = Sequential([
            Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(NUM_CLASSES, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        print("\n===== TRAINING CONFIGURATION =====")
        print(f"MAX_VOCAB: {MAX_VOCAB}")
        print(f"MAX_LEN: {MAX_LEN}")
        print(f"Embedding dim: {EMBED_DIM}")
        print(f"Epochs: 20")
        print(f"Batch size: 8")
        print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
        print(f"Number of classes: {NUM_CLASSES}")
        print(f"Dropout rate: {DROPOUT_RATE}")
        model.build(input_shape=(None, MAX_LEN))
        model.summary()

        # -----------------------------
        # EarlyStopping callback
        # -----------------------------
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # -----------------------------
        # Train
        # -----------------------------
        model.fit(
            X_train_pad,
            y_train_enc,
            validation_data=(X_val_pad, y_val_enc),
            epochs=20,
            batch_size=8,
            verbose=1,
            callbacks=[early_stop]
        )

        # -----------------------------
        # Evaluate on test set
        # -----------------------------
        test_preds = model.predict(X_test_pad)
        test_preds = np.argmax(test_preds, axis=1)
        decoded_preds = label_encoder.inverse_transform(test_preds)
        print(f"Max vocab:{MAX_VOCAB}")
        print(f"Dropout rate:{DROPOUT_RATE}")
        print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
        print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 8
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.4




Epoch 1/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.3351 - loss: 1.4654 - val_accuracy: 0.4199 - val_loss: 1.3215
Epoch 2/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.4407 - loss: 1.3222 - val_accuracy: 0.4484 - val_loss: 1.2572
Epoch 3/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5954 - loss: 1.0469 - val_accuracy: 0.4715 - val_loss: 1.2583
Epoch 4/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7721 - loss: 0.6600 - val_accuracy: 0.4591 - val_loss: 1.4350
Epoch 5/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8777 - loss: 0.4003 - val_accuracy: 0.4520 - val_loss: 1.7147
Epoch 6/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9168 - loss: 0.2637 - val_accuracy: 0.3612 - val_loss: 1.8802
Epoch 7/20
[1m352/352[0m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Végső választott modell.
Később a dropout törölve lett, mert úgy jobb eredményt adott.

In [45]:
#Végső modell tesztelése új adathalmaz felbontásban


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Split dataset into train / val / test
# -----------------------------
# First split: train vs temp (val + test)


# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
EMBED_DIM = 32
DROPOUT_RATE = 0.4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Dropout rate: {DROPOUT_RATE}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print(f"\nMax vocab: {MAX_VOCAB}")
print(f"Dropout rate: {DROPOUT_RATE}")
print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
print(classification_report(y_test, decoded_preds))



===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.4




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.3494 - loss: 1.4712 - val_accuracy: 0.4181 - val_loss: 1.3255
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4291 - loss: 1.3227 - val_accuracy: 0.4520 - val_loss: 1.2744
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5524 - loss: 1.1148 - val_accuracy: 0.4626 - val_loss: 1.2564
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7331 - loss: 0.7358 - val_accuracy: 0.4893 - val_loss: 1.3346
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8418 - loss: 0.5054 - val_accuracy: 0.4733 - val_loss: 1.5081
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8984 - loss: 0.3563 - val_accuracy: 0.4644 - val_loss: 1.7455
Epoch 7/20
[1m176/176[0m

In [46]:
#Lehetséges megoldás a kiegyenlítetlenségre
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

#Végső modell tesztelése új adathalmaz felbontásban


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np



# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
EMBED_DIM = 32
DROPOUT_RATE = 0.4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("\n===== TRAINING CONFIGURATION =====")
print(f"MAX_VOCAB: {MAX_VOCAB}")
print(f"MAX_LEN: {MAX_LEN}")
print(f"Embedding dim: {EMBED_DIM}")
print(f"Epochs: 20")
print(f"Batch size: 16")
print(f"Learning rate: {model.optimizer.learning_rate.numpy()}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Dropout rate: {DROPOUT_RATE}")
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

y_train_int = label_encoder.transform(y_train)  # already done as y_train_enc before one-hot
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_int),
    y=y_train_int
)
class_weights_dict = dict(enumerate(class_weights))

# -----------------------------
# Train model with class weights
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop],
    class_weight=class_weights_dict   # <-- add here
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print(f"\nMax vocab: {MAX_VOCAB}")
print(f"Dropout rate: {DROPOUT_RATE}")
print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
print(classification_report(y_test, decoded_preds))


===== TRAINING CONFIGURATION =====
MAX_VOCAB: 10000
MAX_LEN: 100
Embedding dim: 32
Epochs: 20
Batch size: 16
Learning rate: 0.0010000000474974513
Number of classes: 5
Dropout rate: 0.4




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.2952 - loss: 1.6141 - val_accuracy: 0.3310 - val_loss: 1.5254
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.3253 - loss: 1.4753 - val_accuracy: 0.3950 - val_loss: 1.3800
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4439 - loss: 1.2449 - val_accuracy: 0.4342 - val_loss: 1.3801
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5281 - loss: 1.0419 - val_accuracy: 0.4751 - val_loss: 1.2536
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6625 - loss: 0.7400 - val_accuracy: 0.4448 - val_loss: 1.3544
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7532 - loss: 0.5580 - val_accuracy: 0.4146 - val_loss: 1.4606
Epoch 7/20
[1m176/176[0m 

Accuracy romlott, de minden osztályra becsül a modell (végül nem lett használva)