Adatok előfeldolgozása

In [81]:
import zipfile

zip_path = "legaltextdecoder.zip"
extract_path = "data"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [82]:
base_dir = "data"
all_data = []

json_file_count = 0
record_count = 0

for root, dirs, files in os.walk(base_dir):
    dirs[:] = [d for d in dirs if d != "consensus"]

    for file in files:
        if file.endswith(".json"):
            json_file_count += 1
            file_path = os.path.join(root, file)

            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

                if isinstance(data, list):
                    all_data.extend(data)
                    record_count += len(data)
                else:
                    all_data.append(data)
                    record_count += 1

print(f"JSON files loaded: {json_file_count}")
print(f"Total data records: {record_count}")

JSON files loaded: 34
Total data records: 3897


In [83]:
texts = []
labels = []

for item in all_data:
    text = item.get("data", {}).get("text")

    if not text:
        continue

    annotations = item.get("annotations", [])
    if not annotations:
        continue

    ann = annotations[0]
    results = ann.get("result", [])
    if not results:
        continue

    choices = results[0].get("value", {}).get("choices", [])
    if not choices:
        continue

    label = choices[0]

    texts.append(text)
    labels.append(label)

print("Loaded data records:", len(texts))

Loaded data records: 3747


In [84]:
from collections import Counter

label_counts = Counter(labels)

print("Label distribution:")
for label, count in label_counts.items():
    print(f"{label}: {count}")


Label distribution:
5-Könnyen érthető: 1159
4-Érthető: 1122
3-Többé/kevésbé megértem: 816
2-Nehezen érthető: 450
1-Nagyon nehezen érthető: 200


In [13]:
#Baseline modell
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

#Túltanítás
train_texts = texts[:8]
train_labels = labels[:8]

test_texts = texts[8:]
test_labels = labels[8:]

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train = label_encoder.transform(train_labels)
y_test = label_encoder.transform(test_labels)

NUM_CLASSES = 5
y_train = np.clip(y_train, 0, NUM_CLASSES - 1)
y_test = np.clip(y_test, 0, NUM_CLASSES - 1)

y_train = to_categorical(y_train, num_classes=NUM_CLASSES)

MAX_FEATURES = 2000

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=1
)

X_train = tfidf.fit_transform(train_texts).toarray()
X_test = tfidf.transform(test_texts).toarray()

print("TF-IDF feature size:", X_train.shape[1])

# -----------------------------
# Neural Network (MLP)
# -----------------------------
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=2,
    verbose=1
)


TF-IDF feature size: 86


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.3167 - loss: 1.5985
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2333 - loss: 1.5454    
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9500 - loss: 1.4953 
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7667 - loss: 1.4741 
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9500 - loss: 1.3850 
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8667 - loss: 1.3597
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8667 - loss: 1.3214 
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9500 - loss: 1.2699 
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7985650805c0>

Az overfitt megtörtént.

In [85]:
import re

def clean_text(text):
    text = text.lower()                      # lowercase
    text = re.sub(r'\s+', ' ', text)        # collapse multiple spaces
    #text = re.sub(r'[^a-z0-9 ]', '', text)  # remove punctuation except letters/numbers
    return text.strip()

clean_texts = [clean_text(line) for line in texts]

Baseline modell betanítása

In [86]:
from sklearn.model_selection import train_test_split

# Train vs temp (val + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    texts,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42
)

# Val vs test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.40,
    stratify=y_temp,
    random_state=42
)

print("Train size:", len(X_train))
print("Val size:  ", len(X_val))
print("Test size: ", len(X_test))


Train size: 2810
Val size:   562
Test size:  375


In [30]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Fit on all labels

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

# One-hot encoding for MLP
NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# TF-IDF Vectorization
# -----------------------------
MAX_FEATURES = 2000

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=1
)

X_train_vec = tfidf.fit_transform(X_train).toarray()
X_val_vec   = tfidf.transform(X_val).toarray()
X_test_vec  = tfidf.transform(X_test).toarray()

print("TF-IDF feature size:", X_train_vec.shape[1])

# -----------------------------
# Neural Network (MLP)
# -----------------------------
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_vec.shape[1],)),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# Train (with validation)
# -----------------------------
model.fit(
    X_train_vec,
    y_train_enc,
    validation_data=(X_val_vec, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_vec)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))

TF-IDF feature size: 2000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.3108 - loss: 1.5360 - val_accuracy: 0.3665 - val_loss: 1.4373
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4571 - loss: 1.3409 - val_accuracy: 0.4217 - val_loss: 1.3399
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6022 - loss: 1.0484 - val_accuracy: 0.4235 - val_loss: 1.3679
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7073 - loss: 0.8042 - val_accuracy: 0.4253 - val_loss: 1.4628
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7993 - loss: 0.6282 - val_accuracy: 0.4199 - val_loss: 1.5941
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8539 - loss: 0.4938 - val_accuracy: 0.4342 - val_loss: 1.7781
Epoch 7/20
[1m176/176[0m 

Early stopping hozzáadása


In [31]:
from tensorflow.keras.callbacks import EarlyStopping


# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Fit on all labels

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# TF-IDF Vectorization
# -----------------------------
MAX_FEATURES = 2000

tfidf = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=1
)

X_train_vec = tfidf.fit_transform(X_train).toarray()
X_val_vec   = tfidf.transform(X_val).toarray()
X_test_vec  = tfidf.transform(X_test).toarray()

print("TF-IDF feature size:", X_train_vec.shape[1])

# -----------------------------
# Neural Network (MLP)
# -----------------------------
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_vec.shape[1],)),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',   # Stop when validation loss stops improving
    patience=5,           # Wait 5 epochs before stopping
    restore_best_weights=True
)

# -----------------------------
# Train with EarlyStopping
# -----------------------------
model.fit(
    X_train_vec,
    y_train_enc,
    validation_data=(X_val_vec, y_val_enc),
    epochs=100,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_vec)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))

TF-IDF feature size: 2000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2872 - loss: 1.5663 - val_accuracy: 0.3559 - val_loss: 1.4453
Epoch 2/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4267 - loss: 1.3520 - val_accuracy: 0.3950 - val_loss: 1.3411
Epoch 3/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5802 - loss: 1.0957 - val_accuracy: 0.4235 - val_loss: 1.3619
Epoch 4/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6912 - loss: 0.8465 - val_accuracy: 0.4288 - val_loss: 1.4474
Epoch 5/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7894 - loss: 0.6198 - val_accuracy: 0.4110 - val_loss: 1.5799
Epoch 6/100
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8580 - loss: 0.4748 - val_accuracy: 0.4199 - val_loss: 1.7275
Epoch 7/100
[1m176/17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TFIDF csere Embeddingre

In [32]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3463 - loss: 1.4337 - val_accuracy: 0.4431 - val_loss: 1.3229
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5030 - loss: 1.2260 - val_accuracy: 0.4626 - val_loss: 1.2755
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7561 - loss: 0.7329 - val_accuracy: 0.4840 - val_loss: 1.4157
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9212 - loss: 0.2908 - val_accuracy: 0.4484 - val_loss: 1.6979
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9464 - loss: 0.1868 - val_accuracy: 0.4359 - val_loss: 1.8112
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9647 - loss: 0.1284 - val_accuracy: 0.4502 - val_loss: 1.9868
Epoch 7/20
[1m176/176[0m 

LSTM kipróbálása

In [34]:
from tensorflow.keras.layers import LSTM
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → LSTM → Dense)
# -----------------------------
EMBED_DIM = 32
LSTM_UNITS = 16

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    LSTM(LSTM_UNITS),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.3002 - loss: 1.5088 - val_accuracy: 0.3523 - val_loss: 1.4049
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 33ms/step - accuracy: 0.3415 - loss: 1.3835 - val_accuracy: 0.4021 - val_loss: 1.3512
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.3949 - loss: 1.3618 - val_accuracy: 0.4306 - val_loss: 1.3254
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.4467 - loss: 1.2884 - val_accuracy: 0.4217 - val_loss: 1.2987
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.5011 - loss: 1.1949 - val_accuracy: 0.4555 - val_loss: 1.2988
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step - accuracy: 0.5957 - loss: 1.0520 - val_accuracy: 0.4448 - val_loss: 1.3712
Epoch 7/20
[1m176/1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


GRU kipróbálása (elvileg jó kis adathalmazokon)

In [35]:
from tensorflow.keras.layers import GRU
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → LSTM → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(32),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 50ms/step - accuracy: 0.2836 - loss: 1.5000 - val_accuracy: 0.3167 - val_loss: 1.4163
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 48ms/step - accuracy: 0.3461 - loss: 1.4164 - val_accuracy: 0.4306 - val_loss: 1.3221
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step - accuracy: 0.4462 - loss: 1.3107 - val_accuracy: 0.4342 - val_loss: 1.2891
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.5382 - loss: 1.1397 - val_accuracy: 0.4253 - val_loss: 1.3380
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 48ms/step - accuracy: 0.6201 - loss: 0.9683 - val_accuracy: 0.4199 - val_loss: 1.4348
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step - accuracy: 0.6581 - loss: 0.8559 - val_accuracy: 0.4164 - val_loss: 1.5764
Epoch 7/20
[1m176/

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Dropout bevezetése

In [37]:
from tensorflow.keras.layers import Dropout

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 53ms/step - accuracy: 0.3032 - loss: 1.5469 - val_accuracy: 0.3221 - val_loss: 1.4361
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - accuracy: 0.3049 - loss: 1.4602 - val_accuracy: 0.3505 - val_loss: 1.3805
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 43ms/step - accuracy: 0.3567 - loss: 1.3897 - val_accuracy: 0.4395 - val_loss: 1.3008
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - accuracy: 0.4680 - loss: 1.2299 - val_accuracy: 0.4484 - val_loss: 1.3022
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.5532 - loss: 1.0641 - val_accuracy: 0.4110 - val_loss: 1.3688
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 47ms/step - accuracy: 0.6334 - loss: 0.9619 - val_accuracy: 0.4288 - val_loss: 1.5428
Epoch 7/20
[1m176/1

Regularizáció bevezetése

In [42]:
from tensorflow.keras.regularizers import l2

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout
L2_REG = 1e-4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=l2(L2_REG))
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 87ms/step - accuracy: 0.2992 - loss: 1.5099 - val_accuracy: 0.3327 - val_loss: 1.4205
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 49ms/step - accuracy: 0.3431 - loss: 1.4282 - val_accuracy: 0.3772 - val_loss: 1.3647
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.3911 - loss: 1.3686 - val_accuracy: 0.4359 - val_loss: 1.3046
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 51ms/step - accuracy: 0.4516 - loss: 1.2512 - val_accuracy: 0.4359 - val_loss: 1.2814
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 49ms/step - accuracy: 0.5571 - loss: 1.0946 - val_accuracy: 0.4448 - val_loss: 1.3141
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 45ms/step - accuracy: 0.6130 - loss: 0.9823 - val_accuracy: 0.4324 - val_loss: 1.4261
Epoch 7/20
[1m176/

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
from tensorflow.keras.layers import Dropout

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 54ms/step - accuracy: 0.3180 - loss: 1.5166 - val_accuracy: 0.3523 - val_loss: 1.4149
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 43ms/step - accuracy: 0.3356 - loss: 1.4564 - val_accuracy: 0.3577 - val_loss: 1.3913
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - accuracy: 0.3408 - loss: 1.3893 - val_accuracy: 0.3719 - val_loss: 1.3614
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 49ms/step - accuracy: 0.4558 - loss: 1.2436 - val_accuracy: 0.4395 - val_loss: 1.2712
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 46ms/step - accuracy: 0.5694 - loss: 1.0588 - val_accuracy: 0.4253 - val_loss: 1.3525
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 47ms/step - accuracy: 0.6594 - loss: 0.8902 - val_accuracy: 0.4359 - val_loss: 1.5100
Epoch 7/20
[1m176/1

In [44]:
from tensorflow.keras.regularizers import l2

label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → GRU → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
GRU_UNITS = 32
DROPOUT_RATE = 0.3  # 30% dropout
L2_REG = 1e-4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GRU(GRU_UNITS),
    Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=l2(L2_REG))
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 52ms/step - accuracy: 0.2749 - loss: 1.5531 - val_accuracy: 0.3292 - val_loss: 1.4223
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 43ms/step - accuracy: 0.3100 - loss: 1.4471 - val_accuracy: 0.4199 - val_loss: 1.3564
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - accuracy: 0.4259 - loss: 1.3351 - val_accuracy: 0.4377 - val_loss: 1.3272
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 50ms/step - accuracy: 0.5220 - loss: 1.1324 - val_accuracy: 0.4448 - val_loss: 1.3452
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 43ms/step - accuracy: 0.6358 - loss: 0.9155 - val_accuracy: 0.4431 - val_loss: 1.5004
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 49ms/step - accuracy: 0.6989 - loss: 0.8171 - val_accuracy: 0.4199 - val_loss: 1.5672
Epoch 7/20
[1m176/17

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Visszatértem sima neurális hálókhoz, az LSTM és a GRU nem teljesítettek jól a tesztjeim alatt.

Különböző perceptron számok és több layer tesztelése


In [47]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.3767 - loss: 1.4082 - val_accuracy: 0.4359 - val_loss: 1.3115
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.5384 - loss: 1.1928 - val_accuracy: 0.4466 - val_loss: 1.3663
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.7699 - loss: 0.6587 - val_accuracy: 0.4573 - val_loss: 1.4658
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.9173 - loss: 0.2889 - val_accuracy: 0.4448 - val_loss: 1.6862
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9513 - loss: 0.1710 - val_accuracy: 0.3879 - val_loss: 1.9251
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.9573 - loss: 0.1308 - val_accuracy: 0.4288 - val_loss: 1.8973
[1m12/12[0m [32m━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
EMBED_DIM = 32

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print("\nTest Classification Report:")
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3655 - loss: 1.4199 - val_accuracy: 0.4306 - val_loss: 1.3041
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5146 - loss: 1.1950 - val_accuracy: 0.4644 - val_loss: 1.2677
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.8190 - loss: 0.5738 - val_accuracy: 0.4448 - val_loss: 1.5152
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9263 - loss: 0.2449 - val_accuracy: 0.4502 - val_loss: 1.8512
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9532 - loss: 0.1554 - val_accuracy: 0.4181 - val_loss: 1.9845
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9634 - loss: 0.1049 - val_accuracy: 0.4484 - val_loss: 2.0407
Epoch 7/20
[1m176/176[0

Innentől rengeteg kombinációban kipróbáltam különböző hálókat különböző dropout paraméterekkel, regularizácioval, embedding mérettel és vocabulary mérettel


In [54]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense) with Dropout
# -----------------------------
EMBED_DIM = 32
DROPOUT_RATE = 0.2
for DROPOUT_RATE in [0.2, 0.3, 0.4]:
  model = Sequential([
      Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
      Flatten(),
      Dense(64, activation='relu'),
      Dropout(DROPOUT_RATE),
      Dense(32, activation='relu'),
      Dropout(DROPOUT_RATE),
      Dense(32, activation='relu'),
      Dropout(DROPOUT_RATE),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_pad,
      y_train_enc,
      validation_data=(X_val_pad, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_pad)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.3182 - loss: 1.4916 - val_accuracy: 0.4306 - val_loss: 1.3636
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4232 - loss: 1.3398 - val_accuracy: 0.4146 - val_loss: 1.2995
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5596 - loss: 1.0936 - val_accuracy: 0.4591 - val_loss: 1.2943
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7479 - loss: 0.7133 - val_accuracy: 0.4217 - val_loss: 1.6179
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8721 - loss: 0.4003 - val_accuracy: 0.4573 - val_loss: 1.7119
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9417 - loss: 0.2420 - val_accuracy: 0.4413 - val_loss: 2.1037
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.3117 - loss: 1.4957 - val_accuracy: 0.3968 - val_loss: 1.3359
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3979 - loss: 1.3359 - val_accuracy: 0.4484 - val_loss: 1.2740
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5041 - loss: 1.1387 - val_accuracy: 0.4555 - val_loss: 1.3573
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6657 - loss: 0.8343 - val_accuracy: 0.4484 - val_loss: 1.5142
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7883 - loss: 0.5724 - val_accuracy: 0.4324 - val_loss: 1.8188
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8766 - loss: 0.3845 - val_accuracy: 0.3808 - val_loss: 2.1318
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.3317 - loss: 1.4741 - val_accuracy: 0.4164 - val_loss: 1.4029
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.3960 - loss: 1.3883 - val_accuracy: 0.4484 - val_loss: 1.3005
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4628 - loss: 1.2496 - val_accuracy: 0.4306 - val_loss: 1.2915
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5873 - loss: 1.0252 - val_accuracy: 0.4715 - val_loss: 1.3697
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6821 - loss: 0.7912 - val_accuracy: 0.4662 - val_loss: 1.6480
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7694 - loss: 0.5970 - val_accuracy: 0.4324 - val_loss: 1.9609
Epoch 7/20
[1m176/176[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Bag of Word kipróbálása

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Bag-of-Words vectorization
# -----------------------------
MAX_FEATURES = 5000
vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))

X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_val_bow   = vectorizer.transform(X_val).toarray()
X_test_bow  = vectorizer.transform(X_test).toarray()

print("BoW feature size:", X_train_bow.shape[1])

# -----------------------------
# Neural Network (Dense + Dropout)
# -----------------------------
for DROPOUT_RATE in [0.2, 0.3, 0.4]:
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_bow.shape[1],)),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.summary()

    # -----------------------------
    # EarlyStopping callback
    # -----------------------------
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.fit(
        X_train_bow,
        y_train_enc,
        validation_data=(X_val_bow, y_val_enc),
        epochs=20,
        batch_size=16,
        verbose=1,
        callbacks=[early_stop]
    )

    # -----------------------------
    # Evaluate on test set
    # -----------------------------
    test_preds = model.predict(X_test_bow)
    test_preds = np.argmax(test_preds, axis=1)
    decoded_preds = label_encoder.inverse_transform(test_preds)

    print("\nTest Classification Report (Dropout={:.1f}):".format(DROPOUT_RATE))
    print(classification_report(y_test, decoded_preds))



BoW feature size: 5000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.2876 - loss: 1.5491 - val_accuracy: 0.4021 - val_loss: 1.3733
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4779 - loss: 1.2489 - val_accuracy: 0.4306 - val_loss: 1.2821
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6277 - loss: 0.9640 - val_accuracy: 0.4591 - val_loss: 1.3116
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7805 - loss: 0.6795 - val_accuracy: 0.4377 - val_loss: 1.5301
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8204 - loss: 0.5153 - val_accuracy: 0.4377 - val_loss: 1.7101
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8788 - loss: 0.3873 - val_accuracy: 0.4537 - val_loss: 2.0094
Epoch 7/20
[1m176/176[0m 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3247 - loss: 1.5578 - val_accuracy: 0.3950 - val_loss: 1.3824
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4112 - loss: 1.3217 - val_accuracy: 0.4502 - val_loss: 1.2842
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5202 - loss: 1.1167 - val_accuracy: 0.4466 - val_loss: 1.2988
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.6195 - loss: 0.9367 - val_accuracy: 0.4555 - val_loss: 1.3176
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6798 - loss: 0.8171 - val_accuracy: 0.4698 - val_loss: 1.4492
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7679 - loss: 0.6796 - val_accuracy: 0.4555 - val_loss: 1.5871
Epoch 7/20
[1m176/176[0m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.2768 - loss: 1.5745 - val_accuracy: 0.3701 - val_loss: 1.4716
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.3433 - loss: 1.4609 - val_accuracy: 0.3879 - val_loss: 1.3804
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.4425 - loss: 1.3088 - val_accuracy: 0.4591 - val_loss: 1.3037
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4918 - loss: 1.1977 - val_accuracy: 0.4431 - val_loss: 1.2679
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5556 - loss: 1.0764 - val_accuracy: 0.4715 - val_loss: 1.2987
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6551 - loss: 0.8909 - val_accuracy: 0.4591 - val_loss: 1.3415
Epoch 7/20
[1m176/176[0m 

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Bag-of-Words vectorization
# -----------------------------
MAX_FEATURES = 10000
vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))

X_train_bow = vectorizer.fit_transform(X_train).toarray()
X_val_bow   = vectorizer.transform(X_val).toarray()
X_test_bow  = vectorizer.transform(X_test).toarray()

print("BoW feature size:", X_train_bow.shape[1])

# -----------------------------
# Neural Network (Dense + Dropout)
# -----------------------------
for DROPOUT_RATE in [0.2, 0.3, 0.4]:
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_bow.shape[1],)),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(32, activation='relu'),
        Dropout(DROPOUT_RATE),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.summary()

    # -----------------------------
    # EarlyStopping callback
    # -----------------------------
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.fit(
        X_train_bow,
        y_train_enc,
        validation_data=(X_val_bow, y_val_enc),
        epochs=20,
        batch_size=16,
        verbose=1,
        callbacks=[early_stop]
    )

    # -----------------------------
    # Evaluate on test set
    # -----------------------------
    test_preds = model.predict(X_test_bow)
    test_preds = np.argmax(test_preds, axis=1)
    decoded_preds = label_encoder.inverse_transform(test_preds)

    print("\nTest Classification Report (Dropout={:.1f}):".format(DROPOUT_RATE))
    print(classification_report(y_test, decoded_preds))


BoW feature size: 10000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.2886 - loss: 1.5666 - val_accuracy: 0.4217 - val_loss: 1.3724
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4740 - loss: 1.2546 - val_accuracy: 0.4359 - val_loss: 1.2957
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.6311 - loss: 0.9398 - val_accuracy: 0.4644 - val_loss: 1.3980
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.7816 - loss: 0.6166 - val_accuracy: 0.4199 - val_loss: 1.6429
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8688 - loss: 0.4378 - val_accuracy: 0.4270 - val_loss: 1.7812
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8821 - loss: 0.3413 - val_accuracy: 0.4217 - val_loss: 2.0889
Epoch 7/20
[1m176/176

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.3033 - loss: 1.5462 - val_accuracy: 0.3861 - val_loss: 1.4052
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.4154 - loss: 1.3300 - val_accuracy: 0.4466 - val_loss: 1.2735
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5551 - loss: 1.0922 - val_accuracy: 0.4253 - val_loss: 1.3186
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6786 - loss: 0.8996 - val_accuracy: 0.4448 - val_loss: 1.4295
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7673 - loss: 0.6702 - val_accuracy: 0.4128 - val_loss: 1.6680
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8102 - loss: 0.5568 - val_accuracy: 0.4448 - val_loss: 1.7064
Epoch 7/20
[1m176/176

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.2713 - loss: 1.5897 - val_accuracy: 0.4181 - val_loss: 1.4567
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4006 - loss: 1.4297 - val_accuracy: 0.4431 - val_loss: 1.3300
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4830 - loss: 1.2760 - val_accuracy: 0.4537 - val_loss: 1.2689
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.5556 - loss: 1.1300 - val_accuracy: 0.4448 - val_loss: 1.2854
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6319 - loss: 0.9495 - val_accuracy: 0.4181 - val_loss: 1.3598
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6795 - loss: 0.8556 - val_accuracy: 0.4431 - val_loss: 1.4599
Epoch 7/20
[1m176/176

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Bag-of-Words vectorization
# -----------------------------
for MAX_FEATURES in [5000, 10000, 20000]:
  vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=(1,2))

  X_train_bow = vectorizer.fit_transform(X_train).toarray()
  X_val_bow   = vectorizer.transform(X_val).toarray()
  X_test_bow  = vectorizer.transform(X_test).toarray()

  print("BoW feature size:", X_train_bow.shape[1])

  # -----------------------------
  # Neural Network (Dense, no Dropout)
  # -----------------------------
  model = Sequential([
      Dense(64, activation='relu', input_shape=(X_train_bow.shape[1],)),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_bow,
      y_train_enc,
      validation_data=(X_val_bow, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_bow)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))


BoW feature size: 5000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.3047 - loss: 1.5266 - val_accuracy: 0.4253 - val_loss: 1.3081
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5679 - loss: 1.0417 - val_accuracy: 0.4359 - val_loss: 1.3219
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8170 - loss: 0.5691 - val_accuracy: 0.4448 - val_loss: 1.5792
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9053 - loss: 0.3113 - val_accuracy: 0.4306 - val_loss: 2.2048
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9382 - loss: 0.2024 - val_accuracy: 0.4306 - val_loss: 2.3482
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9509 - loss: 0.1462 - val_accuracy: 0.4270 - val_loss: 2.5617
[1m12/12[0m [32m━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.3137 - loss: 1.5383 - val_accuracy: 0.4555 - val_loss: 1.2838
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6501 - loss: 0.9804 - val_accuracy: 0.4484 - val_loss: 1.3471
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8560 - loss: 0.4943 - val_accuracy: 0.4466 - val_loss: 1.7224
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9215 - loss: 0.2576 - val_accuracy: 0.4520 - val_loss: 2.1818
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9452 - loss: 0.1718 - val_accuracy: 0.4359 - val_loss: 2.2549
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9470 - loss: 0.1299 - val_accuracy: 0.4253 - val_loss: 2.4542
[1m12/12[0m [32m━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.3286 - loss: 1.5259 - val_accuracy: 0.4253 - val_loss: 1.2990
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.6790 - loss: 0.9040 - val_accuracy: 0.4253 - val_loss: 1.4009
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.8830 - loss: 0.3961 - val_accuracy: 0.4484 - val_loss: 1.8762
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9302 - loss: 0.2245 - val_accuracy: 0.4110 - val_loss: 2.0852
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.9499 - loss: 0.1539 - val_accuracy: 0.4217 - val_loss: 2.2421
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.9514 - loss: 0.1127 - val_accuracy: 0.4075 - val_loss: 2.5532
[1m12/12[0m [32m━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Visszatérés embeddingre

In [62]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
for EMBED_DIM in [32, 64, 128]:

  model = Sequential([
      Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
      Flatten(),
      Dense(64, activation='relu'),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_pad,
      y_train_enc,
      validation_data=(X_val_pad, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_pad)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.3825 - loss: 1.4061 - val_accuracy: 0.4431 - val_loss: 1.3072
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5040 - loss: 1.2015 - val_accuracy: 0.4626 - val_loss: 1.2494
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7581 - loss: 0.6723 - val_accuracy: 0.4769 - val_loss: 1.5252
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9157 - loss: 0.2870 - val_accuracy: 0.4537 - val_loss: 1.7339
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9488 - loss: 0.1641 - val_accuracy: 0.4520 - val_loss: 2.0733
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9658 - loss: 0.1020 - val_accuracy: 0.3879 - val_loss: 2.1855
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3956 - loss: 1.3903 - val_accuracy: 0.4181 - val_loss: 1.3189
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5365 - loss: 1.1387 - val_accuracy: 0.4573 - val_loss: 1.4583
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8552 - loss: 0.4750 - val_accuracy: 0.4235 - val_loss: 1.7017
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9506 - loss: 0.1845 - val_accuracy: 0.4164 - val_loss: 1.8849
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9610 - loss: 0.1276 - val_accuracy: 0.4128 - val_loss: 1.9909
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9628 - loss: 0.1095 - val_accuracy: 0.4448 - val_loss: 2.1299
[1m12/12[0m [32m━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.3744 - loss: 1.4085 - val_accuracy: 0.4146 - val_loss: 1.3036
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.5536 - loss: 1.1312 - val_accuracy: 0.4431 - val_loss: 1.2883
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.8728 - loss: 0.4326 - val_accuracy: 0.4502 - val_loss: 1.6153
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.9457 - loss: 0.1960 - val_accuracy: 0.4199 - val_loss: 1.7239
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.9661 - loss: 0.1118 - val_accuracy: 0.4181 - val_loss: 2.1000
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.9682 - loss: 0.0834 - val_accuracy: 0.4128 - val_loss: 2.1892
Epoch 7/20
[1m176/176

In [63]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense)
# -----------------------------
for EMBED_DIM in [32, 64, 128]:

  model = Sequential([
      Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
      Flatten(),
      Dense(32, activation='relu'),
      Dense(32, activation='relu'),
      Dense(NUM_CLASSES, activation='softmax')
  ])

  model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',
      metrics=['accuracy']
  )

  model.summary()

  # -----------------------------
  # EarlyStopping callback
  # -----------------------------
  early_stop = EarlyStopping(
      monitor='val_loss',
      patience=5,
      restore_best_weights=True
  )

  # -----------------------------
  # Train
  # -----------------------------
  model.fit(
      X_train_pad,
      y_train_enc,
      validation_data=(X_val_pad, y_val_enc),
      epochs=20,
      batch_size=16,
      verbose=1,
      callbacks=[early_stop]
  )

  # -----------------------------
  # Evaluate on test set
  # -----------------------------
  test_preds = model.predict(X_test_pad)
  test_preds = np.argmax(test_preds, axis=1)
  decoded_preds = label_encoder.inverse_transform(test_preds)

  print("\nTest Classification Report:")
  print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.3745 - loss: 1.4297 - val_accuracy: 0.4342 - val_loss: 1.3112
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4999 - loss: 1.2228 - val_accuracy: 0.4662 - val_loss: 1.2596
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7261 - loss: 0.7811 - val_accuracy: 0.4573 - val_loss: 1.4251
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8806 - loss: 0.3972 - val_accuracy: 0.4199 - val_loss: 1.6031
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9470 - loss: 0.1914 - val_accuracy: 0.4235 - val_loss: 1.8604
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9605 - loss: 0.1307 - val_accuracy: 0.4306 - val_loss: 1.8901
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3566 - loss: 1.4177 - val_accuracy: 0.4537 - val_loss: 1.3220
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5252 - loss: 1.1597 - val_accuracy: 0.4751 - val_loss: 1.2444
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7602 - loss: 0.6864 - val_accuracy: 0.4644 - val_loss: 1.4665
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8999 - loss: 0.3026 - val_accuracy: 0.4164 - val_loss: 1.6773
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9416 - loss: 0.1890 - val_accuracy: 0.4431 - val_loss: 1.7382
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9613 - loss: 0.1275 - val_accuracy: 0.4306 - val_loss: 1.8797
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.3957 - loss: 1.3769 - val_accuracy: 0.4359 - val_loss: 1.3099
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6294 - loss: 1.0374 - val_accuracy: 0.4537 - val_loss: 1.2999
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8903 - loss: 0.3702 - val_accuracy: 0.4395 - val_loss: 1.5518
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9469 - loss: 0.1820 - val_accuracy: 0.4217 - val_loss: 1.7281
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9648 - loss: 0.1057 - val_accuracy: 0.4270 - val_loss: 1.6966
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9643 - loss: 0.0983 - val_accuracy: 0.3950 - val_loss: 1.8699
Epoch 7/20
[1m176/176

In [64]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
for EMBED_DIM in [32, 64, 128]:
    for DROPOUT_RATE in [0.2, 0.3, 0.4]:

      model = Sequential([
          Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
          Flatten(),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(NUM_CLASSES, activation='softmax')
      ])

      model.compile(
          optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['accuracy']
      )

      model.summary()

      # -----------------------------
      # EarlyStopping callback
      # -----------------------------
      early_stop = EarlyStopping(
          monitor='val_loss',
          patience=5,
          restore_best_weights=True
      )

      # -----------------------------
      # Train
      # -----------------------------
      model.fit(
          X_train_pad,
          y_train_enc,
          validation_data=(X_val_pad, y_val_enc),
          epochs=20,
          batch_size=16,
          verbose=1,
          callbacks=[early_stop]
      )

      # -----------------------------
      # Evaluate on test set
      # -----------------------------
      test_preds = model.predict(X_test_pad)
      test_preds = np.argmax(test_preds, axis=1)
      decoded_preds = label_encoder.inverse_transform(test_preds)

      print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
      print(classification_report(y_test, decoded_preds))




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3386 - loss: 1.4581 - val_accuracy: 0.4342 - val_loss: 1.3142
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4592 - loss: 1.2743 - val_accuracy: 0.4413 - val_loss: 1.2765
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6060 - loss: 1.0026 - val_accuracy: 0.4573 - val_loss: 1.2928
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8135 - loss: 0.5602 - val_accuracy: 0.4413 - val_loss: 1.5506
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9185 - loss: 0.2949 - val_accuracy: 0.4128 - val_loss: 1.7082
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9446 - loss: 0.2071 - val_accuracy: 0.3950 - val_loss: 1.9057
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3400 - loss: 1.4642 - val_accuracy: 0.4253 - val_loss: 1.3292
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4345 - loss: 1.3248 - val_accuracy: 0.4484 - val_loss: 1.2783
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5821 - loss: 1.0771 - val_accuracy: 0.4502 - val_loss: 1.2488
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7511 - loss: 0.6903 - val_accuracy: 0.4395 - val_loss: 1.4725
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8839 - loss: 0.3952 - val_accuracy: 0.4057 - val_loss: 1.6400
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9161 - loss: 0.2933 - val_accuracy: 0.4306 - val_loss: 1.8367
Epoch 7/20
[1m176/176[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.3019 - loss: 1.4997 - val_accuracy: 0.4004 - val_loss: 1.3595
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.3887 - loss: 1.3824 - val_accuracy: 0.4199 - val_loss: 1.3502
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4609 - loss: 1.2400 - val_accuracy: 0.4377 - val_loss: 1.2830
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5990 - loss: 0.9869 - val_accuracy: 0.4537 - val_loss: 1.3284
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7464 - loss: 0.7152 - val_accuracy: 0.4306 - val_loss: 1.5113
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8549 - loss: 0.4531 - val_accuracy: 0.4057 - val_loss: 1.6695
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.3560 - loss: 1.4334 - val_accuracy: 0.4217 - val_loss: 1.3222
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4607 - loss: 1.2556 - val_accuracy: 0.4466 - val_loss: 1.2508
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6900 - loss: 0.8345 - val_accuracy: 0.4680 - val_loss: 1.3404
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.8603 - loss: 0.4389 - val_accuracy: 0.4377 - val_loss: 1.6431
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9350 - loss: 0.2353 - val_accuracy: 0.4395 - val_loss: 1.8320
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9514 - loss: 0.1736 - val_accuracy: 0.4342 - val_loss: 2.0143
Epoch 7/20
[1m176/176[



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.3365 - loss: 1.4730 - val_accuracy: 0.4199 - val_loss: 1.3165
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4287 - loss: 1.3161 - val_accuracy: 0.4448 - val_loss: 1.2740
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5963 - loss: 1.0348 - val_accuracy: 0.4911 - val_loss: 1.2889
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7765 - loss: 0.6562 - val_accuracy: 0.4537 - val_loss: 1.4310
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9040 - loss: 0.3367 - val_accuracy: 0.4324 - val_loss: 1.6739
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9363 - loss: 0.2423 - val_accuracy: 0.4164 - val_loss: 1.8439
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.3027 - loss: 1.4957 - val_accuracy: 0.4181 - val_loss: 1.3595
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4150 - loss: 1.3350 - val_accuracy: 0.4431 - val_loss: 1.2770
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5219 - loss: 1.1626 - val_accuracy: 0.4680 - val_loss: 1.2387
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6727 - loss: 0.8619 - val_accuracy: 0.4520 - val_loss: 1.3348
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8180 - loss: 0.5497 - val_accuracy: 0.4377 - val_loss: 1.6051
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8992 - loss: 0.3305 - val_accuracy: 0.4128 - val_loss: 1.7777
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.3378 - loss: 1.4538 - val_accuracy: 0.4520 - val_loss: 1.3175
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4942 - loss: 1.2152 - val_accuracy: 0.4484 - val_loss: 1.2572
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7262 - loss: 0.7621 - val_accuracy: 0.3950 - val_loss: 1.4397
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.8988 - loss: 0.3379 - val_accuracy: 0.4164 - val_loss: 1.6253
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9214 - loss: 0.2499 - val_accuracy: 0.4359 - val_loss: 1.9306
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9532 - loss: 0.1720 - val_accuracy: 0.4021 - val_loss: 2.0007
Epoch 7/20
[1m176/176

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.3001 - loss: 1.4829 - val_accuracy: 0.4128 - val_loss: 1.3998
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.4773 - loss: 1.2688 - val_accuracy: 0.4929 - val_loss: 1.2485
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.6845 - loss: 0.8378 - val_accuracy: 0.4377 - val_loss: 1.3842
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8644 - loss: 0.4357 - val_accuracy: 0.4235 - val_loss: 1.4916
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9227 - loss: 0.2772 - val_accuracy: 0.4484 - val_loss: 1.7881
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.9417 - loss: 0.2056 - val_accuracy: 0.4466 - val_loss: 1.8456
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.3037 - loss: 1.5198 - val_accuracy: 0.4431 - val_loss: 1.3342
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4245 - loss: 1.3229 - val_accuracy: 0.4555 - val_loss: 1.2748
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.5366 - loss: 1.1129 - val_accuracy: 0.4537 - val_loss: 1.3229
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.7136 - loss: 0.7788 - val_accuracy: 0.4609 - val_loss: 1.3722
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8432 - loss: 0.4998 - val_accuracy: 0.4573 - val_loss: 1.6365
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9039 - loss: 0.3360 - val_accuracy: 0.4413 - val_loss: 1.7600
Epoch 7/20
[1m176/176

In [67]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 5000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
for EMBED_DIM in [32]:
    for DROPOUT_RATE in [0.0, 0.3]:

      model = Sequential([
          Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
          Flatten(),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(32, activation='relu'),
          Dropout(DROPOUT_RATE),
          Dense(NUM_CLASSES, activation='softmax')
      ])

      model.compile(
          optimizer='adam',
          loss='categorical_crossentropy',
          metrics=['accuracy']
      )

      model.summary()

      # -----------------------------
      # EarlyStopping callback
      # -----------------------------
      early_stop = EarlyStopping(
          monitor='val_loss',
          patience=5,
          restore_best_weights=True
      )

      # -----------------------------
      # Train
      # -----------------------------
      model.fit(
          X_train_pad,
          y_train_enc,
          validation_data=(X_val_pad, y_val_enc),
          epochs=20,
          batch_size=16,
          verbose=1,
          callbacks=[early_stop]
      )

      # -----------------------------
      # Evaluate on test set
      # -----------------------------
      test_preds = model.predict(X_test_pad)
      test_preds = np.argmax(test_preds, axis=1)
      decoded_preds = label_encoder.inverse_transform(test_preds)

      print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
      print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3714 - loss: 1.4030 - val_accuracy: 0.4502 - val_loss: 1.3024
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5362 - loss: 1.1849 - val_accuracy: 0.4698 - val_loss: 1.2816
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7932 - loss: 0.6619 - val_accuracy: 0.4786 - val_loss: 1.5163
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9185 - loss: 0.2713 - val_accuracy: 0.4288 - val_loss: 1.6889
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9567 - loss: 0.1563 - val_accuracy: 0.4609 - val_loss: 1.8665
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9611 - loss: 0.1333 - val_accuracy: 0.4395 - val_loss: 1.8523
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.3385 - loss: 1.4797 - val_accuracy: 0.4146 - val_loss: 1.3292
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4461 - loss: 1.3283 - val_accuracy: 0.4555 - val_loss: 1.2856
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5917 - loss: 1.1025 - val_accuracy: 0.4431 - val_loss: 1.2804
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7576 - loss: 0.7090 - val_accuracy: 0.4324 - val_loss: 1.4766
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9013 - loss: 0.3705 - val_accuracy: 0.4110 - val_loss: 1.6887
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9248 - loss: 0.2797 - val_accuracy: 0.4128 - val_loss: 1.8332
Epoch 7/20
[1m176/176[0m 

In [68]:
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [5000, 10000, 20000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense)
  # -----------------------------
  for EMBED_DIM in [32]:

    model = Sequential([
        Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
        Flatten(),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.summary()

    # -----------------------------
    # EarlyStopping callback
    # -----------------------------
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.fit(
        X_train_pad,
        y_train_enc,
        validation_data=(X_val_pad, y_val_enc),
        epochs=20,
        batch_size=16,
        verbose=1,
        callbacks=[early_stop]
    )

    # -----------------------------
    # Evaluate on test set
    # -----------------------------
    test_preds = model.predict(X_test_pad)
    test_preds = np.argmax(test_preds, axis=1)
    decoded_preds = label_encoder.inverse_transform(test_preds)

    print("\nTest Classification Report:")
    print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.3721 - loss: 1.4065 - val_accuracy: 0.4342 - val_loss: 1.3117
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.5209 - loss: 1.2001 - val_accuracy: 0.4359 - val_loss: 1.3027
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7956 - loss: 0.6545 - val_accuracy: 0.3808 - val_loss: 1.5841
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9342 - loss: 0.2578 - val_accuracy: 0.4253 - val_loss: 1.7246
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9656 - loss: 0.1511 - val_accuracy: 0.3683 - val_loss: 1.8905
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9655 - loss: 0.1100 - val_accuracy: 0.4146 - val_loss: 2.0474
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3761 - loss: 1.4041 - val_accuracy: 0.4484 - val_loss: 1.3170
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.5314 - loss: 1.1815 - val_accuracy: 0.4448 - val_loss: 1.2613
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8473 - loss: 0.5162 - val_accuracy: 0.4306 - val_loss: 1.4928
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9530 - loss: 0.1913 - val_accuracy: 0.4235 - val_loss: 1.7333
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9568 - loss: 0.1367 - val_accuracy: 0.4164 - val_loss: 1.7905
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9580 - loss: 0.1095 - val_accuracy: 0.4146 - val_loss: 1.9436
Epoch 7/20
[1m176/176[0m



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.3678 - loss: 1.4268 - val_accuracy: 0.4520 - val_loss: 1.3205
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5476 - loss: 1.1682 - val_accuracy: 0.4537 - val_loss: 1.3084
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8398 - loss: 0.5676 - val_accuracy: 0.4448 - val_loss: 1.5796
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9316 - loss: 0.2158 - val_accuracy: 0.4448 - val_loss: 1.7280
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9555 - loss: 0.1359 - val_accuracy: 0.4502 - val_loss: 1.7649
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9636 - loss: 0.1004 - val_accuracy: 0.4270 - val_loss: 2.1626
Epoch 7/20
[1m176/176[0

In [70]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [5000, 10000, 20000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense with Dropout)
  # -----------------------------
  for EMBED_DIM in [32]:
      for DROPOUT_RATE in [0.2, 0.3, 0.4]:

        model = Sequential([
            Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(NUM_CLASSES, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        model.summary()

        # -----------------------------
        # EarlyStopping callback
        # -----------------------------
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # -----------------------------
        # Train
        # -----------------------------
        model.fit(
            X_train_pad,
            y_train_enc,
            validation_data=(X_val_pad, y_val_enc),
            epochs=20,
            batch_size=16,
            verbose=1,
            callbacks=[early_stop]
        )

        # -----------------------------
        # Evaluate on test set
        # -----------------------------
        test_preds = model.predict(X_test_pad)
        test_preds = np.argmax(test_preds, axis=1)
        decoded_preds = label_encoder.inverse_transform(test_preds)
        print(f"Max vocab:{MAX_VOCAB}")
        print(f"Dropout rate:{DROPOUT_RATE}")
        print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
        print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.3472 - loss: 1.4243 - val_accuracy: 0.4342 - val_loss: 1.3229
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4554 - loss: 1.2783 - val_accuracy: 0.4448 - val_loss: 1.2841
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6111 - loss: 1.0116 - val_accuracy: 0.4626 - val_loss: 1.3226
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.8351 - loss: 0.5404 - val_accuracy: 0.4484 - val_loss: 1.5201
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9216 - loss: 0.3090 - val_accuracy: 0.4377 - val_loss: 1.7045
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9466 - loss: 0.2008 - val_accuracy: 0.4537 - val_loss: 1.9798
Epoch 7/20
[1m176/176



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.3364 - loss: 1.4788 - val_accuracy: 0.4181 - val_loss: 1.3683
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4354 - loss: 1.3227 - val_accuracy: 0.4395 - val_loss: 1.2860
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5751 - loss: 1.0951 - val_accuracy: 0.4306 - val_loss: 1.2893
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7729 - loss: 0.6731 - val_accuracy: 0.4199 - val_loss: 1.5287
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8878 - loss: 0.3920 - val_accuracy: 0.4502 - val_loss: 1.8042
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9249 - loss: 0.2774 - val_accuracy: 0.3897 - val_loss: 1.9546
Epoch 7/20
[1m176/176[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.3333 - loss: 1.4754 - val_accuracy: 0.4324 - val_loss: 1.3693
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4166 - loss: 1.3497 - val_accuracy: 0.4484 - val_loss: 1.3070
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5016 - loss: 1.2062 - val_accuracy: 0.4644 - val_loss: 1.2527
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6548 - loss: 0.8850 - val_accuracy: 0.4413 - val_loss: 1.3794
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7929 - loss: 0.5857 - val_accuracy: 0.4413 - val_loss: 1.5873
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8841 - loss: 0.3783 - val_accuracy: 0.4057 - val_loss: 1.8683
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3427 - loss: 1.4592 - val_accuracy: 0.4431 - val_loss: 1.3213
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4577 - loss: 1.2880 - val_accuracy: 0.4715 - val_loss: 1.2750
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.6402 - loss: 0.9821 - val_accuracy: 0.4306 - val_loss: 1.3430
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8518 - loss: 0.4748 - val_accuracy: 0.3790 - val_loss: 1.6712
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9399 - loss: 0.2359 - val_accuracy: 0.3968 - val_loss: 1.8680
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9444 - loss: 0.1956 - val_accuracy: 0.4075 - val_loss: 2.0003
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.3138 - loss: 1.4927 - val_accuracy: 0.4021 - val_loss: 1.3287
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4356 - loss: 1.2882 - val_accuracy: 0.4555 - val_loss: 1.2537
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5920 - loss: 1.0635 - val_accuracy: 0.4502 - val_loss: 1.2715
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7587 - loss: 0.6997 - val_accuracy: 0.4555 - val_loss: 1.5030
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8644 - loss: 0.4142 - val_accuracy: 0.4609 - val_loss: 1.8256
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9082 - loss: 0.3032 - val_accuracy: 0.4448 - val_loss: 1.9987
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3202 - loss: 1.4975 - val_accuracy: 0.3968 - val_loss: 1.3560
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4001 - loss: 1.3838 - val_accuracy: 0.4537 - val_loss: 1.2814
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5046 - loss: 1.1818 - val_accuracy: 0.4715 - val_loss: 1.2587
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6929 - loss: 0.8652 - val_accuracy: 0.4484 - val_loss: 1.3463
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8299 - loss: 0.5229 - val_accuracy: 0.4466 - val_loss: 1.5323
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8885 - loss: 0.3648 - val_accuracy: 0.3879 - val_loss: 1.8580
Epoch 7/20
[1m176/176[0m 



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.3298 - loss: 1.4861 - val_accuracy: 0.4342 - val_loss: 1.3350
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4624 - loss: 1.2939 - val_accuracy: 0.4609 - val_loss: 1.2806
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.6514 - loss: 0.9488 - val_accuracy: 0.4573 - val_loss: 1.3143
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8423 - loss: 0.5225 - val_accuracy: 0.4199 - val_loss: 1.5672
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9353 - loss: 0.2372 - val_accuracy: 0.3523 - val_loss: 1.9295
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9470 - loss: 0.1722 - val_accuracy: 0.4146 - val_loss: 1.9489
Epoch 7/20
[1m176/176[0



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.3025 - loss: 1.4833 - val_accuracy: 0.4270 - val_loss: 1.3572
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4094 - loss: 1.3557 - val_accuracy: 0.4555 - val_loss: 1.3021
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.5941 - loss: 1.0766 - val_accuracy: 0.4270 - val_loss: 1.3443
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8063 - loss: 0.6049 - val_accuracy: 0.4359 - val_loss: 1.5879
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9086 - loss: 0.3068 - val_accuracy: 0.4306 - val_loss: 1.7382
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9433 - loss: 0.2135 - val_accuracy: 0.4235 - val_loss: 1.9045
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.3234 - loss: 1.4983 - val_accuracy: 0.4128 - val_loss: 1.3535
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4183 - loss: 1.3523 - val_accuracy: 0.4413 - val_loss: 1.3276
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5660 - loss: 1.0890 - val_accuracy: 0.4342 - val_loss: 1.3026
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7615 - loss: 0.6962 - val_accuracy: 0.4306 - val_loss: 1.4598
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8712 - loss: 0.4191 - val_accuracy: 0.4110 - val_loss: 1.7196
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9167 - loss: 0.2934 - val_accuracy: 0.4110 - val_loss: 1.7602
Epoch 7/20
[1m176/176[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [71]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [10000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense with Dropout)
  # -----------------------------
  for EMBED_DIM in [32]:
      for DROPOUT_RATE in [0.4]:

        model = Sequential([
            Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(NUM_CLASSES, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        model.summary()

        # -----------------------------
        # EarlyStopping callback
        # -----------------------------
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # -----------------------------
        # Train
        # -----------------------------
        model.fit(
            X_train_pad,
            y_train_enc,
            validation_data=(X_val_pad, y_val_enc),
            epochs=20,
            batch_size=16,
            verbose=1,
            callbacks=[early_stop]
        )

        # -----------------------------
        # Evaluate on test set
        # -----------------------------
        test_preds = model.predict(X_test_pad)
        test_preds = np.argmax(test_preds, axis=1)
        decoded_preds = label_encoder.inverse_transform(test_preds)
        print(f"Max vocab:{MAX_VOCAB}")
        print(f"Dropout rate:{DROPOUT_RATE}")
        print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
        print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3217 - loss: 1.5050 - val_accuracy: 0.4181 - val_loss: 1.3461
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4117 - loss: 1.3544 - val_accuracy: 0.4555 - val_loss: 1.3072
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5106 - loss: 1.1724 - val_accuracy: 0.4644 - val_loss: 1.2848
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6906 - loss: 0.8198 - val_accuracy: 0.4502 - val_loss: 1.4172
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8223 - loss: 0.5040 - val_accuracy: 0.4342 - val_loss: 1.6095
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8815 - loss: 0.3651 - val_accuracy: 0.4377 - val_loss: 1.8184
Epoch 7/20
[1m176/176[0m 

In [72]:
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [10000]:
    MAX_LEN = 100

    tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
    tokenizer.fit_on_texts(X_train)

    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_val_seq   = tokenizer.texts_to_sequences(X_val)
    X_test_seq  = tokenizer.texts_to_sequences(X_test)

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
    X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
    X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

    # -----------------------------
    # Neural Network (Embedding → Flatten → Dense with Dropout + L2)
    # -----------------------------
    for EMBED_DIM in [32]:
        for DROPOUT_RATE in [0.4]:
            L2_REG = 1e-4  # L2 regularization factor

            model = Sequential([
                Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
                Flatten(),
                Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
                Dropout(DROPOUT_RATE),
                Dense(32, activation='relu', kernel_regularizer=l2(L2_REG)),
                Dropout(DROPOUT_RATE),
                Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=l2(L2_REG))
            ])

            model.compile(
                optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )

            model.summary()

            # -----------------------------
            # EarlyStopping callback
            # -----------------------------
            early_stop = EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            )

            # -----------------------------
            # Train
            # -----------------------------
            model.fit(
                X_train_pad,
                y_train_enc,
                validation_data=(X_val_pad, y_val_enc),
                epochs=20,
                batch_size=16,
                verbose=1,
                callbacks=[early_stop]
            )

            # -----------------------------
            # Evaluate on test set
            # -----------------------------
            test_preds = model.predict(X_test_pad)
            test_preds = np.argmax(test_preds, axis=1)
            decoded_preds = label_encoder.inverse_transform(test_preds)

            print(f"Max vocab: {MAX_VOCAB}")
            print(f"Dropout rate: {DROPOUT_RATE}")
            print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
            print(classification_report(y_test, decoded_preds))




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.2846 - loss: 1.5431 - val_accuracy: 0.4377 - val_loss: 1.3629
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3971 - loss: 1.3929 - val_accuracy: 0.4484 - val_loss: 1.3063
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4999 - loss: 1.2077 - val_accuracy: 0.4662 - val_loss: 1.2909
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6932 - loss: 0.8495 - val_accuracy: 0.4270 - val_loss: 1.4521
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8202 - loss: 0.5703 - val_accuracy: 0.4199 - val_loss: 1.7885
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9004 - loss: 0.3764 - val_accuracy: 0.4502 - val_loss: 1.9724
Epoch 7/20
[1m176/176[0m

In [74]:
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
for MAX_VOCAB in [10000]:
  MAX_LEN = 100

  tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
  tokenizer.fit_on_texts(X_train)

  X_train_seq = tokenizer.texts_to_sequences(X_train)
  X_val_seq   = tokenizer.texts_to_sequences(X_val)
  X_test_seq  = tokenizer.texts_to_sequences(X_test)

  X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
  X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
  X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

  # -----------------------------
  # Neural Network (Embedding → Flatten → Dense with Dropout)
  # -----------------------------
  for EMBED_DIM in [32]:
      for DROPOUT_RATE in [0.4]:

        model = Sequential([
            Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
            Flatten(),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(32, activation='relu'),
            Dropout(DROPOUT_RATE),
            Dense(NUM_CLASSES, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        model.summary()

        # -----------------------------
        # EarlyStopping callback
        # -----------------------------
        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )

        # -----------------------------
        # Train
        # -----------------------------
        model.fit(
            X_train_pad,
            y_train_enc,
            validation_data=(X_val_pad, y_val_enc),
            epochs=20,
            batch_size=8,
            verbose=1,
            callbacks=[early_stop]
        )

        # -----------------------------
        # Evaluate on test set
        # -----------------------------
        test_preds = model.predict(X_test_pad)
        test_preds = np.argmax(test_preds, axis=1)
        decoded_preds = label_encoder.inverse_transform(test_preds)
        print(f"Max vocab:{MAX_VOCAB}")
        print(f"Dropout rate:{DROPOUT_RATE}")
        print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
        print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.3316 - loss: 1.4951 - val_accuracy: 0.4342 - val_loss: 1.3365
Epoch 2/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.4347 - loss: 1.3189 - val_accuracy: 0.4662 - val_loss: 1.2607
Epoch 3/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.6310 - loss: 0.9708 - val_accuracy: 0.4395 - val_loss: 1.2899
Epoch 4/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8215 - loss: 0.5534 - val_accuracy: 0.4253 - val_loss: 1.5660
Epoch 5/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9077 - loss: 0.3335 - val_accuracy: 0.4431 - val_loss: 1.8815
Epoch 6/20
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9187 - loss: 0.2533 - val_accuracy: 0.4164 - val_loss: 1.9347
Epoch 7/20
[1m352/352[0m 

Végső választott modell.
Később a dropout törölve lett, mert úgy jobb eredményt adott.

In [None]:
#Végső modell tesztelése új adathalmaz felbontásban


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np



# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
EMBED_DIM = 32
DROPOUT_RATE = 0.4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# -----------------------------
# Train
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print(f"\nMax vocab: {MAX_VOCAB}")
print(f"Dropout rate: {DROPOUT_RATE}")
print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
print(classification_report(y_test, decoded_preds))




Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.3316 - loss: 1.4665 - val_accuracy: 0.4306 - val_loss: 1.3511
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4546 - loss: 1.3468 - val_accuracy: 0.4520 - val_loss: 1.2899
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5479 - loss: 1.1542 - val_accuracy: 0.4840 - val_loss: 1.2587
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7279 - loss: 0.7914 - val_accuracy: 0.4502 - val_loss: 1.3480
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8428 - loss: 0.4801 - val_accuracy: 0.4306 - val_loss: 1.5677
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9091 - loss: 0.3182 - val_accuracy: 0.4413 - val_loss: 1.7961
Epoch 7/20
[1m176/176[0m

In [88]:
#Lehetséges megoldás a kiegyenlítetlenségre
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

#Végső modell tesztelése új adathalmaz felbontásban


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np



# -----------------------------
# Encode labels
# -----------------------------
label_encoder = LabelEncoder()
label_encoder.fit(labels)

y_train_enc = label_encoder.transform(y_train)
y_val_enc   = label_encoder.transform(y_val)
y_test_enc  = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_enc, num_classes=NUM_CLASSES)
y_val_enc   = to_categorical(y_val_enc, num_classes=NUM_CLASSES)

# -----------------------------
# Tokenize texts
# -----------------------------
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq   = tokenizer.texts_to_sequences(X_val)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post')
X_val_pad   = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post')
X_test_pad  = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post')

# -----------------------------
# Neural Network (Embedding → Flatten → Dense with Dropout)
# -----------------------------
EMBED_DIM = 32
DROPOUT_RATE = 0.4

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(32, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# -----------------------------
# EarlyStopping callback
# -----------------------------
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

y_train_int = label_encoder.transform(y_train)  # already done as y_train_enc before one-hot
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_int),
    y=y_train_int
)
class_weights_dict = dict(enumerate(class_weights))

# -----------------------------
# Train model with class weights
# -----------------------------
model.fit(
    X_train_pad,
    y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=20,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop],
    class_weight=class_weights_dict   # <-- add here
)

# -----------------------------
# Evaluate on test set
# -----------------------------
test_preds = model.predict(X_test_pad)
test_preds = np.argmax(test_preds, axis=1)
decoded_preds = label_encoder.inverse_transform(test_preds)

print(f"\nMax vocab: {MAX_VOCAB}")
print(f"Dropout rate: {DROPOUT_RATE}")
print("\nTest Classification Report (Embedding dim={}):".format(EMBED_DIM))
print(classification_report(y_test, decoded_preds))



Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.3111 - loss: 1.5581 - val_accuracy: 0.3701 - val_loss: 1.4606
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.3627 - loss: 1.5110 - val_accuracy: 0.3897 - val_loss: 1.3929
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4113 - loss: 1.3196 - val_accuracy: 0.3861 - val_loss: 1.3361
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5616 - loss: 0.9760 - val_accuracy: 0.4199 - val_loss: 1.3736
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6842 - loss: 0.7033 - val_accuracy: 0.4306 - val_loss: 1.4069
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8032 - loss: 0.4811 - val_accuracy: 0.4093 - val_loss: 1.5994
Epoch 7/20
[1m176/176[0m 

Accuracy romlott, de minden osztályra becsül a modell (végül nem lett használva)