In [None]:
import numpy as np
import pandas as pd
import pickle
import re
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout, Bidirectional, Conv1D, GlobalMaxPooling1D

# Download the NLTK data packages this notebook relies on, one after another.
for resource in ("stopwords", "punkt", "wordnet", "punkt_tab"):
    nltk.download(resource)

In [None]:
# Extra tokens filtered out in addition to NLTK's English stop-word list.
custom_stopwords = {
    'feel', 'feeling', 'like', 'im', 'really', 'get', 'http', 'href', 'www',
}

# Complete set of tokens that preprocess_text discards.
stop_words = set(stopwords.words("english")) | custom_stopwords

def preprocess_text(text):
    """Normalize a raw sentence into a space-joined string of lemmatized tokens.

    The text is lower-cased, every non-word character is replaced by a space,
    whitespace runs are collapsed and digit runs are removed. The result is
    tokenized with ``word_tokenize``, tokens found in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are lemmatized
    with ``WordNetLemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    cleaned = text.lower()
    substitutions = (
        (r'\W', ' '),   # replace special (non-word) characters with a space
        (r'\s+', ' '),  # collapse redundant whitespace
        (r'\d+', ''),   # remove digits
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    lemmatizer = WordNetLemmatizer()
    # Stop words are filtered on the raw token, before lemmatization.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return " ".join(lemmas)

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_X = _load_pickle('train_X.pkl')
train_y = _load_pickle('train_y.pkl')
test_X = _load_pickle('private_X.pkl')

# Labelled training frame: clean every sentence, then drop exact duplicate rows.
df = (
    pd.DataFrame(list(zip(train_X, train_y)), columns=['sentence', 'emotion'])
    .assign(sentence=lambda frame: frame["sentence"].apply(preprocess_text))
    .drop_duplicates()
)

# Unlabelled frame built from private_X.pkl, cleaned the same way.
test = pd.DataFrame(test_X, columns=['sentence'])
test["sentence"] = test["sentence"].apply(preprocess_text)

In [None]:
from sklearn.model_selection import train_test_split

max_words = 13000  # vocabulary cap handed to the Tokenizer as num_words
max_len = 100      # target length passed to pad_sequences as maxlen

# Fit the vocabulary on the cleaned training sentences.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df["sentence"])

# Encode the sentences as integer sequences and pad them to max_len.
X_seq = tokenizer.texts_to_sequences(df["sentence"])
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    df['emotion'],
    test_size=0.2,
    random_state=42,
)

In [None]:
def create_cnn_model(num_classes=6):
    """Build and compile the 1D-CNN text classifier.

    Architecture: embedding -> Conv1D -> global max pooling -> dense -> dropout
    -> softmax. The embedding is sized from the module-level ``max_words`` and
    ``max_len`` settings, so it matches the tokenizer and padding configuration.

    Args:
        num_classes: Number of units in the softmax output layer (default 6).

    Returns:
        A compiled ``Sequential`` model using sparse categorical cross-entropy,
        the Adam optimizer and an accuracy metric.
    """
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
        Conv1D(filters=64, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


# The factory is defined above so this cell runs on a fresh kernel; previously
# create_cnn_model() was called without ever being defined (NameError).
cnn_model = create_cnn_model()
cnn_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# evaluate() returns [loss, accuracy] because the model is compiled with one metric.
cnn_acc = cnn_model.evaluate(X_test, y_test)[1]
print("CNN Accuracy:", cnn_acc)

def submission(model, filename):
    """Predict a class for every sentence in ``test`` and write the results to a file.

    The sentences are encoded with the module-level ``tokenizer`` and padded to
    ``max_len``. The predicted class index for each sentence is the argmax over
    the model's output row.

    Args:
        model: Object exposing ``predict``; its output is reduced with argmax
            along axis 1.
        filename: Path of the text file to write, one predicted index per line.
    """
    encoded = tokenizer.texts_to_sequences(test["sentence"])
    padded = pad_sequences(encoded, maxlen=max_len)
    predicted_labels = np.argmax(model.predict(padded), axis=1)
    with open(filename, 'w') as out_file:
        out_file.writelines("%s\n" % label for label in predicted_labels)

# Write the CNN's predictions for the `test` frame to submission_cnn.txt.
submission(model=cnn_model, filename='submission_cnn.txt')