In [8]:
import pandas as pd
import os
import re
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# # Kiểm tra sự tồn tại của các file
# files = ['train_nor_811.xlsx', 'valid_nor_811.xlsx', 'test_nor_811.xlsx']
# for file in files:
#     if not os.path.exists(file):
#         raise FileNotFoundError(f"File {file} not found.")

# Đọc dữ liệu từ các file
train_data = pd.read_excel('D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/train_nor_811.xlsx')
valid_data = pd.read_excel('D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/valid_nor_811.xlsx')
test_data = pd.read_excel('D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/test_nor_811.xlsx')

# Kết hợp dữ liệu train và valid để tạo tập huấn luyện
combined_data = pd.concat([train_data, valid_data], ignore_index=True)

# Tiền xử lý dữ liệu
def preprocess_text(text):
    text = text.lower()  # Chuyển về chữ thường
    text = re.sub(r'\W', ' ', text)  # Loại bỏ dấu câu
    return text

# Áp dụng tiền xử lý
combined_data['Sentence'] = combined_data['Sentence'].apply(preprocess_text)

# Chia dữ liệu thành đặc trưng và nhãn
X = combined_data['Sentence']
y = combined_data['Emotion']

# Chia tập huấn luyện thành train và test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chuyển đổi văn bản thành đặc trưng sử dụng TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Mã hóa nhãn
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Tạo mô hình MaxEnt (Logistic Regression)
model = LogisticRegression(solver='lbfgs',max_iter=1000)

# Thời gian bắt đầu huấn luyện
start_time = time.time()
model.fit(X_train_vectorized, y_train_encoded)

# Đánh giá mô hình với cross-validation
cv_scores = cross_val_score(model, X_train_vectorized, y_train_encoded, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Dự đoán trên tập test
y_pred = model.predict(X_test_vectorized)

# Đánh giá mô hình
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))

# Dự đoán trên tập test
test_vectorized = vectorizer.transform(test_data['Sentence'])
test_pred = model.predict(test_vectorized)

# Thêm dự đoán vào DataFrame
test_data['Predicted Emotion'] = label_encoder.inverse_transform(test_pred)

# Lưu kết quả ra file Excel
test_data.to_excel('test_predictions.xlsx', index=False)

# Lưu mô hình
joblib.dump(model, 'emotion_model.pkl')

# Thời gian thực hiện
end_time = time.time()
print(f"Thời gian thực hiện: {end_time - start_time} giây")

Cross-validation scores: [0.48396794 0.5260521  0.50952859 0.51955868 0.53259779]
Mean cross-validation score: 0.5143410190491314
Accuracy: 0.529270248596632
              precision    recall  f1-score   support

       Anger       0.53      0.24      0.33        79
     Disgust       0.53      0.59      0.56       257
   Enjoyment       0.56      0.76      0.64       344
        Fear       0.74      0.35      0.47        75
       Other       0.39      0.36      0.38       238
     Sadness       0.56      0.52      0.54       197
    Surprise       0.64      0.25      0.35        57

    accuracy                           0.53      1247
   macro avg       0.56      0.44      0.47      1247
weighted avg       0.53      0.53      0.52      1247

Thời gian thực hiện: 1.4918832778930664 giây


In [6]:
pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
   ---------------------------------------- 0.0/586.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/586.9 kB ? eta -:--:--
    --------------------------------------- 10.2/586.9 kB ? eta -:--:--
   -- ------------------------------------- 30.7/586.9 kB 1.4 MB/s eta 0:00:01
   ---- ---------------------------------- 61.4/586.9 kB 656.4 kB/s eta 0:00:01
   ----------- ---------------------------- 174.1/586.9 kB 1.3 MB/s eta 0:00:01
   ---------------------- ----------------- 327.7/586.9 kB 1.9 MB/s eta 0:00:01
   ------------------------------------ --- 542.7/586.9 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 586.9/586.9 kB 2.8 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
import re
from pyvi import ViTokenizer
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

# Define preprocessing functions
def preprocessEmoji(sentence):
    emotion_dict = {
        r'(:|;|=)+(\)|\]|>)+': '🙂',
        r'(:|;|=)+(\(|\[|<)+': '😞',
        r'(:|;|=)+(D|d)': '😁',
        r'(-_-)|(-\.-)': '😐',
        r':v': '_pacman_smile_',
        r'(:|;|=)+(\'|`|\")+(\)|\]|>)+': '🥲',
        r'(:|;|=)+(\'|`|\")+(\(|\[|<)+': '😢',
        r'@@': '😵‍💫',
        r'đc': 'được',
        r'đk': 'được',
        r'bik': 'biết',
        r'ngừi': 'người',
        r'hix': 'hic',
        r'lm': 'làm'
    }
    for key, value in emotion_dict.items():
        sentence = re.sub(key, value, sentence)
    sentence = emoji.demojize(sentence)
    sentence = re.sub(r":(.*?):", r" _\1_ ", sentence)
    sentence = re.sub(r'([!@#$%^&*()_+={};:"\'<>,/\\|~-])\1+', r'\1', sentence)
    return sentence

def tokenize(sentence):
    start_token = ' _s_ '
    end_token = ' _e_ '
    sentence = sentence.lower()
    sentence = preprocessEmoji(sentence)
    sentence = start_token + sentence + end_token
    return ' '.join(ViTokenizer.tokenize(sentence).split())

# Apply preprocessing to all datasets
def preprocess_data(data):
    data = data.drop(columns=['Unnamed: 0'], errors='ignore')  # Drop unnecessary column if exists
    data['Processed_Sentence'] = data['Sentence'].apply(tokenize)
    return data

# Word2Vec helper functions
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=1000):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    avg_vec = np.mean(vectorized, axis=0)
    return avg_vec

def get_word2vec_embeddings(vectors, sentences, k=1000):
    return np.array([get_average_word2vec(sentence.split(), vectors, k=k) for sentence in sentences])

# Load data
file_paths = {
    "train": 'D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/train_nor_811.xlsx',
    "test": 'D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/valid_nor_811.xlsx',
    "valid": 'D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/test_nor_811.xlsx'
}

train_data = pd.read_excel(file_paths["train"])
test_data = pd.read_excel(file_paths["test"])
valid_data = pd.read_excel(file_paths["valid"])

# Preprocess datasets
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)
valid_data = preprocess_data(valid_data)

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_data['Processed_Sentence'])
X_test_tfidf = vectorizer.transform(test_data['Processed_Sentence'])

# Word2Vec embeddings
sentences = [sentence.split() for sentence in train_data['Processed_Sentence']]
word2vec_model = Word2Vec(sentences, vector_size=1000, window=10, min_count=20, workers=100)
X_train_w2v = get_word2vec_embeddings(word2vec_model.wv, train_data['Processed_Sentence'])
X_test_w2v = get_word2vec_embeddings(word2vec_model.wv, test_data['Processed_Sentence'])

y_train = train_data['Emotion']
y_test = test_data['Emotion']

# Train and evaluate MaxEnt model with TF-IDF features
model_tfidf = LogisticRegression(C= 2.0 ,solver='lbfgs',max_iter=7000)
model_tfidf.fit(X_train_tfidf, y_train)

# Evaluate TF-IDF model
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
print("Results using TF-IDF features:")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))

# Train and evaluate MaxEnt model with Word2Vec features
model_w2v = LogisticRegression(C= 700.0,solver='lbfgs',max_iter=7000)
model_w2v.fit(X_train_w2v, y_train)

# Evaluate Word2Vec model
y_pred_w2v = model_w2v.predict(X_test_w2v)
print("\nResults using Word2Vec features:")
print("Accuracy:", accuracy_score(y_test, y_pred_w2v))
print("Classification Report:\n", classification_report(y_test, y_pred_w2v))

Results using TF-IDF features:
Accuracy: 0.5714285714285714
Classification Report:
               precision    recall  f1-score   support

       Anger       0.52      0.35      0.41        49
     Disgust       0.52      0.59      0.56       135
   Enjoyment       0.69      0.78      0.73       214
        Fear       0.72      0.42      0.53        31
       Other       0.44      0.42      0.43       141
     Sadness       0.52      0.59      0.55        86
    Surprise       0.75      0.20      0.32        30

    accuracy                           0.57       686
   macro avg       0.59      0.48      0.50       686
weighted avg       0.58      0.57      0.56       686


Results using Word2Vec features:
Accuracy: 0.4446064139941691
Classification Report:
               precision    recall  f1-score   support

       Anger       0.36      0.10      0.16        49
     Disgust       0.38      0.48      0.42       135
   Enjoyment       0.51      0.71      0.59       214
        Fear   

LSTM

In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
import re
from pyvi import ViTokenizer
import emoji

In [7]:
def preprocessEmoji(sentence):
    emotion_dict = {
        r'(:|;|=)+(\)|]|>)+': '🙂', 
        r'(:|;|=)+(\(|\[|<)+': '😞', 
        r'(:|;|=)+(D|d)': '😁',
        r'(-_-)|(-\.-)': '😐',
        r':v': '_pacman_smile_',
        r'(:|;|=)+(\'|`|\")+(\)|]|>)+': '🥲', 
        r'(:|;|=)+(\'|`|\")+(\(|\[|<)+': '😢',
        r'@@': '😵‍💫',
        r'\bđc\b': 'được',
        r'\bđk\b': 'được',
        r'\bbik\b': 'biết',
        r'\bngừi\b': 'người',
        r'\bhix\b': 'hic',
        r'\blm\b': 'làm'
    }
    for key, value in emotion_dict.items():
        sentence = re.sub(key, value, sentence)
    sentence = emoji.demojize(sentence)
    sentence = re.sub(r":(.*?):", r" _\1_ ", sentence)
    sentence = re.sub(r'([!@#$%^&*()_+={};:"\'<>,?/\|~-])\1+', r'\1', sentence)
    return sentence

def tokenize(sentence):
    start_token = ' _s_ '
    end_token = ' _e_ '
    sentence = sentence.lower()
    sentence = preprocessEmoji(sentence)
    sentence = start_token + sentence + end_token
    return ' '.join(ViTokenizer.tokenize(sentence).split())

# Kiểm tra sự tồn tại của các file
files = [
    'D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/train_nor_811.xlsx',
    'D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/valid_nor_811.xlsx',
    'D:/HK5/NLP/UIT-VSMEC-20241212T141641Z-001/UIT-VSMEC/test_nor_811.xlsx'
]

for file in files:
    if not os.path.exists(file):
        raise FileNotFoundError(f"File {file} not found.")

# Đọc dữ liệu từ các file
train_data = pd.read_excel(files[0])
valid_data = pd.read_excel(files[1])
test_data = pd.read_excel(files[2])

# Kết hợp dữ liệu train và valid để tạo tập huấn luyện
combined_data = pd.concat([train_data, valid_data], ignore_index=True)

# Tiền xử lý dữ liệu
combined_data['Sentence'] = combined_data['Sentence'].apply(tokenize)

# Chia dữ liệu thành đặc trưng và nhãn
X = combined_data['Sentence']
y = combined_data['Emotion']

# Chia tập huấn luyện thành train và test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chuyển đổi nhãn thành số
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Sử dụng Tokenizer để chuyển đổi câu thành vector
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Chuyển đổi văn bản thành số
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding để đảm bảo tất cả các câu có cùng độ dài
max_length = max([len(seq) for seq in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Xây dựng mô hình LSTM
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(LSTM(64))
model_lstm.add(Dropout(0.2))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dense(1, activation='sigmoid'))  # Thay đổi kích hoạt nếu cần cho bài toán phân loại

# Biên dịch mô hình
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Huấn luyện mô hình
model_lstm.fit(X_train_pad, y_train_encoded, epochs=100, batch_size=32)

# Đánh giá mô hình
accuracy = model_lstm.evaluate(X_test_pad, y_test_encoded)
print(f'Precision Accuracy with LSTM: {accuracy[1] * 100:.2f}%')

Epoch 1/100




[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 59ms/step - accuracy: 0.1982 - loss: -46.1175
Epoch 2/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.1909 - loss: -457.3328
Epoch 3/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 57ms/step - accuracy: 0.1879 - loss: -1399.7852
Epoch 4/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 57ms/step - accuracy: 0.1945 - loss: -2793.3647
Epoch 5/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 57ms/step - accuracy: 0.1918 - loss: -4788.7822
Epoch 6/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.1833 - loss: -7258.2212
Epoch 7/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.1866 - loss: -9931.4639
Epoch 8/100
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.1893 - loss: -13398.0850
Epoch