In [None]:
# Colab-only: attach Google Drive so the dataset files under
# /content/drive/MyDrive are readable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Read the training data from file: one "<label>\t<sentence>" record per line.
sentences = []
labels = []
with open('/content/drive/MyDrive/myDrive/train.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Strip BEFORE looking for the separator: a record such as "label\t\n"
        # contains a tab in the raw line but not after stripping, which would
        # otherwise make the unpacking below fail with ValueError.
        record = line.strip()
        if '\t' not in record:
            continue
        # maxsplit=1: only the first tab separates label from text, so a
        # sentence that itself contains tabs is kept whole instead of raising.
        label, sentence = record.split('\t', 1)
        sentences.append(sentence)
        labels.append(label)

# Preprocessing resources shared by the text clean-up step.
# The stop-word corpus has to be downloaded before it can be read.
nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

def preprocess(sentence):
    """Normalise one sentence for tokenisation.

    The text is lower-cased, every run of non-word characters is replaced by
    a single space, tokens found in the module-level ``stop_words`` set are
    dropped, and the remaining tokens are passed through the module-level
    ``stemmer``.

    Args:
        sentence: Raw input text.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    normalised = re.sub(r'\W+', ' ', sentence.lower())
    # The stop-word test is applied to the un-stemmed token, then the token is stemmed.
    kept = [stemmer.stem(token) for token in normalised.split() if token not in stop_words]
    return ' '.join(kept)

# Clean every training sentence with the shared preprocess() helper.
sentences = [preprocess(sentence) for sentence in sentences]

# Normalise the label strings. Labels in the data can carry trailing padding
# (e.g. 'neutral    '), which would otherwise be treated as a class distinct
# from the unpadded spelling and leak into the predicted/exported labels.
labels = [label.strip() for label in labels]

# Convert the text into integer sequences with a Tokenizer (vocabulary capped
# via num_words=5000), then pad/truncate every sequence to 200 tokens.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)
X = tokenizer.texts_to_sequences(sentences)
X = pad_sequences(X, maxlen=200)

# Encode the labels on the FULL label set before splitting, so a class that
# happens to fall only into the test split cannot make transform() raise.
le = LabelEncoder()
labels_enc = le.fit_transform(labels)
num_classes = len(le.classes_)

# Split the data into train and test subsets (same seed and ratio as before).
X_train, X_test, y_train_enc, y_test_enc = train_test_split(
    X, labels_enc, test_size=0.2, random_state=42
)

# One-hot encode the labels. num_classes is passed explicitly so both
# matrices always have the same width, even if the highest class index is
# missing from one of the splits.
y_train = to_categorical(y_train_enc, num_classes=num_classes)
y_test = to_categorical(y_test_enc, num_classes=num_classes)

# Build the LSTM model. The output layer is sized from the data instead of a
# hard-coded class count, so it always matches the one-hot label width.
model = Sequential()
model.add(Embedding(5000, 128, input_length=X.shape[1]))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): the held-out split doubles as validation data here; the F1
# below is therefore measured on data that was monitored during training.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

# Evaluate the model with the weighted F1-score on the held-out split.
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
f1 = f1_score(np.argmax(y_test, axis=1), y_pred, average='weighted')
print("F1-score:", f1)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1-score: 0.6649505579424531


In [None]:
# Load the unlabeled data to predict on: one sentence per line.
with open('/content/drive/MyDrive/myDrive/data_phase_1.txt', 'r', encoding='utf-8') as f:
    phase_0 = list(f)

# Run the model: apply the same clean-up and tokenizer used for training.
T = tokenizer.texts_to_sequences([preprocess(p) for p in phase_0])
# Pad to the training sequence length rather than repeating the literal, so
# the two cannot drift apart.
T = pad_sequences(T, maxlen=X.shape[1])
pred_probs = model.predict(T)
pred_ids = np.argmax(pred_probs, axis=1)
Pred = le.inverse_transform(pred_ids)

# Show a short preview instead of dumping the whole prediction array.
print(len(Pred))
print(Pred[:10])

# Export the labels only, one per line, in input order. Each label is
# stripped so padding carried over from the training labels
# (e.g. 'neutral    ') does not end up in the output file.
with open("/content/drive/MyDrive/myDrive/output_with_labels.txt", "w", encoding="utf-8") as f:
    f.writelines(label.strip() + '\n' for label in Pred)


['negative' 'negative' 'negative' 'positive' 'positive' 'positive'
 'positive' 'neutral    ' 'positive' 'positive' 'positive' 'negative'
 'negative' 'negative' 'positive' 'neutral    ' 'negative' 'negative'
 'negative' 'positive' 'neutral    ' 'positive' 'neutral    ' 'negative'
 'negative' 'neutral    ' 'neutral    ' 'negative' 'negative' 'positive'
 'neutral    ' 'positive' 'negative' 'positive' 'positive' 'neutral    '
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'positive' 'negative' 'negative' 'negative' 'negative'
 'negative' 'positive' 'neutral    ' 'neutral    ' 'positive' 'positive'
 'negative' 'negative' 'neutral    ' 'negative' 'negative' 'neutral    '
 'negative' 'positive' 'neutral    ' 'negative' 'neutral    ' 'negative'
 'negative' 'positive' 'negative' 'negative' 'neutral    ' 'neutral    '
 'positive' 'positive' 'negative' 'neutral    ' 'negative' 'positive'
 'negative' 'positive' 'positive' 'negative' 'neutral    ' 'positive'
 'neutra