In [None]:
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# ASPECT CATEGORY DETECTION

In [None]:
# Load the three raw splits, one text line per row.
# pandas >= 1.3 raises ValueError for read_csv(..., sep='\n'), so the files are
# read as plain text lines instead. Blank lines are dropped and the text is kept
# verbatim (no CSV quote handling, no NA conversion).
def _read_lines(path):
    """Return a one-column DataFrame (column label 0) holding each non-blank line of `path`."""
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark if one is present.
    with open(path, encoding='utf-8-sig') as handle:
        rows = [line.rstrip('\r\n') for line in handle if line.strip()]
    return pd.DataFrame({0: rows})

df_train = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Train.txt')
df_dev = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Dev.txt')
df_test = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Test.txt')

df_train.head(10)

Unnamed: 0,0
0,#1
1,Giá 53k size vừa.
2,"{DRINKS#PRICES, neutral}, {DRINKS#STYLE&OPTION..."
3,#2
4,Nhưng nói chung cũng hơi đắt.
5,"{RESTAURANT#PRICES, negative}"
6,#3
7,Mình ăn rất hôi mùi dầu.
8,"{FOOD#QUALITY, negative}"
9,#4


In [None]:
# Extract the review sentences
def get_review(df):
    """Return the review rows of a raw split as a one-column frame named 'comment'.

    The rows at positions 1, 4, 7, ... (position % 3 == 1) are kept and the
    index of the result is renumbered from 0.
    """
    review_rows = df.iloc[1::3]
    df_review = review_rows.reset_index(drop=True)
    df_review.columns = ['comment']
    return df_review

# Build the review frame of every split in one pass (train, dev, test order).
df_review_train, df_review_dev, df_review_test = (
    get_review(raw_split) for raw_split in (df_train, df_dev, df_test)
)

df_review_train.head(5)

Unnamed: 0,comment
0,Giá 53k size vừa.
1,Nhưng nói chung cũng hơi đắt.
2,Mình ăn rất hôi mùi dầu.
3,Mình ăn chưa baoh thấy mùi hôi hải sản.
4,3 dĩa vs 2 lon Revive mà có 190k thui(.


In [None]:
# Number of reviews in the train, dev and test splits.
tuple(map(len, (df_review_train, df_review_dev, df_review_test)))

(7028, 771, 1938)

In [None]:
# Extract the raw aspect annotations
def get_aspect(df):
    """Return the annotation lines of a raw split as a list.

    The values of the first column at row positions 2, 5, 8, ...
    (position % 3 == 2) are returned in file order. Selection is purely
    positional, so it does not depend on how the column is labelled.
    A frame without columns yields an empty list.
    """
    if df.shape[1] == 0:
        return []
    # One strided slice instead of building a row Series per line.
    return df.iloc[2::3, 0].tolist()

# Clean the aspect annotations
def preprocessing_aspect(list_aspect):
    """Strip polarity words and punctuation from raw annotation strings.

    For every string, each ', negative' / ', neutral' / ', positive' fragment
    is removed, then all '{', '}' and ',' characters are removed, leaving the
    aspect names separated by spaces.

    Parameters
    ----------
    list_aspect : list of str
        Raw annotation lines; the list is not modified.

    Returns
    -------
    pandas.DataFrame
        One column named 'aspect', one row per input string (an empty frame
        with that column when the input is empty).
    """
    cleaned = []
    for raw in list_aspect:
        without_polarity = re.sub(r', (?:negative|neutral|positive)', '', raw)
        cleaned.append(re.sub('[{},]', '', without_polarity))
    # Naming the column at construction also works for an empty input list.
    return pd.DataFrame({'aspect': cleaned})

# Raw annotation lines of each split, then their cleaned one-column frames.
list_aspect_train, list_aspect_dev, list_aspect_test = map(
    get_aspect, (df_train, df_dev, df_test)
)

df_aspect_train, df_aspect_dev, df_aspect_test = map(
    preprocessing_aspect, (list_aspect_train, list_aspect_dev, list_aspect_test)
)

df_aspect_train.head()

Unnamed: 0,aspect
0,DRINKS#PRICES DRINKS#STYLE&OPTIONS
1,RESTAURANT#PRICES
2,FOOD#QUALITY
3,FOOD#QUALITY
4,RESTAURANT#PRICES


In [None]:
# New dataframes: review text and aspect string side by side for every split
df_train, df_dev, df_test = (
    pd.concat(pair, axis=1, join="inner")
    for pair in (
        [df_review_train, df_aspect_train],
        [df_review_dev, df_aspect_dev],
        [df_review_test, df_aspect_test],
    )
)

df_train.head()

Unnamed: 0,comment,aspect
0,Giá 53k size vừa.,DRINKS#PRICES DRINKS#STYLE&OPTIONS
1,Nhưng nói chung cũng hơi đắt.,RESTAURANT#PRICES
2,Mình ăn rất hôi mùi dầu.,FOOD#QUALITY
3,Mình ăn chưa baoh thấy mùi hôi hải sản.,FOOD#QUALITY
4,3 dĩa vs 2 lon Revive mà có 190k thui(.,RESTAURANT#PRICES


In [None]:
# List of aspects found in the training split
# dict.fromkeys de-duplicates while keeping first-seen order.
list_aspect = list(dict.fromkeys(
    name for joined in df_train['aspect'] for name in joined.split()
))

print('Số lượng aspect: ', len(list_aspect), '\n')
list_aspect = sorted(list_aspect)
list_aspect

Số lượng aspect:  12 



['AMBIENCE#GENERAL',
 'DRINKS#PRICES',
 'DRINKS#QUALITY',
 'DRINKS#STYLE&OPTIONS',
 'FOOD#PRICES',
 'FOOD#QUALITY',
 'FOOD#STYLE&OPTIONS',
 'LOCATION#GENERAL',
 'RESTAURANT#GENERAL',
 'RESTAURANT#MISCELLANEOUS',
 'RESTAURANT#PRICES',
 'SERVICE#GENERAL']

In [None]:
# Change the aspect format: space-separated string -> list of aspect names.
# The whole column is assigned at once. The previous per-cell chained assignment
# (df['aspect'][i] = ...) triggers SettingWithCopyWarning and does not update
# the frame when pandas Copy-on-Write is active.
# str.split is applied directly so a non-string cell (e.g. on an accidental
# re-run of this cell) fails loudly instead of being silently turned into NaN.
df_train['aspect'] = df_train['aspect'].map(str.split)
df_dev['aspect'] = df_dev['aspect'].map(str.split)
df_test['aspect'] = df_test['aspect'].map(str.split)

df_train.head()

Unnamed: 0,comment,aspect
0,Giá 53k size vừa.,"[DRINKS#PRICES, DRINKS#STYLE&OPTIONS]"
1,Nhưng nói chung cũng hơi đắt.,[RESTAURANT#PRICES]
2,Mình ăn rất hôi mùi dầu.,[FOOD#QUALITY]
3,Mình ăn chưa baoh thấy mùi hôi hải sản.,[FOOD#QUALITY]
4,3 dĩa vs 2 lon Revive mà có 190k thui(.,[RESTAURANT#PRICES]


In [None]:
# Final aspect encoding: replace each aspect list with the binarizer's output row
mlb = MultiLabelBinarizer()

# The binarizer is fitted on the training split only; dev and test reuse it.
train_matrix = mlb.fit_transform(df_train['aspect'])
df_train['aspect'] = train_matrix.tolist()
for held_out in (df_dev, df_test):
    held_out['aspect'] = mlb.transform(held_out['aspect']).tolist()

df_train.head()

Unnamed: 0,comment,aspect
0,Giá 53k size vừa.,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Nhưng nói chung cũng hơi đắt.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
2,Mình ăn rất hôi mùi dầu.,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,Mình ăn chưa baoh thấy mùi hôi hải sản.,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
4,3 dĩa vs 2 lon Revive mà có 190k thui(.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [None]:
# Save the data (one CSV per split, without the index column)
for frame, destination in (
    (df_train, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/ACD_data/ACD_train.csv'),
    (df_dev, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/ACD_data/ACD_dev.csv'),
    (df_test, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/ACD_data/ACD_test.csv'),
):
    frame.to_csv(destination, index = False)

# ASPECT POLARITY DETECTION (Head)

In [None]:
# Reload the three raw splits, one text line per row.
# pandas >= 1.3 raises ValueError for read_csv(..., sep='\n'), so the files are
# read as plain text lines instead. Blank lines are dropped and the text is kept
# verbatim (no CSV quote handling, no NA conversion).
def _read_lines(path):
    """Return a one-column DataFrame (column label 0) holding each non-blank line of `path`."""
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark if one is present.
    with open(path, encoding='utf-8-sig') as handle:
        rows = [line.rstrip('\r\n') for line in handle if line.strip()]
    return pd.DataFrame({0: rows})

df_train = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Train.txt')
df_dev = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Dev.txt')
df_test = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Test.txt')

df_train.head(10)

Unnamed: 0,0
0,#1
1,Giá 53k size vừa.
2,"{DRINKS#PRICES, neutral}, {DRINKS#STYLE&OPTION..."
3,#2
4,Nhưng nói chung cũng hơi đắt.
5,"{RESTAURANT#PRICES, negative}"
6,#3
7,Mình ăn rất hôi mùi dầu.
8,"{FOOD#QUALITY, negative}"
9,#4


In [None]:
# Extract the review sentences
def get_review(df):
    """Select every row whose position satisfies position % 3 == 1.

    Returns a new frame with a fresh 0..n-1 index and its single column
    renamed to 'comment'.
    """
    review_positions = list(range(1, len(df), 3))
    df_review = df.iloc[review_positions].reset_index(drop=True)
    df_review.columns = ['comment']
    return df_review

# Review frames for the train, dev and test splits.
df_review_train, df_review_dev, df_review_test = map(
    get_review, (df_train, df_dev, df_test)
)

df_review_train.head(5)

Unnamed: 0,comment
0,Giá 53k size vừa.
1,Nhưng nói chung cũng hơi đắt.
2,Mình ăn rất hôi mùi dầu.
3,Mình ăn chưa baoh thấy mùi hôi hải sản.
4,3 dĩa vs 2 lon Revive mà có 190k thui(.


In [None]:
# Extract the raw label lines
def get_label(df):
    """Return the annotation lines of a raw split as a list.

    The values of the first column at row positions 2, 5, 8, ...
    (position % 3 == 2) are returned in file order. Selection is purely
    positional, so it does not depend on how the column is labelled.
    A frame without columns yields an empty list.
    """
    if df.shape[1] == 0:
        return []
    # One strided slice instead of building a row Series per line.
    return df.iloc[2::3, 0].tolist()

# Raw label lines of each split.
list_label_train, list_label_dev, list_label_test = (
    get_label(raw_split) for raw_split in (df_train, df_dev, df_test)
)

list_label_train[:5]

['{DRINKS#PRICES, neutral}, {DRINKS#STYLE&OPTIONS, neutral}',
 '{RESTAURANT#PRICES, negative}',
 '{FOOD#QUALITY, negative}',
 '{FOOD#QUALITY, positive}',
 '{RESTAURANT#PRICES, positive}']

In [None]:
# Clean the labels
def preprocessing_label(list_label):
    """Tokenize raw annotation strings.

    Every '{', '}' and ',' character is removed from each string and the
    remainder is split on whitespace.

    Parameters
    ----------
    list_label : list of str
        Raw annotation lines; the list is left unchanged.

    Returns
    -------
    list of list of str
        One token list per input string, in the same order.
    """
    # Build a new list rather than overwriting the caller's entries in place.
    return [re.sub('[{},]', '', raw).split() for raw in list_label]

# Tokenize the label lines of every split.
list_label_train, list_label_dev, list_label_test = map(
    preprocessing_label, (list_label_train, list_label_dev, list_label_test)
)

list_label_train[:5]

[['DRINKS#PRICES', 'neutral', 'DRINKS#STYLE&OPTIONS', 'neutral'],
 ['RESTAURANT#PRICES', 'negative'],
 ['FOOD#QUALITY', 'negative'],
 ['FOOD#QUALITY', 'positive'],
 ['RESTAURANT#PRICES', 'positive']]

In [None]:
# Encode the polarity
def preprocessing_sentiment(list_label):
    """Replace polarity words with integer codes.

    In every token list the odd positions (1, 3, 5, ...) hold the polarity:
    'negative' -> 1, 'neutral' -> 2, 'positive' -> 3. Values that are already
    1, 2 or 3 are kept, so applying the function twice gives the same result.

    Parameters
    ----------
    list_label : list of list
        Token lists of the form [aspect, polarity, aspect, polarity, ...];
        neither the outer list nor the inner lists are modified.

    Returns
    -------
    list of list
        New token lists with the polarity positions encoded.

    Raises
    ------
    ValueError
        If a polarity position holds anything other than the three known
        words or codes (previously such values were silently encoded as 3).
    """
    polarity_code = {'negative': 1, 'neutral': 2, 'positive': 3}
    encoded = []
    for tokens in list_label:
        row = list(tokens)
        for j in range(1, len(row), 2):
            value = row[j]
            if value in polarity_code:
                row[j] = polarity_code[value]
            elif value not in (1, 2, 3):
                raise ValueError('Unknown polarity label: ' + repr(value))
        encoded.append(row)
    return encoded

# Encode the polarity tokens of every split.
list_label_train, list_label_dev, list_label_test = map(
    preprocessing_sentiment, (list_label_train, list_label_dev, list_label_test)
)

list_label_train[:5]

[['DRINKS#PRICES', 2, 'DRINKS#STYLE&OPTIONS', 2],
 ['RESTAURANT#PRICES', 1],
 ['FOOD#QUALITY', 1],
 ['FOOD#QUALITY', 3],
 ['RESTAURANT#PRICES', 3]]

In [None]:
# List of aspects found in the training labels
# Aspect names sit at the even positions of each token list;
# dict.fromkeys de-duplicates while keeping first-seen order.
list_aspect = list(dict.fromkeys(
    name for tokens in list_label_train for name in tokens[0::2]
))

print('Số lượng aspect: ', len(list_aspect), '\n')
list_aspect = sorted(list_aspect)
list_aspect

Số lượng aspect:  12 



['AMBIENCE#GENERAL',
 'DRINKS#PRICES',
 'DRINKS#QUALITY',
 'DRINKS#STYLE&OPTIONS',
 'FOOD#PRICES',
 'FOOD#QUALITY',
 'FOOD#STYLE&OPTIONS',
 'LOCATION#GENERAL',
 'RESTAURANT#GENERAL',
 'RESTAURANT#MISCELLANEOUS',
 'RESTAURANT#PRICES',
 'SERVICE#GENERAL']

In [None]:
# Map each aspect category label to a Vietnamese phrase describing it.
# (The original comment called this an aspect-to-index dictionary, but the
# values are text phrases; `tranform` below joins them with the review text.)
dict_aspect = {
    'AMBIENCE#GENERAL': 'không gian tổng quát',
    'DRINKS#PRICES': 'đồ uống giá tiền',
    'DRINKS#QUALITY': 'đồ uống chất lượng',
    'DRINKS#STYLE&OPTIONS': 'đồ uống loại',
    'FOOD#PRICES': 'thức ăn giá tiền',
    'FOOD#QUALITY': 'thức ăn chất lượng',
    'FOOD#STYLE&OPTIONS': 'thức ăn loại',
    'LOCATION#GENERAL': 'vị trí tổng quát',
    'RESTAURANT#GENERAL': 'nhà hàng tổng quát',
    'RESTAURANT#MISCELLANEOUS': 'nhà hàng điều khoản khác',
    'RESTAURANT#PRICES': 'nhà hàng giá tiền',
    'SERVICE#GENERAL': 'phục vụ tổng quát'
}

In [None]:
# Combine each comment with its label list
# The label column is added to the review frames themselves; df_train/df_dev/df_test
# then refer to those same frames (no copy is made).
for review_frame, labels in (
    (df_review_train, list_label_train),
    (df_review_dev, list_label_dev),
    (df_review_test, list_label_test),
):
    review_frame['label'] = labels
df_train, df_dev, df_test = df_review_train, df_review_dev, df_review_test

df_train.head(5)

Unnamed: 0,comment,label
0,Giá 53k size vừa.,"[DRINKS#PRICES, 2, DRINKS#STYLE&OPTIONS, 2]"
1,Nhưng nói chung cũng hơi đắt.,"[RESTAURANT#PRICES, 1]"
2,Mình ăn rất hôi mùi dầu.,"[FOOD#QUALITY, 1]"
3,Mình ăn chưa baoh thấy mùi hôi hải sản.,"[FOOD#QUALITY, 3]"
4,3 dĩa vs 2 lon Revive mà có 190k thui(.,"[RESTAURANT#PRICES, 3]"


In [None]:
def tranform(df, dict_aspect):
    """Expand each review into one (text, polarity) row per annotated aspect.

    The aspect phrase is placed in front of the review ("head" variant).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'comment' column (review text) and a 'label' column holding
        lists of the form [aspect, polarity, aspect, polarity, ...]. Rows are
        read in order, so any index is accepted.
    dict_aspect : dict
        Maps an aspect label to the phrase inserted into the text.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' (phrase + ' ' + review) and 'polarity' (the label's
        polarity code minus 1). Empty input gives an empty frame with these
        two columns.

    Raises
    ------
    KeyError
        If an aspect is missing from `dict_aspect`.
    IndexError
        If a label list has an aspect without a following polarity.
    """
    data = []
    # Iterate the two columns together rather than looking rows up by index label.
    for review, list_label in zip(df['comment'], df['label']):
        for j in range(0, len(list_label), 2):
            aspect = list_label[j]
            polarity = int(list_label[j + 1]) - 1
            new_review = dict_aspect[aspect] + ' ' + review
            data.append([new_review, polarity])
    # Naming the columns at construction also works when `data` is empty.
    return pd.DataFrame(data, columns=['text', 'polarity'])

# Expand every split into (text, polarity) rows.
df_train, df_dev, df_test = (
    tranform(split_frame, dict_aspect) for split_frame in (df_train, df_dev, df_test)
)

df_train.head(5)

Unnamed: 0,text,polarity
0,đồ uống giá tiền Giá 53k size vừa.,1
1,đồ uống loại Giá 53k size vừa.,1
2,nhà hàng giá tiền Nhưng nói chung cũng hơi đắt.,0
3,thức ăn chất lượng Mình ăn rất hôi mùi dầu.,0
4,thức ăn chất lượng Mình ăn chưa baoh thấy mùi ...,2


In [None]:
# Save the data (one CSV per split, without the index column)
for frame, destination in (
    (df_train, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/APD_data/APD_train.csv'),
    (df_dev, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/APD_data/APD_dev.csv'),
    (df_test, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/APD_data/APD_test.csv'),
):
    frame.to_csv(destination, index = False)

# ASPECT POLARITY DETECTION (Tail)

In [None]:
# Reload the three raw splits, one text line per row.
# pandas >= 1.3 raises ValueError for read_csv(..., sep='\n'), so the files are
# read as plain text lines instead. Blank lines are dropped and the text is kept
# verbatim (no CSV quote handling, no NA conversion).
def _read_lines(path):
    """Return a one-column DataFrame (column label 0) holding each non-blank line of `path`."""
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark if one is present.
    with open(path, encoding='utf-8-sig') as handle:
        rows = [line.rstrip('\r\n') for line in handle if line.strip()]
    return pd.DataFrame({0: rows})

df_train = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Train.txt')
df_dev = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Dev.txt')
df_test = _read_lines('/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data/Test.txt')

df_train.head(10)

Unnamed: 0,0
0,#1
1,Giá 53k size vừa.
2,"{DRINKS#PRICES, neutral}, {DRINKS#STYLE&OPTION..."
3,#2
4,Nhưng nói chung cũng hơi đắt.
5,"{RESTAURANT#PRICES, negative}"
6,#3
7,Mình ăn rất hôi mùi dầu.
8,"{FOOD#QUALITY, negative}"
9,#4


In [None]:
# Extract the review sentences
def get_review(df):
    """Keep the rows at positions 1, 4, 7, ... and expose them as column 'comment'.

    The returned frame is re-indexed from 0; the input frame's column labels
    are left as they were.
    """
    wanted = [position for position in range(len(df)) if position % 3 == 1]
    df_review = df.iloc[wanted].reset_index(drop=True)
    df_review.columns = ['comment']
    return df_review

# Review frames for the train, dev and test splits.
df_review_train, df_review_dev, df_review_test = [
    get_review(raw_split) for raw_split in (df_train, df_dev, df_test)
]

df_review_train.head(5)

Unnamed: 0,comment
0,Giá 53k size vừa.
1,Nhưng nói chung cũng hơi đắt.
2,Mình ăn rất hôi mùi dầu.
3,Mình ăn chưa baoh thấy mùi hôi hải sản.
4,3 dĩa vs 2 lon Revive mà có 190k thui(.


In [None]:
# Extract the raw label lines
def get_label(df):
    """Return the annotation lines of a raw split as a list.

    The values of the first column at row positions 2, 5, 8, ...
    (position % 3 == 2) are returned in file order. Selection is purely
    positional, so it does not depend on how the column is labelled.
    A frame without columns yields an empty list.
    """
    if df.shape[1] == 0:
        return []
    # One strided slice instead of building a row Series per line.
    return df.iloc[2::3, 0].tolist()

# Raw label lines of each split.
list_label_train, list_label_dev, list_label_test = map(
    get_label, (df_train, df_dev, df_test)
)

list_label_train[:5]

In [None]:
# Clean the labels
def preprocessing_label(list_label):
    """Tokenize raw annotation strings.

    Every '{', '}' and ',' character is removed from each string and the
    remainder is split on whitespace.

    Parameters
    ----------
    list_label : list of str
        Raw annotation lines; the list is left unchanged.

    Returns
    -------
    list of list of str
        One token list per input string, in the same order.
    """
    # Build a new list rather than overwriting the caller's entries in place.
    return [re.sub('[{},]', '', raw).split() for raw in list_label]

# Tokenize the label lines of every split.
list_label_train, list_label_dev, list_label_test = (
    preprocessing_label(lines) for lines in (list_label_train, list_label_dev, list_label_test)
)

list_label_train[:5]

In [None]:
# Encode the polarity
def preprocessing_sentiment(list_label):
    """Replace polarity words with integer codes.

    In every token list the odd positions (1, 3, 5, ...) hold the polarity:
    'negative' -> 1, 'neutral' -> 2, 'positive' -> 3. Values that are already
    1, 2 or 3 are kept, so applying the function twice gives the same result.

    Parameters
    ----------
    list_label : list of list
        Token lists of the form [aspect, polarity, aspect, polarity, ...];
        neither the outer list nor the inner lists are modified.

    Returns
    -------
    list of list
        New token lists with the polarity positions encoded.

    Raises
    ------
    ValueError
        If a polarity position holds anything other than the three known
        words or codes (previously such values were silently encoded as 3).
    """
    polarity_code = {'negative': 1, 'neutral': 2, 'positive': 3}
    encoded = []
    for tokens in list_label:
        row = list(tokens)
        for j in range(1, len(row), 2):
            value = row[j]
            if value in polarity_code:
                row[j] = polarity_code[value]
            elif value not in (1, 2, 3):
                raise ValueError('Unknown polarity label: ' + repr(value))
        encoded.append(row)
    return encoded

# Encode the polarity tokens of every split.
list_label_train, list_label_dev, list_label_test = (
    preprocessing_sentiment(tokens) for tokens in (list_label_train, list_label_dev, list_label_test)
)

list_label_train[:5]

In [None]:
# List of aspects found in the training labels
# Aspect names sit at the even positions of each token list;
# dict.fromkeys de-duplicates while keeping first-seen order.
list_aspect = list(dict.fromkeys(
    tokens[position]
    for tokens in list_label_train
    for position in range(0, len(tokens), 2)
))

print('Số lượng aspect: ', len(list_aspect), '\n')
list_aspect = sorted(list_aspect)
list_aspect

In [None]:
# Map each aspect category label to a Vietnamese phrase describing it.
# (The original comment called this an aspect-to-index dictionary, but the
# values are text phrases; `tranform` below joins them with the review text.)
dict_aspect = {
    'AMBIENCE#GENERAL': 'không gian tổng quát',
    'DRINKS#PRICES': 'đồ uống giá tiền',
    'DRINKS#QUALITY': 'đồ uống chất lượng',
    'DRINKS#STYLE&OPTIONS': 'đồ uống loại',
    'FOOD#PRICES': 'thức ăn giá tiền',
    'FOOD#QUALITY': 'thức ăn chất lượng',
    'FOOD#STYLE&OPTIONS': 'thức ăn loại',
    'LOCATION#GENERAL': 'vị trí tổng quát',
    'RESTAURANT#GENERAL': 'nhà hàng tổng quát',
    'RESTAURANT#MISCELLANEOUS': 'nhà hàng điều khoản khác',
    'RESTAURANT#PRICES': 'nhà hàng giá tiền',
    'SERVICE#GENERAL': 'phục vụ tổng quát'
}

In [None]:
# Combine each comment with its label list
# The label column is added to the review frames themselves; df_train/df_dev/df_test
# then refer to those same frames (no copy is made).
for review_frame, labels in zip(
    (df_review_train, df_review_dev, df_review_test),
    (list_label_train, list_label_dev, list_label_test),
):
    review_frame['label'] = labels
df_train, df_dev, df_test = df_review_train, df_review_dev, df_review_test

df_train.head(5)

In [None]:
def tranform(df, dict_aspect):
    """Expand each review into one (text, polarity) row per annotated aspect.

    The aspect phrase is appended after the review ("tail" variant).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'comment' column (review text) and a 'label' column holding
        lists of the form [aspect, polarity, aspect, polarity, ...]. Rows are
        read in order, so any index is accepted.
    dict_aspect : dict
        Maps an aspect label to the phrase inserted into the text.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' (review + ' ' + phrase) and 'polarity' (the label's
        polarity code minus 1). Empty input gives an empty frame with these
        two columns.

    Raises
    ------
    KeyError
        If an aspect is missing from `dict_aspect`.
    IndexError
        If a label list has an aspect without a following polarity.
    """
    data = []
    # Iterate the two columns together rather than looking rows up by index label.
    for review, list_label in zip(df['comment'], df['label']):
        for j in range(0, len(list_label), 2):
            aspect = list_label[j]
            polarity = int(list_label[j + 1]) - 1
            new_review = review + ' ' + dict_aspect[aspect]
            data.append([new_review, polarity])
    # Naming the columns at construction also works when `data` is empty.
    return pd.DataFrame(data, columns=['text', 'polarity'])

# Expand every split into (text, polarity) rows.
df_train, df_dev, df_test = [
    tranform(split_frame, dict_aspect) for split_frame in (df_train, df_dev, df_test)
]

df_train.head(5)

In [None]:
# Mount Google Drive at '/content/drive', the root of the paths used in this notebook.
# NOTE(review): earlier cells already read from '/content/drive', so on a fresh
# kernel this mount presumably has to run before them - consider moving this cell
# (and its import) to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Save the data (one CSV per split, without the index column)
for frame, destination in (
    (df_train, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/APD_data/APD_train_tail.csv'),
    (df_dev, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/APD_data/APD_dev_tail.csv'),
    (df_test, '/content/drive/MyDrive/NLP for Data Science/NLP_Project/Data_csv/APD_data/APD_test_tail.csv'),
):
    frame.to_csv(destination, index = False)