# 基礎文字前處理與關鍵詞搜尋
## 本範例重點
1. 文字前處理
    - 移除標點符號
    - 展開縮寫
    - 統一轉換為小寫
2. 關鍵詞搜尋
    - NLTK
    - Wordcloud
    - 單純貝氏分類器
    - KeyBERT

#### 匯入相關套件

In [None]:
import re
import glob
import string

import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', None)

from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import word_tokenize as wt
from nltk.corpus import stopwords

from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

from wordcloud import WordCloud
from matplotlib import pyplot as plt

from collections import Counter as cnt

from pathlib import Path

#### 讀取檔案

本範例使用 [Pang & Lee Movie Review (Cornell Movie Review) Dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/)。

In [None]:
# Base working path: the parent directory of the current working directory
cwd = Path.cwd().parent
cwd

In [None]:
# Path of the dataset CSV, located under <cwd>/data/csv/
data_path = cwd / 'data' / 'csv' / 'polarity_dataset.csv'
data_path

In [None]:
# Load the CSV file. The first column is read as the index and then dropped
# by reset_index(drop=True), leaving a fresh 0..n-1 index.
file_src = pd.read_csv(str(data_path), index_col=0).reset_index(drop=True)
file_src.head(1)

In [None]:
# Read the CSV file
# Alternative: read the CSV directly from its GitHub URL
# NOTE: the `?token=...` query strings (temporary GitHub access tokens) were removed from
# the URLs below; access tokens should not be committed to a notebook. If the repository
# is private, supply a fresh token at run time instead.

# 1. Raw, unprocessed data
#file_src = pd.read_csv('https://raw.githubusercontent.com/eccmyang/Tutorials-for-AIIS/main/data/csv/polarity_dataset.csv', index_col=0).reset_index(drop=True)
# 2. Preprocessed data
#file_src = pd.read_csv('https://raw.githubusercontent.com/eccmyang/Tutorials-for-AIIS/main/data/csv/preprocessed_polarity_dataset.csv', index_col=0).reset_index(drop=True)


#file_src.head(1)

In [None]:
# Check the size (rows, columns) of the loaded data
file_src.shape

### 文字前處理

In [None]:
# Contraction lookup table: contracted form -> expanded form
mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
# Text preprocessing function

# Compiled once instead of on every call.
# NOTE(review): the last range (U+24C2..U+1F251) is far wider than "emoji" -- it also
# covers CJK, Hangul and many other scripts, so such text is deleted as well. That is
# harmless for an English corpus; narrow it before reusing this on multilingual text.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags
                            u"\U00002702-\U000027B0"  # misc symbols that may leave blank glyphs
                            u"\U000024C2-\U0001F251"  # misc symbols that may leave blank glyphs
                            "]+", flags=re.UNICODE)

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, lemmatize=True):
    """Normalize one raw review string.

    Steps, in order: strip HTML markup, expand contractions found in the
    module-level ``mapping`` table, delete emoji/pictographs, delete URLs,
    insert a space after any period glued to the following text, delete
    ASCII punctuation, and lowercase the result.

    Args:
        text: Raw review text (may contain HTML markup).
        lemmatize: Kept for backward compatibility. It currently has no
            effect because the lemmatization branch below is commented out.

    Returns:
        The cleaned, lowercased text.
    """
    # Strip HTML tags
    text = BeautifulSoup(text, "lxml").get_text()

    # Expand contractions. Each space-separated token is looked up as written first
    # and in lowercase second, so capitalized forms such as "Don't" are expanded too
    # (the whole text is lowercased at the end anyway).
    text = ' '.join(mapping.get(t, mapping.get(t.lower(), t)) for t in text.split(" "))

    text = _EMOJI_PATTERN.sub('', text)

    # Remove URLs BEFORE spacing out periods. In the opposite order
    # "http://www.example.com" first becomes "http://www. example. com" and only the
    # "http://www." head gets removed, leaving "example com" behind.
    # Bare "www." URLs are removed as well.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)

    # Add a space after a period that is directly followed by text, so sentences separate
    text = re.sub(r'\.(?=\S)', '. ', text)

    # Drop punctuation, brackets, etc. and lowercase everything
    text = text.translate(_PUNCT_TABLE).lower()

    # Disabled lemmatization / stop-word branch, kept for reference:
    #tokens = re.split('\W+', text) #create tokens
    #if lemmatize:
    #    text = " ".join([wl.lemmatize(word) for word in text.split() if word not in stop and word.isalpha()]) #lemmatize
    #else:
    #    text = " ".join([word for word in text.split() if word not in stop and word.isalpha()])
    return text

In [None]:
# Make a real copy of the loaded data for preprocessing. A plain assignment would
# only create a second name for `file_src`, so overwriting the review column
# would also overwrite the raw data.
cpfile = file_src.copy()
cpfile.shape

In [None]:
# Apply the preprocessing function to the review column of the DataFrame via `.apply()`
cpfile['review'] = cpfile['review'].apply(clean_text, lemmatize=True)

# Turn `\n` line breaks into spaces. Deleting them outright would glue the last
# word of one line to the first word of the next.
cpfile['review'] = cpfile['review'].replace('\n', ' ', regex=True)

# Remove leftover URL fragments (`http`/`https`, `www`, `com`) only when they stand
# alone as whole words. A plain substring removal would also damage ordinary words,
# e.g. "comedy" -> "edy", "become" -> "be", "welcome" -> "wele".
cpfile['review'] = cpfile['review'].replace(r'\b(?:https?|www|com)\b', '', regex=True)

In [None]:
# Preview the first row of `cpfile`
cpfile.head(1)

In [None]:
# After preprocessing, keep a separate copy of the preprocessed data
# (`.copy()` makes it independent of `cpfile`; plain assignment would only alias it)
preprocessed = cpfile.copy()
preprocessed.head(1)

In [None]:
# Optionally save the preprocessed data as a CSV file so the preprocessing code does not have to be run again
#preprocessed.to_csv(str(cwd / 'data' / 'csv' / 'preprocessed_polarity_dataset.csv'))

### 文字雲 (WordCloud) 與 詞頻 (Frequencies)

In [None]:
# Stop words have to be removed before looking for keywords,
# otherwise the search keeps surfacing stop words.
en_stopw = set(stopwords.words("english"))


# Collect the non-stop-word tokens of one review
def get_words(review, words, stopw=en_stopw):
    """Tokenize *review* and append its non-stop-word tokens to *words*.

    Args:
        review: Text of a single review.
        words: List that is extended in place with the kept tokens.
        stopw: Collection of tokens to skip (defaults to the English stop words).

    Returns:
        None; the result is accumulated in *words*.
    """
    words.extend(token for token in wt(review) if token not in stopw)

In [None]:
# Reviews with positive sentiment (rows where sentiment == 1)
pos_rev = preprocessed[preprocessed.sentiment == 1]

pos_rev.head(1)

In [None]:
# Size (rows, columns) of the positive subset
pos_rev.shape

In [None]:
# List that accumulates the words of the positive reviews
pos_words = []

# get_words appends each review's non-stop-word tokens to pos_words in place
pos_rev.review.apply(get_words, args=(pos_words,))

In [None]:
# Reviews with negative sentiment (rows where sentiment == 0)
neg_rev = preprocessed[preprocessed.sentiment == 0]

neg_rev.head(1)

In [None]:
# Size (rows, columns) of the negative subset
neg_rev.shape

In [None]:
# List that accumulates the words of the negative reviews
neg_words = []

# get_words appends each review's non-stop-word tokens to neg_words in place
neg_rev.review.apply(get_words, args=(neg_words,))

In [None]:
# Word-cloud display function
def word_cloud(words):
    """Draw a word cloud for *words* and return the generated WordCloud.

    Args:
        words: Iterable of word strings; they are joined with single spaces
            and passed to ``WordCloud.generate``.

    Returns:
        The generated ``WordCloud`` object, so that callers which store the
        result (e.g. ``pos_words_wordcloud = word_cloud(pos_words)``) get the
        cloud instead of ``None``.
    """
    cloud = WordCloud(width=1920, height=1080).generate(' '.join(words))

    # Black figure background, no axes, no padding around the image
    plt.figure(figsize=(8, 6), facecolor='k')
    plt.imshow(cloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
    return cloud

In [None]:
# Word cloud of the words from positive reviews
pos_words_wordcloud = word_cloud(pos_words)

In [None]:
# Word cloud of the words from negative reviews
neg_words_wordcloud = word_cloud(neg_words)

In [None]:
# Word frequencies in positive reviews
pos = cnt(pos_words)

# Word frequencies in negative reviews
neg = cnt(neg_words)

In [None]:
# The 20 most frequent words in positive reviews
pos.most_common(20)

In [None]:
# The 20 most frequent words in negative reviews
neg.most_common(20)

In [None]:
# For the 250 most frequent words of the positive reviews, look up each word's count in
# the negative reviews and print the words whose two counts differ by more than 50% of
# the positive count (in either direction). Output columns: word, positive count, negative count.
for word, count in pos.most_common(250):
    negc = neg[word]
    relative_gap = abs(count - negc) / count
    if relative_gap <= 0.50:
        continue
    print(word, count, negc)

In [None]:
# For the 250 most frequent words of the negative reviews, look up each word's count in
# the positive reviews and print the words whose two counts differ by more than 50% of
# the negative count (in either direction). Output columns: word, negative count, positive count.
for word, count in neg.most_common(250):
    posc = pos[word]
    relative_gap = abs(count - posc) / count
    if relative_gap <= 0.50:
        continue
    print(word, count, posc)

### 利用單純貝氏分類器 (Naive Bayes Classifier) 搜尋關鍵詞
本範例使用的單純貝氏分類器是來自於NLTK (Natural Language Toolkit)，其詳細原始碼與計算公式可參閱下列網址之套件官方文檔：

[https://www.nltk.org/_modules/nltk/classify/naivebayes.html](https://www.nltk.org/_modules/nltk/classify/naivebayes.html)

- 重點計算方式摘錄
```
"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                         P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
```

In [None]:
# Combined word counts: adding the two Counters sums the per-word counts of both classes
tot_words = pos + neg

In [None]:
# The 20 most frequent words across all reviews
tot_words.most_common(20)

In [None]:
# Keep the words (without their counts) of the 1000 most frequent entries
top1k = [word for word, _ in tot_words.most_common(1000)]

In [None]:
# Turn a review into presence/absence features over the top-k keywords
def featurize(review, topk=top1k, stopw=en_stopw):
    """Build the feature dict of one review for the Naive Bayes classifier.

    Args:
        review: Text of a single review.
        topk: Keywords to test for. This argument is now honoured; previously
            the loop always read the global ``top1k`` and ignored it.
        stopw: Collection of tokens to ignore (defaults to the English stop words).

    Returns:
        A dict mapping ``'contains(<word>)'`` to ``True``/``False`` for every
        word in *topk*, telling whether the word occurs in the review.
    """
    # A set makes each of the len(topk) membership tests O(1) instead of scanning a list
    present = {token for token in wt(review) if token not in stopw}
    return {'contains({})'.format(word): (word in present) for word in topk}

In [None]:
# Build the classifier input: one (feature dict, sentiment label) pair per review
train = [
    (featurize(review_text), label)
    for review_text, label in zip(preprocessed['review'], preprocessed['sentiment'])
]

In [None]:
# Train the Naive Bayes classifier on the (features, label) pairs
nbclassifier = nltk.NaiveBayesClassifier.train(train)

In [None]:
# Show the classifier's 100 most informative features
# 0: negative sentiment, 1: positive sentiment
nbclassifier.show_most_informative_features(100)

### KeyBERT

`KeyBERT` 是一個利用預訓練模型來尋找關鍵詞的方法。其特點是可利用 BERT 模型或其他 Transformer 架構的模型來尋找關鍵詞，
而關鍵詞的排序則是依據 `Cosine Similarity (餘弦相似度)` 的計算結果。

詳細可參閱官方網站暨官方文檔：
[https://maartengr.github.io/KeyBERT/guides/quickstart.html](https://maartengr.github.io/KeyBERT/guides/quickstart.html)

附圖為 `KeyBERT` 方法架構圖：

![KeyBERT Architecture](https://i.imgur.com/2G3v6jT.png)

In [None]:
# Choose and load the pretrained model.
# Pretrained models can be taken from the HuggingFace hub or from the models listed in
# the Sentence-Transformers (SBERT) documentation.
# This example uses the distilbert-base-uncased model.
# No device is forced here: hard-coding device="cuda" fails on machines without a GPU.
# Without the argument, sentence-transformers uses a GPU when one is available and
# falls back to the CPU otherwise.
sentence_model = SentenceTransformer("distilbert-base-uncased")

memo_ = preprocessed
kw_model = KeyBERT(sentence_model)  # Instantiate KeyBERT model
n_keywords = 100 # Specify number of keywords to extract
ngram = 1  # Specify ngram of keywords

# Apply the KeyBERT extraction function to every entry of the 'review' column
memo_keywords_df = memo_['review'].apply(lambda x:
                                       kw_model.extract_keywords(x,
                                                                 keyphrase_ngram_range=(1, ngram),
                                                                 stop_words='english',
                                                                 highlight=False,
                                                                 top_n=n_keywords))
# Display results
for i, memo_keywords in enumerate(memo_keywords_df):
    print("-"*40 + "\nmemo_ #{}: top {} keywords (ngram range 1-{})".format(i, n_keywords, ngram))
    for keyword in memo_keywords:
        print(keyword)