## Library

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import datetime as dt
import re, string
from wordcloud import WordCloud
import pycantonese
import jieba, emoji
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.sklearn
from nltk.corpus import stopwords
from zhon.hanzi import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim import corpora, models, similarities
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import datapath

In [None]:
# Timestamp of the current run as a 14-digit string: YYYYMMDDHHMMSS
dt_now = dt.datetime.now().strftime("%Y%m%d%H%M%S")

message, from, created_time, shares, comments.limit(0).summary(1), reactions.limit(0).summary(1), reactions.type(LIKE).limit(0).summary(1).as(like), reactions.type(LOVE).limit(0).summary(1).as(love), reactions.type(HAHA).limit(0).summary(1).as(haha), reactions.type(WOW).limit(0).summary(1).as(wow), reactions.type(SAD).limit(0).summary(1).as(sad), reactions.type(ANGRY).limit(0).summary(1).as(angry)

In [None]:
# import json # import csv # from nltk.corpus import stopwords

# # Drop not-using columns
# not_using_col = ["id", "parent_id", "level", "object_id", "query_status", "query_time", "query_type",
#                 "paging.cursors.before", "paging.cursors.after", "paging.next", "reactions.summary.viewer_reaction",
#                 "sad.summary.viewer_reaction", "angry.data", "angry.summary.viewer_reaction", "updated_time", "error.message",
#                 "reactions.data", "id.1", "like.summary.total_count", "like.summary.viewer_reaction", "sad.data",
#                 "error.type", "error.code", "error.fbtrace_id", "from.name", "from.id", "like.data", "object_key", 
#                 "object_type", "reactions.summary.total_count", "sad.summary.total_count", "angry.summary.total_count"]
# df.drop(columns = not_using_col, inplace=True)

## 1. Clean df

In [None]:
#*********************************
Bank_name = 'ZA'
#*********************************

Filename = f'{Bank_name}_Facebook_Bank.csv'
New_Filename = f'Final_{Bank_name}_Facebook.csv'

df = pd.read_csv(f'Collected_data/FB_Weibo_Bank/{Filename}', header=0, encoding ="utf-8-sig")

In [None]:
# Drop non-data and NaN messages
drop_index = df[df.object_type != "data"].index
df.drop(axis = 0, index = drop_index, inplace = True)

drop_index = df[df.message.isna()].index
df.drop(axis = 0, index = drop_index, inplace = True)

In [None]:
def na_to_zero(column):
    """Return *column* as integers with missing values replaced by 0.

    Parameters
    ----------
    column : pandas.Series
        Numbers or numeric strings (e.g. ``3``, ``3.0``, ``"3"``, ``"3.0"``),
        possibly containing NaN.

    Returns
    -------
    pandas.Series
        Same index as *column*, dtype ``int64``. Fractional values are
        truncated towards zero, as ``int()`` would do.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number.
    """
    # to_numeric parses strings such as "5.0", on which a plain int() raises
    numeric = pd.to_numeric(column, errors="raise")
    return numeric.fillna(0).astype("int64")

In [None]:
# Count columns: fill missing values with zero and cast each one to int
col_to_transform = ["shares.count", "comment_count", "like_count", "comments.summary.total_count",
                   "reactions.summary.total_count", "sad.summary.total_count", "angry.summary.total_count"]

for count_col in col_to_transform:
    df[count_col] = na_to_zero(df[count_col])

In [None]:
# Categorization of reactions
df['pos_reaction'] = (df['reactions.summary.total_count'] + df['like_count']
                      - df['sad.summary.total_count'] - df['angry.summary.total_count'])
df['neg_reaction'] = df['sad.summary.total_count'] + df['angry.summary.total_count']
df['comments'] = df['comment_count'] + df['comments.summary.total_count']

df = df.rename(columns = {'shares.count' : 'shares', 'from.name' : 'author'})

In [None]:
# Datetime
df["created_time"] = pd.to_datetime(df["created_time"])
df["date"] = df["created_time"].apply(lambda x: x.date())
df["date"] = pd.to_datetime(df["date"])

In [None]:
df.head(2)

In [None]:
# Add bank name
df["bank"] = Bank_name

In [None]:
df = df.reindex(columns = ['bank','level','id','parent_id','date','author','created_time','message',
                           'pos_reaction','neg_reaction','shares','comments'])

# Drop duplicate rows
df.drop_duplicates(inplace=True)

# Reset index
df.reset_index(drop=True, inplace=True)

In [None]:
df.head(3)

In [None]:
# Double-checking
display(df[df["neg_reaction"] < 0])
display(df[df["pos_reaction"] < 0])

In [None]:
df.info()

In [None]:
# df.to_csv(f'Collected_data/FB_Weibo_Bank/{New_Filename}', encoding='utf-8-sig', index=False)

## 2. Concat df

In [None]:
# Load the per-bank Final_<BANK>_Facebook.csv files; the order of the codes
# determines which frame ends up in df_1 ... df_15
bank_codes = ['ZA', 'AIR', 'BEA', 'CCB', 'CITI', 'CNCBI', 'DBS', 'HSBC',
              'ICBC', 'LIVI', 'MOX', 'SC', 'WELAB', 'WH', 'DS']

(df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8,
 df_9, df_10, df_11, df_12, df_13, df_14, df_15) = [
    pd.read_csv(f'Collected_data/FB_Weibo_Bank/Final_{code}_Facebook.csv', header=0, encoding='utf-8-sig')
    for code in bank_codes
]

In [None]:
# Concat all df
frames = [df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_12, df_13, df_14, df_15]  
df_master = pd.concat(frames, ignore_index=True)

In [None]:
display(df_master)

In [None]:
df_master.info()

In [None]:
df_master.groupby(by=['bank'])["level"].value_counts()

In [None]:
stat = df_master["bank"].value_counts()
type(stat)

In [None]:
# stat.to_csv('Collected_data/FB_Weibo_Bank/Stat2_ALL_Facebook.csv')

In [None]:
# df_master.to_csv('Collected_data/FB_Weibo_Bank/Final_ALL_Facebook.csv', encoding='utf-8-sig', index=False)

## 3. Word Cloud

In [None]:
#*********************************
bank_abbev = "BEA"
df = pd.read_csv(f'Collected_data/FB_Weibo_Bank/FB_{bank_abbev}_2022_v2.csv', encoding ="utf-8-sig")
#*********************************

# Timestamp of the current run as a 14-digit string: YYYYMMDDHHMMSS
dt_now = dt.datetime.now().strftime("%Y%m%d%H%M%S")

In [None]:
# Remove URLs (anything starting with "http" up to the next whitespace).
# Missing messages are turned into empty strings first: NaN is a float and
# would make a per-row re.sub raise TypeError.
df_noURL = df.message.fillna('').astype(str).str.replace(r'http\S+', '', regex=True)

In [None]:
jieba.set_dictionary('dict.txt.big.txt')

In [None]:
# Word tokenization

# Combine all rows into one text for easier processing. A newline separates
# the posts so the last word of one post cannot fuse with the first word of
# the next; whitespace-only tokens are discarded by the length filter below.
post_ALL = '\n'.join(df_noURL)

# Keep word tokens longer than one character (after stripping whitespace)
post_ALL_token = jieba.lcut(post_ALL, cut_all=True)
post_ALL_token_2 = [word for word in post_ALL_token if len(word.strip())>1]
word_list = " ".join(post_ALL_token_2)

In [None]:
# Stopwords
# Cantonese
stop_words = pycantonese.stop_words()

# TC Stopwords
tc_stopwords = pd.read_csv("tc_stopwords.txt", header=None)
tc_stopwords = set(tc_stopwords[0])

# Eng Stopwords
eng_stopwords = stopwords.words('english')

# Additional stopwords
add_stopwd = set(["HKD", "ZA", "ZABank", "ZA Bank", "Bank", "BEA", "HSBC", "ICBC", "DBS", "SC", "HK", "Hong", "Kong", "com",
                  "香港", "眾安", "星展", "渣打", "東亞", "東亞銀行", "匯豐", "工銀", "有限公司", "銀行", "DBSHK"])

# Combine stopwords
tc_stopwords_union = tc_stopwords.union(stop_words, add_stopwd, eng_stopwords)

In [None]:
wordcloud = WordCloud(width = 500, height = 500,
                      font_path="simsun.ttf",
                      background_color ='white',
                      stopwords = tc_stopwords_union,
                      collocations=False,
                      min_font_size = 15).generate(word_list)

# plot the WordCloud image                      
plt.figure(figsize = (5, 5), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.savefig(f'Bank_Wordcloud/{bank_abbev}_{dt_now}.png')
plt.show()

In [None]:
# Circular mask built with numpy: a 300x300 grid that is 255 outside a circle
# of radius 130 centred at (150, 150) and 0 inside it
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

# NOTE: per the wordcloud documentation, width/height are ignored when a mask
# is given; the mask's shape is used instead.
# The text is `word_list`, the space-joined tokens built in the tokenization
# cell (the previously referenced `mytext_ALL_list` is not defined anywhere).
wordcloud = WordCloud(width = 800, height = 800,
                      font_path="simsun.ttf",
                      background_color ='white',
                      stopwords = tc_stopwords_union,
                      mask = mask,
                      min_font_size = 10).generate(word_list)

# Fresh 14-digit timestamp (YYYYMMDDHHMMSS) for the output filename
dt_now = dt.datetime.now().strftime("%Y%m%d%H%M%S")

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

# Name the file after the bank actually loaded instead of a hard-coded "ZA"
plt.savefig(f'Bank_Wordcloud/{bank_abbev}_{dt_now}.png')
plt.show()

## 4. Sklearn LDA (Topic Modelling)

### 4.1 Sample

In [None]:
df = pd.read_csv("Test_LDA_TopicModelling/datascience.csv", encoding='gb18030')

In [None]:
df.head(3)

<b>Word Tokenization using Jieba</b>

In [None]:
def chinese_word_cut(mytext):
    """Segment *mytext* with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(mytext)
    return " ".join(tokens)

df["content_cutted"] = df.content.apply(chinese_word_cut)
df.content_cutted.head()

<b>Word Vectorization using term counts (CountVectorizer)</b>

In [None]:
# Avoid extracting ALL keywords from the text
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                # Ignore terms that have a document frequency > or < than the threshold
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df.content_cutted)

<b>Apply LDA to train the model</b>

In [None]:
# Need to self define no. of topics
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

In [None]:
# Print the top words of each topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in *model*.

    model: object exposing ``components_``, one row of term weights per topic.
    feature_names: sequence mapping a term index to its text.
    n_top_words: number of terms printed per topic.

    For each topic a "Topic #<idx>:" line is printed, followed by a line with
    the terms in descending weight order; one blank line ends the listing.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

In [None]:
# Set how many top words to get for each topic
n_top_words = 20

tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

### 4.2 Facebook posts

In [None]:
bank_abbev = "ALL"
df = pd.read_csv(f'Collected_data/FB_Weibo_Bank/FBClean_{bank_abbev}_v2.csv', encoding='utf-8-sig', header=0)

In [None]:
df.head(3)

<b>Cantonese Stopwords</b>

In [None]:
# Stopwords
# Cantonese
stop_words = pycantonese.stop_words()

# TC Stopwords
tc_stopwords = pd.read_csv("tc_stopwords.txt", header=None)
tc_stopwords = set(tc_stopwords[0])

# Eng Stopwords
eng_stopwords = stopwords.words('english')

# Additional stopwords
add_stopwd = set(["HKD", "ZA", "ZABank", "ZA Bank", "Bank", "BEA", "HSBC", "ICBC", "DBS", "SC", "HK", "Hong", "Kong", "com", "bit", "亞銀", "閣下",
                  "香港", "眾安", "星展", "渣打", "東亞", "東亞銀行", "匯豐", "工銀", "有限公司", "銀行", "DBSHK", "html", "conditions", "terms", "ly",
                  'bank', 'bea', 'dbs', 'dbshk', 'hk', 'hkd', 'hong', 'hsbc', 'icbc', 'kong', 'sc', 'za', 'zabank', "condition", "term", '若果',
                  "grp"])

# Combine stopwords
tc_stopwords_union = tc_stopwords.union(stop_words, add_stopwd, eng_stopwords)

<b>Word Tokenization using Jieba</b>

In [None]:
jieba.set_dictionary('dict.txt.big.txt')

In [None]:
def chinese_word_cut(mytext):
    """Tokenize one message and return the cleaned tokens as a single string.

    Steps: jieba full-mode segmentation (``cut_all=True``); drop tokens that
    are at most one character long after stripping, or that consist only of
    digits; join with spaces; strip English and Chinese punctuation; strip
    emojis.

    Parameters
    ----------
    mytext : str
        Raw message text. Any non-string value (e.g. NaN for a missing
        message) yields an empty string instead of raising.

    Returns
    -------
    str
        Space-separated tokens.
    """
    # Missing messages are read by pandas as float NaN; nothing to tokenize
    if not isinstance(mytext, str):
        return ""

    # Jieba - Word tokenization
    tokens = [
        word for word in jieba.lcut(mytext, cut_all=True)
        if len(word.strip()) > 1 and not word.isdigit()
    ]
    word_list = " ".join(tokens)

    # Remove punctuation marks: English (string.punctuation) and Chinese
    # (zhon.hanzi.punctuation) in one translation pass
    word_list = word_list.translate(str.maketrans('', '', string.punctuation + punctuation))

    # Remove emojis. emoji.get_emoji_regexp() was removed in emoji 2.0;
    # replace_emoji is its replacement. The old call is kept as a fallback
    # for installations that predate replace_emoji.
    if hasattr(emoji, "replace_emoji"):
        word_list = emoji.replace_emoji(word_list, replace='')
    else:
        word_list = emoji.get_emoji_regexp().sub(u'', word_list)
    return word_list

df["content_cutted"] = df.message.apply(chinese_word_cut)
df.content_cutted.head()

<b>Word Vectorization using term counts (CountVectorizer)</b>

In [None]:
# Cap the vocabulary at the n_features most frequent terms instead of keeping every term
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                # scikit-learn documents stop_words as 'english', a list or None,
                                # and recent releases validate that; the union is a set, so convert it
                                stop_words=list(tc_stopwords_union),
                                # Ignore terms found in more than 50% of the documents
                                # or in fewer than 10 documents
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df.content_cutted)

<b>Apply LDA to train the model</b>

In [None]:
# Need to self define no. of topics
n_topics = 7
lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

In [None]:
# Print the top words of each topic
def print_top_words(model, feature_names, n_top_words):
    """Print a header naming the bank, then the top terms of every topic.

    model: object exposing ``components_``, one row of term weights per topic.
    feature_names: sequence mapping a term index to its text.
    n_top_words: number of terms printed per topic.

    The header uses the module-level ``bank_abbev``. Each topic prints a
    "Topic #<idx>:" line followed by its terms in descending weight order;
    one blank line ends the listing.
    """
    print(bank_abbev, "Bank Topics:\n")
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

In [None]:
# Set how many top words to get for each topic
n_top_words = 20

tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:
lda.components_[1]

In [None]:
# index
dt_now = str(dt.datetime.now())
dt_now = dt_now.replace("-","").replace(":","").replace(" ","")
dt_now = dt_now[0:14]

pyLDAvis.enable_notebook()
p = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(p, f'Test_LDA_TopicModelling/LDA_{bank_abbev}_{dt_now}.html')

## 5. Gensim LDA (Topic Modelling)

<b>Assign topics to ALL Facebook posts and comments</b>

In [None]:
# Load the combined ("ALL") Facebook file.
# NOTE(review): the commented-out export in section 2 writes this file to
# 'Collected_data/FB_Weibo_Bank/Final_ALL_Facebook.csv', while this cell reads
# from 'Collected_data/' - confirm which location is intended.
bank_abbev = "ALL"
df = pd.read_csv(f'Collected_data/Final_{bank_abbev}_Facebook.csv', encoding='utf-8-sig', header=0)

In [None]:
display(df.head(3))

# Working

In [131]:
# Read the combined stop-word file: one entry per line, surrounding whitespace stripped
with open("tc_stopwords_union.txt", 'r', encoding='utf-8-sig') as f:
    stopword_list = [line.strip() for line in f]

In [132]:
stopword_list[0:10]

['0', '大家', '來自', '乜嘢', '譬喻', 'because', '所以', '連同', '嘍', '成日']

In [133]:
# Tokens always discarded, on top of the entries of stopword_list
_EXTRA_STOPLIST = frozenset(['\n', ' ', '\xa0', '─', '', 'https', 'http'])


def _filter(word):
    """Return True if *word* should be kept as a token, False otherwise.

    A token is rejected when it is at most one character long, consists only
    of digits, or appears in ``_EXTRA_STOPLIST`` or in the module-level
    ``stopword_list``.
    """
    # Cheap structural checks first. `not` replaces the former `~` on a bool,
    # which only produced the right truthiness by accident when combined
    # with `&` and is deprecated since Python 3.12.
    if len(word) <= 1 or word.isdigit():
        return False
    # Two membership tests instead of concatenating a new list on every call
    return word not in _EXTRA_STOPLIST and word not in stopword_list

def chinese_word_cut(mytext):
    """Segment *mytext* with jieba and return the list of tokens accepted by ``_filter``."""
    return [token for token in jieba.cut(mytext) if _filter(token)]

[chinese_word_cut(x) for x in df.message[0:5]]


# df["content_cutted"] = df.message.apply(chinese_word_cut)
# df.content_cutted.head()

[['好友',
  '齊齊',
  '美容',
  '按摩',
  '項目',
  '買一送一',
  '所謂',
  '休息',
  '更遠',
  '工作',
  '適時',
  '放鬆',
  '一下',
  'Card',
  '特選',
  '商戶',
  '水療',
  '按摩',
  '美容',
  '買一送一',
  '朋友',
  '一齊',
  'relax',
  '身心',
  '重新',
  '注入',
  '能量',
  '出發',
  'Miris',
  'Spa',
  '岩石',
  '按摩',
  '專門店',
  '岩鹽',
  '熱石',
  '身體',
  '排毒',
  '抗疲',
  '適合',
  '孕婦',
  '舒緩',
  '按摩',
  '護理',
  '為身',
  '得到',
  '全面',
  '放鬆',
  'Spa',
  'Sanctuary',
  '提供',
  '多款',
  '按摩',
  '面部',
  '療程',
  '全面',
  '提升',
  '能量',
  '精神飽滿',
  'OUT',
  'OF',
  'COLOURS',
  'Beauty',
  '要望',
  '落夠',
  '點少',
  '一枝',
  '唇膏',
  '亞洲',
  '首創',
  '唇膏',
  '體驗',
  '造型',
  '獨一無二',
  '唇色',
  '仲有',
  '特選',
  '商戶',
  '美容',
  '按摩',
  '項目',
  '買一送一',
  '詳情',
  '3SNzVUe',
  '另一半',
  'spa',
  '屋企',
  '閨密',
  '扮靚靚',
  'ZACard',
  '-------',
  '即日',
  'Card',
  '特選',
  '商戶',
  '購買',
  '指定',
  '美容',
  '項目',
  '即享',
  '買一送一',
  '優惠',
  '優惠',
  '特選',
  '商戶',
  '提供',
  '使用',
  '條件',
  '包括',
  '限於',
  '美容',
  '項目',
  '種類',
  '價錢',
  '優惠',
  '享用',
  '時間',
  '使用',
  '方式

In [134]:
document = [chinese_word_cut(x) for x in df.message]
# document = [df.message.apply(lambda x: chinese_word_cut(x))]
print(type(document), len(document))

<class 'list'> 33100


In [135]:
dictionary = Dictionary(document)

In [136]:
# Token frequency
# dictionary.dfs

詞袋(BOW, bag of word)，轉換成詞袋之後的好處是方便進行LDA模型訓練，但詞袋模型的缺點是不考慮詞跟詞的連接的順序，因而大幅簡化了原文本。

In [137]:
corpus = [dictionary.doc2bow(text) for text in document]

In [148]:
# Train the gensim LDA model on the bag-of-words corpus.
# LdaModel is reached through `gensim.models`, which is imported at the top
# of the notebook; the bare name `LdaModel` is only imported in a later cell,
# so a top-to-bottom run would fail here with NameError.
lda = models.LdaModel(corpus,
                      id2word=dictionary,
                      num_topics=6,
                      random_state=100,
                      update_every=1,
                      chunksize=100,
                      passes=5,
                      alpha='auto',
                      per_word_topics=True)

In [149]:
lda.print_topics()

[(0,
  '0.256*"答案" + 0.228*"知多少" + 0.025*"Mox" + 0.019*"得獎者" + 0.010*"參與" + 0.009*"Apple" + 0.009*"智郵站" + 0.009*"郵政局" + 0.008*"得獎" + 0.008*"領獎"'),
 (1,
  '0.034*"優惠" + 0.032*"現金" + 0.022*"貸款" + 0.022*"獎賞" + 0.020*"高達" + 0.020*"消費" + 0.019*"申請" + 0.018*"回贈" + 0.017*"專人" + 0.015*"成功"'),
 (2,
  '0.070*"理財" + 0.044*"有獎" + 0.043*"日常" + 0.043*"輕鬆" + 0.018*"服務" + 0.013*"客戶" + 0.010*"目標" + 0.010*"儲蓄" + 0.009*"外幣" + 0.008*"建議"'),
 (3,
  '0.060*"banking" + 0.029*"Wong" + 0.022*"Lam" + 0.019*"健康" + 0.011*"Ng" + 0.010*"Wai" + 0.010*"推薦" + 0.008*"中小" + 0.007*"YNWA" + 0.006*"Ming"'),
 (4,
  '0.053*"Chan" + 0.019*"意食" + 0.017*"Lee" + 0.016*"Cheung" + 0.013*"Chow" + 0.012*"Wing" + 0.012*"Leung" + 0.011*"Yan" + 0.010*"沙律" + 0.010*"Lau"'),
 (5,
  '0.094*"問答" + 0.073*"投資" + 0.064*"Banking" + 0.038*"風險" + 0.019*"外匯" + 0.015*"證券" + 0.014*"涉及" + 0.012*"買賣" + 0.011*"聲明" + 0.009*"資金"')]

In [150]:
# Timestamp of the current run as a 14-digit string: YYYYMMDDHHMMSS
dt_now = dt.datetime.now().strftime("%Y%m%d%H%M%S")

In [151]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary, mds='mmds')
p

  default_term_info = default_term_info.sort_values(


In [152]:
pyLDAvis.save_html(p, f'Test_LDA_TopicModelling/LDA_{bank_abbev}_{dt_now}.html')

<b>Cantonese Stopwords</b>

In [None]:
# Stopwords
# Cantonese
stop_words = pycantonese.stop_words()

# TC Stopwords
tc_stopwords = pd.read_csv("tc_stopwords.txt", header=None)
tc_stopwords = set(tc_stopwords[0])

# Eng Stopwords
eng_stopwords = stopwords.words('english')

# Additional stopwords
add_stopwd = set(["HKD", "ZA", "ZABank", "ZA Bank", "Bank", "BEA", "HSBC", "ICBC", "DBS", "SC", "HK", "Hong", "Kong", "com", "bit", "亞銀", "閣下",
                  "香港", "眾安", "星展", "渣打", "東亞", "東亞銀行", "匯豐", "工銀", "有限公司", "銀行", "DBSHK", "html", "conditions", "terms", "ly",
                  'bank', 'bea', 'dbs', 'dbshk', 'hk', 'hkd', 'hong', 'hsbc', 'icbc', 'kong', 'sc', 'za', 'zabank', "condition", "term", '若果',
                  "grp"])

# Combine stopwords
tc_stopwords_union = tc_stopwords.union(stop_words, add_stopwd, eng_stopwords)

In [None]:
# Write one stop word per line. header=False matters: by default pandas writes
# the auto-generated column label "0" as the first line, and the line-by-line
# reader of this file then loads "0" as if it were a stop word.
pd.DataFrame(tc_stopwords_union).to_csv('tc_stopwords_union.txt', index=False, header=False)

<b>Word Tokenization using Jieba</b>

In [None]:
jieba.set_dictionary('dict.txt.big.txt')

In [None]:
def chinese_word_cut(mytext):
    """Tokenize one message and return the cleaned tokens as a single string.

    Steps: jieba full-mode segmentation (``cut_all=True``); drop tokens that
    are at most one character long after stripping, or that consist only of
    digits; join with spaces; strip English and Chinese punctuation; strip
    emojis.

    Parameters
    ----------
    mytext : str
        Raw message text. Any non-string value (e.g. NaN for a missing
        message) yields an empty string instead of raising.

    Returns
    -------
    str
        Space-separated tokens.
    """
    # Missing messages are read by pandas as float NaN; nothing to tokenize
    if not isinstance(mytext, str):
        return ""

    # Jieba - Word tokenization
    tokens = [
        word for word in jieba.lcut(mytext, cut_all=True)
        if len(word.strip()) > 1 and not word.isdigit()
    ]
    word_list = " ".join(tokens)

    # Remove punctuation marks: English (string.punctuation) and Chinese
    # (zhon.hanzi.punctuation) in one translation pass
    word_list = word_list.translate(str.maketrans('', '', string.punctuation + punctuation))

    # Remove emojis. emoji.get_emoji_regexp() was removed in emoji 2.0;
    # replace_emoji is its replacement. The old call is kept as a fallback
    # for installations that predate replace_emoji.
    if hasattr(emoji, "replace_emoji"):
        word_list = emoji.replace_emoji(word_list, replace='')
    else:
        word_list = emoji.get_emoji_regexp().sub(u'', word_list)
    return word_list

df["content_cutted"] = df.message.apply(chinese_word_cut)
df.content_cutted.head()

<b>Train Gensim LDA model</b>

In [None]:
df.content_cutted[0]

<b>Word Vectorization using term counts (CountVectorizer)</b>

In [None]:
# Cap the vocabulary at the n_features most frequent terms instead of keeping every term
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                # scikit-learn documents stop_words as 'english', a list or None,
                                # and recent releases validate that; the union is a set, so convert it
                                stop_words=list(tc_stopwords_union),
                                # Ignore terms found in more than 50% of the documents
                                # or in fewer than 10 documents
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df.content_cutted)

<b>Apply LDA to train the model</b>

In [None]:
from gensim.matutils import Sparse2Corpus
from gensim.models.ldamodel import LdaModel

# `tf` is the scikit-learn document-term matrix (one row per document), which
# gensim cannot consume directly as a corpus. Sparse2Corpus wraps it as a
# stream of bag-of-words documents; documents_columns=False tells it that the
# documents are the rows.
gensim_corpus = Sparse2Corpus(tf, documents_columns=False)

# Map each column index back to its term so the topics are readable
id2word = dict(enumerate(tf_vectorizer.get_feature_names_out()))

lda = LdaModel(gensim_corpus, num_topics=10, id2word=id2word)

In [None]:
# Need to self define no. of topics
n_topics = 7
lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

In [None]:
# Print the top words of each topic
def print_top_words(model, feature_names, n_top_words):
    """Print a header naming the bank, then the top terms of every topic.

    model: object exposing ``components_``, one row of term weights per topic.
    feature_names: sequence mapping a term index to its text.
    n_top_words: number of terms printed per topic.

    The header uses the module-level ``bank_abbev``. Each topic prints a
    "Topic #<idx>:" line followed by its terms in descending weight order;
    one blank line ends the listing.
    """
    print(bank_abbev, "Bank Topics:\n")
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words
        ranked = weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[idx] for idx in ranked]))
    print()

In [None]:
# Set how many top words to get for each topic
n_top_words = 20

tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:
lda.components_[1]

In [None]:
# index
dt_now = str(dt.datetime.now())
dt_now = dt_now.replace("-","").replace(":","").replace(" ","")
dt_now = dt_now[0:14]

pyLDAvis.enable_notebook()
p = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
p

In [None]:
pyLDAvis.save_html(p, f'Test_LDA_TopicModelling/LDA_{bank_abbev}_{dt_now}.html')

## End