# PACKAGES

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import gensim
import re
import nltk
from tqdm import tqdm

import pyLDAvis
import pyLDAvis.gensim
from gensim import models
from gensim.models.coherencemodel import CoherenceModel

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
import warnings
warnings.filterwarnings("ignore")

# FUNCTION

In [None]:
def tweet_density(tweets_df):
    """Strip the search terms from the tweets, add date parts and plot daily volume.

    The frame is modified in place (columns ``clean_tweet``, ``date``,
    ``year``, ``month`` and ``day`` are written) and is also returned.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Must contain a ``clean_tweet`` text column and a ``Date`` column.
        ``Date`` values that cannot be parsed become ``NaT``
        (``errors='coerce'``).

    Returns
    -------
    pandas.DataFrame
        The same ``tweets_df`` object, after the updates above.
    """
    # Assign the cleaned column back instead of calling
    # ``tweets_df.clean_tweet.fillna(..., inplace=True)``: the in-place call on
    # a column accessor is chained assignment and does not update the frame
    # when pandas Copy-on-Write is active.
    cleaned = tweets_df['clean_tweet'].fillna(' ')
    # Remove the two-word phrase first, then any remaining bare occurrence.
    # ``regex=False`` makes the literal match explicit on every pandas version.
    cleaned = cleaned.str.replace('long covid', ' ', regex=False)
    cleaned = cleaned.str.replace('covid', ' ', regex=False)
    tweets_df['clean_tweet'] = cleaned

    # Convert date to datetime and derive the calendar parts.
    tweets_df['date'] = pd.to_datetime(tweets_df['Date'], errors='coerce')
    tweets_df['year'] = tweets_df['date'].dt.year
    tweets_df['month'] = tweets_df['date'].dt.month
    tweets_df['day'] = tweets_df['date'].dt.day

    # Non-null count of every column per calendar day.
    daily_counts = tweets_df.set_index('date').resample('D').count()
    # DataFrame.plot already draws every column; the extra plt.plot() call the
    # notebook used to make drew the same series a second time on top.
    daily_counts.plot(figsize=(16, 10))
    plt.title('Daily Counts of Tweets')
    plt.show()
    return tweets_df


def vectorize_dic_bag(tweets_df):
    """Tokenise ``clean_tweet`` and build the gensim dictionary and BoW corpus.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Frame with a ``clean_tweet`` column; every value is passed through
        ``str()`` before tokenising.

    Returns
    -------
    tuple
        ``(tweets_tokens, dictionary, bow_corpus)``: the per-tweet token lists
        (a pandas Series), the ``gensim.corpora.Dictionary`` built from them,
        and the list of ``doc2bow`` vectors, one per tweet.
    """
    # str.split() with no argument splits on runs of whitespace and never
    # yields empty strings. The previous re.split('\s', ...) split on every
    # single whitespace character, so consecutive spaces (common once the
    # search terms have been replaced by ' ') produced '' tokens that ended up
    # in the dictionary and corpus. It also relied on an invalid '\s' escape in
    # a non-raw string literal.
    tweets_tokens = tweets_df['clean_tweet'].apply(lambda text: str(text).split())
    dictionary = gensim.corpora.Dictionary(tweets_tokens)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tweets_tokens]
    return tweets_tokens, dictionary, bow_corpus


def u_mass(dictionary, corpus=None):
    """Train one LDA model per topic count in 7..22 and score each with u_mass.

    Parameters
    ----------
    dictionary : gensim.corpora.Dictionary
        Used both as ``id2word`` for the LDA models and as the dictionary of
        the coherence model.
    corpus : list, optional
        Bag-of-words corpus. When omitted, the notebook-level global
        ``bow_corpus`` is used, which is what this function always read before
        the parameter existed.

    Returns
    -------
    tuple
        ``(filtered_dict, coherenceList_umass, num_topics_list, num, lda, cm)``
        where ``filtered_dict`` is ``dictionary`` itself, ``coherenceList_umass``
        holds one score per entry of ``num_topics_list``, ``num`` is the topic
        count with the lowest score (the last one on ties), and ``lda`` / ``cm``
        are the model and coherence model of the final iteration (22 topics),
        not of the selected ``num``.
    """
    if corpus is None:
        # Legacy call style: the corpus lives in a notebook global.
        corpus = bow_corpus
    filtered_dict = dictionary
    coherenceList_umass = []
    num_topics_list = np.arange(7, 23)
    lowest_score = 99999  # sentinel; named so the builtin min() is not shadowed
    num = 0
    for num_topics in tqdm(num_topics_list):
        lda = models.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                                  passes=10, chunksize=4000, workers=None, random_state=0)
        cm = CoherenceModel(model=lda, corpus=corpus,
                            dictionary=filtered_dict, coherence='u_mass')
        # Evaluate once per model instead of up to three times.
        score = cm.get_coherence()
        coherenceList_umass.append(score)
        # NOTE(review): this keeps the topic count with the LOWEST u_mass score.
        # u_mass is commonly read as better the closer it is to zero - confirm
        # that selecting the minimum is intended before relying on `num`.
        if score <= lowest_score:
            lowest_score = score
            num = int(num_topics)
        #viz = pyLDAvis.gensim.prepare(lda, bow_corpus, filtered_dict, mds='tsne')
        #pyLDAvis.save_html(viz,f'pyLDAvis_{num_topics}.html')
    return filtered_dict, coherenceList_umass, num_topics_list, num, lda, cm


def plot_TC(num_topics_list,coherenceList_umass):
    """Draw the coherence score against the number of topics.

    Builds a two-column frame ('Number of topics', 'CoherenceScore'), renders
    it as a seaborn point plot on a 30x10 figure with a red horizontal
    reference line at y = -4.8, and returns the frame.
    """
    columns = {
        'Number of topics': num_topics_list,
        'CoherenceScore': coherenceList_umass,
    }
    plotData = pd.DataFrame(columns)

    # The new figure becomes the current one; the plotting calls below draw
    # on its axes implicitly.
    plt.subplots(figsize=(30, 10))
    sns.set_style("darkgrid")
    sns.set(font_scale=2)
    sns.pointplot(data=plotData, x='Number of topics', y='CoherenceScore')
    plt.axhline(y=-4.8, color='red')
    plt.title('Topic Coherence')
    #plt.savefig('/kaggle/working/topic_coherence.png')
    return plotData


def get_matrix(tweets_df):
    """Build the document-term count matrix for the tweets.

    ``clean_tweet`` is cleaned in place on ``tweets_df`` (missing values become
    ' ', and 'long covid' / 'covid' are replaced by ' ') before it is
    vectorised with English stop words removed and at most 40000 features.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Frame with a ``clean_tweet`` text column.

    Returns
    -------
    tuple
        ``(count_vectorizer, document_term_matrix)``: the fitted
        ``CountVectorizer`` and the matrix returned by its ``fit_transform``.
    """
    # Assign back rather than fillna(inplace=True) on the column accessor: the
    # chained in-place form leaves the frame untouched under pandas
    # Copy-on-Write, and a surviving NaN makes CountVectorizer fail.
    cleaned = tweets_df['clean_tweet'].fillna(' ')
    cleaned = cleaned.str.replace('long covid', ' ', regex=False)
    cleaned = cleaned.str.replace('covid', ' ', regex=False)
    tweets_df['clean_tweet'] = cleaned

    count_vectorizer = CountVectorizer(stop_words='english', max_features=40000)
    tweet_text = tweets_df['clean_tweet'].values
    document_term_matrix = count_vectorizer.fit_transform(tweet_text)
    return count_vectorizer, document_term_matrix


def topic_fitting(num,document_term_matrix):
    """Fit an online LDA with ``num`` components on the document-term matrix.

    Returns ``(num, matrix)`` where ``matrix`` is whatever
    ``LatentDirichletAllocation.fit_transform`` yields for the input. The
    fitted estimator itself is not returned.
    """
    estimator = LatentDirichletAllocation(
        n_components=num,
        learning_method='online',
        batch_size=10000,
        random_state=0,
        learning_decay=0.5,
        verbose=0,
    )
    doc_topic_matrix = estimator.fit_transform(document_term_matrix)
    return num, doc_topic_matrix


def get_keys(topic_matrix):
    '''
    For every row of the topic matrix, pick the column index holding
    the largest value and return those indices as a plain list.
    '''
    best_column_per_row = topic_matrix.argmax(axis=1)
    return best_column_per_row.tolist()


def keys_to_counts(keys):
    '''
    Tally the keys and return (categories, counts): two parallel lists,
    ordered by first appearance of each category in ``keys``.
    '''
    tally = Counter(keys)
    categories = list(tally.keys())
    counts = list(tally.values())
    return (categories, counts)


def get_top_n_words(n, keys, document_term_matrix, count_vectorizer,n_topics):
    '''
    returns a list of n_topics strings, where each string contains the n most
    common words in a predicted category, most frequent first, joined by spaces

    n                    -- number of words to keep per topic
    keys                 -- predicted topic index of every document, in the
                            same order as the rows of document_term_matrix
    document_term_matrix -- row-indexable count matrix (documents x terms),
                            e.g. the output of CountVectorizer.fit_transform
    count_vectorizer     -- fitted vectorizer; its ``vocabulary_`` mapping
                            (term -> column index) is used to name the columns
    n_topics             -- number of topics; fixes the length of the result

    A topic with no assigned document (or n <= 0) yields an empty string.
    Previously such a topic silently reused the word counts of the preceding
    topic, and raised NameError when the very first topic was empty.
    '''
    keys = np.asarray(keys)
    # Invert term -> column once instead of running inverse_transform on a
    # one-hot vector for every single word.
    index_to_word = {index: word
                     for word, index in count_vectorizer.vocabulary_.items()}

    top_words = []
    for topic in range(n_topics):
        doc_rows = np.flatnonzero(keys == topic)
        if n <= 0 or doc_rows.size == 0:
            top_words.append('')
            continue
        # Sum the counts of all documents of this topic in one vectorised
        # step; asarray/ravel flattens the 1 x V result that sparse matrices
        # return.
        term_totals = np.asarray(document_term_matrix[doc_rows].sum(axis=0)).ravel()
        # Descending order first, then cut: unlike `[-n:]`, this also behaves
        # for n == 0 and for n larger than the vocabulary.
        ranked = np.argsort(term_totals)[::-1][:n]
        # Words are used as-is; forcing them through .encode('ascii') raised
        # UnicodeEncodeError on any non-ASCII token.
        top_words.append(" ".join(index_to_word[int(index)] for index in ranked))
    return top_words


def save_topic(lda_keys,document_term_matrix,count_vectorizer,sheet_name,n_topics=None):
    """Print the 15 top words of every topic and return them keyed by name.

    Parameters
    ----------
    lda_keys : list
        Predicted topic index of every document.
    document_term_matrix, count_vectorizer
        Forwarded unchanged to ``get_top_n_words``.
    sheet_name : str
        Key under which the list of topic strings is returned.
    n_topics : int, optional
        Number of topics. The function used to read this from a notebook
        global; when the argument is omitted that global is still honoured if
        it exists, otherwise the count is inferred as ``max(lda_keys) + 1``.

    Returns
    -------
    dict
        ``{sheet_name: [topic_string, ...]}`` with one string per topic.
    """
    if n_topics is None:
        # Legacy call style: pick up the notebook-level `n_topics` if defined.
        n_topics = globals().get('n_topics')
    if n_topics is None:
        n_topics = max(lda_keys) + 1 if len(lda_keys) else 0

    top_n_words_lda = get_top_n_words(15, lda_keys, document_term_matrix, count_vectorizer, n_topics)
    for position, words in enumerate(top_n_words_lda, start=1):
        print("Topic {}: ".format(position), words)
    data = {sheet_name: list(top_n_words_lda)}
    #LDA_topic = pd.DataFrame(data)
    #LDA_topic.to_csv(sheet_name+'.csv')
    return data


def plot_topic(lda_keys,document_term_matrix,count_vectorizer,n_topics,plot_name):
    """Summarise the topics as a CSV table and a bar chart of tweets per topic.

    Parameters
    ----------
    lda_keys : list
        Predicted topic index of every document.
    document_term_matrix, count_vectorizer, n_topics
        Forwarded unchanged to ``get_top_n_words``.
    plot_name : str
        Base name of the CSV that is written (``plot_name + '.csv'``).

    Returns
    -------
    tuple
        ``(top_5_words, labels)``: the per-topic word strings (15 words each,
        despite the historical name) and the x tick labels used in the chart.

    Side effects: writes ``plot_name + '.csv'``, prints the sorted table and
    draws a bar chart on a new figure.
    """
    # Derive the categories and their sizes from the keys themselves. The
    # function used to read `lda_categories` / `lda_counts` from notebook
    # globals, which the caller computed with exactly this call.
    lda_categories, lda_counts = keys_to_counts(lda_keys)

    top_5_words = get_top_n_words(15, lda_keys, document_term_matrix, count_vectorizer, n_topics)
    labels = ['Topic {}: \n'.format(i) + top_5_words[i] for i in lda_categories]
    content = [top_5_words[i] for i in lda_categories]

    LDA_topic = pd.DataFrame({'topic_index': lda_categories,
                              'topic_content': content,
                              'tweets_num': lda_counts})
    sort_topic = LDA_topic.sort_values(by='topic_index', ascending=True).reset_index(drop=True)
    sort_topic.to_csv(plot_name + '.csv')
    print('\n')
    print(sort_topic)
    print('\n')

    # Bars stay in the order in which the topics first appear in lda_keys.
    _, ax = plt.subplots(figsize=(120, 20))
    ax.bar(lda_categories, lda_counts)
    ax.set_xticks(lda_categories)
    ax.set_xticklabels(labels)
    ax.set_title('LDA topic counts')
    ax.set_ylabel('Number of tweets')
    #plt.savefig('/kaggle/working/LDA_topic'+plot_name+'.png')
    return top_5_words, labels

# CONDUCT

In [None]:
%%time

# Driver: run the full topic-modelling pipeline month by month on the yearly
# non-patient tweet files. The patient files follow the same naming scheme
# ('PCS_patient_202<i>_clean.csv'); swap the prefix below to process them.

##########################################
# Input file names, one per year 2020..2023.
filelist = ['PCS_non_patient_202' + str(i) + '_clean.csv' for i in range(4)]
# Loaded frames, index-aligned with filelist. A plain list replaces the former
# trick of injecting variables named after the files through locals().
myVarList = []

# Loop-invariant: switch pyLDAvis to notebook rendering once.
pyLDAvis.enable_notebook()

for i, file_name in enumerate(filelist):
    myVarList.append(pd.read_csv('/kaggle/input/tweets/' + file_name))
    # Resume point: the first two files (2020, 2021) are loaded but skipped.
    if i < 2:
        continue
    yearly_df = myVarList[i]
    for j in range(12):
        month = j + 1
        # Resume point: for the third file (2022) only December is processed.
        if i == 2 and month <= 11:
            continue
        month_rows = yearly_df[yearly_df['month'].isin([month])]
        if month_rows.empty:
            continue

        # Work on an explicit copy: tweet_density() and get_matrix() write
        # columns into the frame they receive.
        tweets_df = month_rows.copy()
        tweets_df = tweet_density(tweets_df)

        # `bow_corpus` must stay a notebook-level name: u_mass() falls back to
        # it when no corpus is passed.
        tweets_tokens, dictionary, bow_corpus = vectorize_dic_bag(tweets_df)

        filtered_dict, coherenceList_umass, num_topics_list, num, lda, cm = u_mass(dictionary)

        plotData = plot_TC(num_topics_list, coherenceList_umass)

        lda_model_bow = gensim.models.LdaMulticore(corpus=bow_corpus, num_topics=num, id2word=dictionary, decay=0.5,
                                                   chunksize=10000, passes=10, workers=None, random_state=0)
        # A bare `lda_viz` expression inside a loop is never rendered by the
        # notebook, so only the prepared object is kept for later inspection.
        lda_viz = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary, mds='tsne')

        count_vectorizer, document_term_matrix = get_matrix(tweets_df)
        n_topics, lda_topic_matrix = topic_fitting(num, document_term_matrix)
        lda_keys = get_keys(lda_topic_matrix)
        lda_categories, lda_counts = keys_to_counts(lda_keys)

        # Output name: topic_np_<year>_<month>, e.g. topic_np_2022_12.
        #LDA_topic=save_topic(lda_keys,document_term_matrix,count_vectorizer,'topic_p_202'+str(i)+'_'+str(j+1))
        top_5_words, labels = plot_topic(lda_keys, document_term_matrix, count_vectorizer, n_topics,
                                         'topic_np_202' + str(i) + '_' + str(month))
##########################################



#--------------------------------------------------------------------------------------------------------------------------------------
# tweets_df_2020_p = pd.read_csv('/kaggle/input/tweets/PCS_patient_2020_clean.csv')
# tweets_df=tweets_df_2020_p
# tweets_df=tweets_df[tweets_df['month'].isin([5])]
# tweets_df = tweet_density(tweets_df)

# tweets_tokens,dictionary,bow_corpus = vectorize_dic_bag(tweets_df)

# filtered_dict,coherenceList_umass,num_topics_list,num,lda,cm = u_mass(dictionary)

# plotData=plot_TC(num_topics_list,coherenceList_umass)

# lda_model_bow = gensim.models.LdaMulticore(corpus=bow_corpus, num_topics=num, id2word=dictionary, decay=0.5,
#                                            chunksize=10000, passes=10, workers=None, random_state=0)
# lda_viz = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary, mds='tsne')
# pyLDAvis.enable_notebook()
# lda_viz

# count_vectorizer,document_term_matrix=get_matrix(tweets_df)
# n_topics,lda_topic_matrix=topic_fitting(num,document_term_matrix)
# lda_keys = get_keys(lda_topic_matrix)
# lda_categories, lda_counts = keys_to_counts(lda_keys)
# # LDA_topic=save_topic(lda_keys,document_term_matrix,count_vectorizer,sheet_name)
# #sheet_name need to be changed
# LDA_topic=save_topic(lda_keys,document_term_matrix,count_vectorizer,'topic_np_2020_5')
# LDA_topic

# top_5_words,labels=plot_topic(lda_keys,document_term_matrix,count_vectorizer,n_topics,'_2020_5')
