### Sample program for Topic Model Analysis using gensim (LDA)    

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_numeric
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

#### Parameters  

In [None]:
# File name for the trained topic model.
model_file = "topic_newsgroups5-1.model"

# Input CSV file with the documents to analyse.
csv_in = "newsgroups5-1.csv"

#### Read CSV file  

In [None]:
# Load the input CSV; its first row is used as the column header.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would add a stray "None").
df.info()
display(df.head())

#### Check the number of documents in each category  

In [None]:
# Count the documents per category once and reuse the result:
# the number of distinct categories is the length of that table.
target_counts = df['target'].value_counts()
print(target_counts)
n_targets = len(target_counts)
print(n_targets)

#### Assign docID according to its category  
- docID = 'd' + number, such as d0, d1, ..., d1000, d1001, ...
 - number = target * 1000 + j

In [None]:
# Running number of each document within its own category (0, 1, 2, ...).
# groupby().cumcount() replaces the explicit row loop and its per-category
# counters, and does not depend on the DataFrame's index labels.
within_target_no = df.groupby('target').cumcount()

# base of document ID:
#   0 for documents of target 0, 1000 for documents of target 1,
#   2000 for documents of target 2, ...
# NOTE: IDs are only unique while every category has fewer than 1000 documents.
doc_numbers = df['target'].astype(int) * 1000 + within_target_no
docID = ('d' + doc_numbers.astype(str)).tolist()
df['docID'] = docID
display(df.head())

### Remove stop words, punctuation, etc.   
- Use custom filters  
- In this case, strip_multiple_whitespaces is not used.  

In [None]:
def _to_lower(text):
    """Return the text converted to lower case."""
    return text.lower()


# Filters applied in this order by preprocess_string().
CUSTOM_FILTERS = [
    _to_lower,
    strip_tags,
    remove_stopwords,
    strip_punctuation,
    strip_numeric,
]


def _tokenize(text):
    """Run the custom filter chain on one document and return its word list."""
    return preprocess_string(text, CUSTOM_FILTERS)


# The 'content' column is overwritten: raw text in, list of words out.
df['content'] = df['content'].map(_tokenize)
display(df.head())

**Non-significant words are omitted.  
Now the value of "content" column is a list of words.**

#### Build dictionary  

In [None]:
# Build the word <-> id dictionary from the tokenized documents,
# then report its basic statistics.
dic = Dictionary(df['content'])
for label, value in (
    ('#docs:', dic.num_docs),
    ('#vocabulary_size:', len(dic)),
    ('#words_in_total:', dic.num_pos),
):
    print(label, value)

#### Check frequency of each word in corpus  

In [None]:
# One row per dictionary entry: word id, the word itself and its frequency
# as recorded in dic.cfs.
df_dic = pd.DataFrame({
    'word_id': list(dic.cfs.keys()),
    'frequency': list(dic.cfs.values()),
})
# Place the readable word between the id and the frequency columns.
df_dic.insert(1, 'word', df_dic['word_id'].map(dic))

# Most frequent words first; show both ends of the ranking.
df_dic_sorted = df_dic.sort_values(by='frequency', ascending=False)
display(df_dic_sorted.head())
display(df_dic_sorted.tail())

#### Filter word set (omit too rare / too frequent words)  
- no_below: Minimum number of documents a word must appear in to be kept  
- no_above: Maximum fraction of documents a word may appear in (words appearing in more documents than this are omitted)  

In [None]:
# Drop words that occur in fewer than 5 documents or in more than 10% of all
# documents (the dictionary is modified in place).
dic.filter_extremes(no_below=5, no_above=0.1)
print('#docs:', dic.num_docs)
print('#vocabulary_size:', len(dic))
# dic.num_pos counts every word processed while the dictionary was built and
# is not recomputed by filter_extremes(), so the number of remaining words is
# taken from the filtered per-word frequencies instead.
print('#words_in_total:', sum(dic.cfs.values()))

**After the filtering, the vocabulary size has shrunk.**  

#### Check frequency of each word in corpus after filtering  

In [None]:
# Rebuild the frequency table from the filtered dictionary:
# start from (word_id, frequency) pairs, then attach the word text.
df_dic = pd.DataFrame(list(dic.cfs.items()), columns=['word_id', 'frequency'])
df_dic['word'] = df_dic['word_id'].map(dic)
df_dic = df_dic[['word_id', 'word', 'frequency']]

# Most frequent words first; show both ends of the ranking.
df_dic_sorted = df_dic.sort_values(by='frequency', ascending=False)
display(df_dic_sorted.head())
display(df_dic_sorted.tail())

#### Make corpus (BoW for each document)  

In [None]:
# Convert every tokenized document into its bag-of-words representation
# and show the first two as a sample.
corpus = list(map(dic.doc2bow, df['content']))
print('corpus:', corpus[:2], sep='\n')

#### Calculation of LDA  
- Set n_topics = 5 for example (same as the number of newsgroups)  
- Set alpha = 'symmetric', i.e. prior probability for each topic is uniform      

In [None]:
n_topics = 5
alpha = 'symmetric'
# id2word=dic stores the vocabulary inside the model, so the model itself can
# map word ids back to words instead of falling back to an identity mapping
# inferred from the corpus. random_state is fixed for reproducible results.
lda = LdaModel(corpus, id2word=dic, num_topics=n_topics, alpha=alpha, random_state=1)

#### Topic probability for each doc  
- A list of (topic_id, topic_probability)  

In [None]:
# Print the (topic_id, topic_probability) list of the first 7 documents as samples
for bow in corpus[:7]:
    print(lda.get_document_topics(bow))

#### Correspondence between category (newsgroup) and topics  
- Average probability of topics of documents for each category  

In [None]:
# cat_topics[t, k]: sum (later the mean) of the probability of topic k over
#                   the documents of category t
# n_tgt_docs[t]:    number of documents of category t
# tgt_names[t]:     newsgroup name of category t (taken from its first document)
cat_topics = np.zeros((n_targets, n_topics))
n_tgt_docs = np.zeros(n_targets)
tgt_names = ['']*n_targets
# Walk documents and their labels in parallel so that the loop does not depend
# on the DataFrame's index labels.
for bow, target_id, target_name in zip(corpus, df['target'], df['target_names']):
    if tgt_names[target_id] == '':
        tgt_names[target_id] = target_name
    n_tgt_docs[target_id] += 1
    # minimum_probability=0.0 keeps low-probability topics in the result; with
    # the default threshold they are discarded, which would bias the averages.
    for topic_id, topic_prob in lda.get_document_topics(bow, minimum_probability=0.0):
        cat_topics[target_id, topic_id] += topic_prob

# Turn the per-category sums into per-category averages.
cat_topics /= n_tgt_docs[:, np.newaxis]

df_cat_topics = pd.DataFrame(cat_topics)
df_cat_topics['target_names'] = tgt_names
display(df_cat_topics)

#### Word probability for each topic   

In [None]:
# Number of highest-probability words listed per topic.
n_top = 5
for topic_no in range(n_topics):
    print(f'Topic #{topic_no}')
    top_terms = lda.get_topic_terms(topic_no, topn=n_top)
    for word_id, prob in top_terms:
        # Translate the word id back to the word via the dictionary.
        print(dic[word_id], prob)
    print('')

#### Visualization of word probability using wordcloud  

In [None]:
from wordcloud import WordCloud

wc = WordCloud(
    background_color="white",
    colormap="winter",
    collocations=False,
)

# Two word clouds per row. The number of rows follows n_topics so that the
# grid has room for more than 6 topics; it never drops below 3 rows, which
# keeps the layout for n_topics <= 6 as before.
n_cols = 2
n_rows = max(3, -(-n_topics // n_cols))  # -(-a // b) is ceiling division

fig = plt.figure(figsize=(6,8))
fig.subplots_adjust(left=0, right=1, bottom=0, top=0.5, hspace=0.3)
for i in range(n_topics):
    plt.subplot(n_rows, n_cols, i+1)
    # word -> probability for the 20 most probable words of topic i
    x = {dic[t[0]]:t[1] for t in lda.get_topic_terms(i, topn=20)}
    im = wc.generate_from_frequencies(x)
    plt.imshow(im)
    plt.axis('off')
    plt.title('Topic #'+str(i))
plt.show()

**There appears to be a correspondence between categories (newsgroups) and topics.  
Note that topic probabilities are determined per document, not per newsgroup, and that the articles of a newsgroup cover various subjects (e.g. NOT all articles in rec.sport.baseball talk about baseball).**  

**Target 0 (rec.autos): mainly generated from topics #1 and #3.  
Target 1 (soc.religion.christian): mainly generated from topic #4.  
Target 2 (rec.sport.baseball): mainly generated from topic #0.  
Target 3 (sci.med): mainly generated from topics #2 and #3.  
Target 4 (sci.electronics): mainly generated from topics #1 and #4.**