## LDA With Newsgroups

Reference
- https://www.kaggle.com/sumantindurkhya/topic-modeling-on-20-newsgroup-data-lsa-and-lda

## Step 1 - Load Data

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups to load from the 20-newsgroups training split
categories = ['rec.sport.baseball', 'sci.crypt', 'comp.graphics', 'misc.forsale']
# Strip headers, footers and quoted replies so only each post's own body text
# remains. The key must be spelled 'quotes': unrecognised keys are silently
# ignored, which would leave the quoted text in the data.
data = fetch_20newsgroups(subset='train', categories=categories, shuffle=True,
                          remove=('headers', 'footers', 'quotes'), random_state=123)
data.keys()

## Step 2 - Explore Data

In [None]:
import pandas as pd

# One row per post: the raw text plus its numeric category label
news_df = pd.DataFrame({'text': data.data, 'Target': data.target})

# Add the human-readable newsgroup name that corresponds to each numeric label
news_df['Target_name'] = news_df['Target'].map(lambda label: data.target_names[label])

print('news_df.shape :', news_df.shape)
news_df.sample(10)

## Step 3 - Cleanup Text

In [None]:
# remove non alphabetic characters
# remove stopwords and lemmatize
import re
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Per-call invariants are built once and reused, because clean_text runs once
# per document and rebuilding them (in particular re-reading the stopword
# list) for every call is wasted work.
_NON_ALPHA_RE = re.compile(r'[^a-z]+')
_MIN_WORD_LEN = 3      # tokens shorter than this are dropped
_STOPWORDS = None      # filled on first clean_text() call
_LEMMATIZER = None     # filled on first clean_text() call


def clean_text(sentence):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lowercase the text and replace every run of characters outside
         a-z with a single space;
      2. tokenize with NLTK's ``word_tokenize``;
      3. drop English stopwords and tokens shorter than 3 characters;
      4. lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    sentence : str
        Raw text of one document.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.

    Notes
    -----
    NOTE(review): presumably requires the NLTK tokenizer, stopwords and
    WordNet resources to be installed already - confirm for your environment.
    """
    global _STOPWORDS, _LEMMATIZER
    # Lazy initialisation: NLTK resources are only touched when the function
    # is first used, not when this cell is defined.
    if _STOPWORDS is None:
        _STOPWORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Keep alphabetic content only
    sentence = _NON_ALPHA_RE.sub(' ', sentence.lower()).strip()

    # Filter first, then lemmatize: very short words carry little information
    return [
        _LEMMATIZER.lemmatize(word)
        for word in word_tokenize(sentence)
        if len(word) >= _MIN_WORD_LEN and word not in _STOPWORDS
    ]



In [None]:
%%time 

from tqdm import tqdm

# Cleaning every post takes a while, so show a progress bar:
# tqdm.pandas() is what makes `progress_apply` available on pandas objects.
tqdm.pandas()
# Run clean_text over each post and keep the resulting token list in 'tokens'
news_df['tokens'] = news_df['text'].progress_apply(lambda raw: clean_text(str(raw)))


In [None]:
# 'text' still holds the raw posts; the cleaned token lists are in the new 'tokens' column
news_df.sample(10)

In [None]:
# Plain Python list of token lists (one entry per document)
texts = news_df['tokens'].tolist()
len(texts)

## Step 5 - Build Dictionary and Corpus for LDA

In [None]:
from gensim import corpora
from pprint import pprint
import random

# Build the id <-> term dictionary from the tokenized documents
dictionary = corpora.Dictionary(texts)
print ('dict: num_tokens : ', len(dictionary))
# random.sample() requires a sequence (passing the items view raises TypeError
# on Python 3.11+), so materialize the (id, term) pairs first. The sample size
# is capped so a very small vocabulary does not raise ValueError.
dictionary_items = list(dictionary.items())
for x in random.sample(dictionary_items, min(10, len(dictionary_items))):
    print(x)
print('-----')

# Convert each tokenized document into its bag-of-words representation
# (the document-term matrix, one doc2bow result per document)
corpus = [dictionary.doc2bow(text) for text in texts]
print ('corpus: num_docs : ', len(corpus))

# Show a few documents; capped for the same reason as above
for x in random.sample(corpus, min(3, len(corpus))):
    print (x)
    print ()

## TODO : can you interpret the dictionary and corpus output printed by this cell?

## Step 6 - LDA Model

In [None]:
%%time 

import gensim
from gensim.models.ldamodel import LdaModel

# Train the LDA model
## TODO : Experiment with these properties
## topic_count : change this number and see what topics are coming up
##               (e.g. len(categories) asks for one topic per newsgroup)
## passes : try changing this too

topic_count = 10

# LDA training is stochastic; a fixed random_state keeps the topics the same
# across re-runs so that changes to topic_count / passes can be compared.
ldamodel = LdaModel(corpus, num_topics=topic_count, id2word=dictionary, passes=20,
                    random_state=123)
print (ldamodel)

In [None]:
from pprint import pprint

# num_topics=-1 prints every topic; the default stops at 20, which would
# silently hide topics once topic_count is raised above that.
pprint(ldamodel.print_topics(num_topics=-1))