## Data preparation
Today we will use the 20newsgroups dataset as seen in the "bitezize-NLP-prep-20newsgroups" repository

In [1]:
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.datasets import fetch_20newsgroups

stop_words = stopwords.words("english") #load the stop words (words to ignore list) for english
df = pd.DataFrame(pd.Series(fetch_20newsgroups(subset='train').data))

In [2]:
# Comment extraction

def extractComments(x):
    ''' INPUT: a string
        OUTPUT: the right side of the string after splitting it
            on the first double line break
    '''
    l = x.split('\n\n',1)
    return l[1]

df['comments'] = df[0].apply(lambda x: extractComments(x)).astype(str)

In [3]:
# Comment cleaning

def scrubString(x):
    ''' INPUT: a string
        OUTPUT: a string that has had links removed, then non-letters, then english stopwords
            This will produce a blank string if it only consisted of links, numbers, etc
    '''
    lemmatizer = WordNetLemmatizer()
    
    x = re.sub("\S*@\S*\s?","",x) #Remove email addresses
    x = re.sub("#\S+|&\S+|@\S+|https?:\S+|RT|[^A-Za-z0-9]+",' ', x) #Remove hyperlinks
    x = re.sub("&\S*|@\S+|https?:\S+",' ', x) #Remove more hyperlinks
    x = re.sub("[^A-Za-z']+",' ',x) #keep only letters

    if len(x)==0:
        return ''
    
    tokens = word_tokenize(x) # Convert the string into tokens
    
    # Lemmatize the words, and only keep non-stop words
    tokens = [lemmatizer.lemmatize(word).strip() for word in tokens if word not in stop_words]
    
    if len(tokens)==0:
        return ''
    
    return ' '.join(map(str,tokens))

df['cleaned'] = df['comments'].apply(lambda x: scrubString(x))

## Topic modelling
Now we get to use gensim to extract topics

In [4]:
# Extract and save the tokens of the cleaned text

def extract_tokens(x, min_len=5):
    ''' INPUT: a string
        OUTPUT: a list of tokens with that meet a minimum length
    '''
    tokens = word_tokenize(x)
    tokens = [token for token in tokens if len(token) >= min_len]
    return tokens

df['tokens'] = df['cleaned'].apply(lambda x: extract_tokens(x))

In [5]:
# Generate 

import gensim
from gensim import corpora

text_data = df['tokens'].to_list()

dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=15, id2word=dictionary, passes=15)
ldamodel.save('model_15.gensim')

topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.033*"entry" + 0.014*"section" + 0.013*"output" + 0.013*"program" + 0.009*"build"')
(1, '0.028*"image" + 0.020*"window" + 0.019*"server" + 0.016*"program" + 0.013*"format"')
(2, '0.016*"period" + 0.014*"militia" + 0.010*"Second" + 0.009*"Islanders" + 0.008*"Kings"')
(3, '0.034*"writes" + 0.031*"article" + 0.007*"would" + 0.007*"think" + 0.006*"engine"')
(4, '0.017*"player" + 0.012*"season" + 0.010*"writes" + 0.009*"think" + 0.008*"hockey"')
(5, '0.012*"would" + 0.012*"drive" + 0.012*"problem" + 0.009*"Thanks" + 0.009*"system"')
(6, '0.015*"device" + 0.011*"encryption" + 0.011*"RIPEM" + 0.010*"ground" + 0.009*"circuit"')
(7, '0.017*"space" + 0.010*"Space" + 0.009*"launch" + 0.008*"orbit" + 0.007*"would"')
(8, '0.028*"Jesus" + 0.013*"Christ" + 0.009*"church" + 0.007*"point" + 0.007*"Church"')
(9, '0.012*"Gordon" + 0.012*"Master" + 0.010*"Banks" + 0.009*"cover" + 0.008*"article"')
(10, '0.015*"Israel" + 0.011*"people" + 0.010*"Turkish" + 0.010*"Israeli" + 0.009*"Armenian"')
(11, '0.

In [19]:
# Generate visualizations for the topic model and save to an HTML file

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim_models.prepare(ldamodel,corpus,dictionary,sort_topics=False)
pyLDAvis.save_html(vis,'lda_15.html')

  and should_run_async(code)
