## Data preparation
Today we will use the 20newsgroups dataset, prepared the same way as in the "bitezize-NLP-prep-20newsgroups" repository.

In [1]:
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.datasets import fetch_20newsgroups

# English stop words: the list of words that the cleaning step ignores.
stop_words = stopwords.words("english")

# One-column frame (column label 0) holding the raw training documents.
df = pd.Series(fetch_20newsgroups(subset='train').data).to_frame()

In [2]:
# Comment extraction

def extractComments(x):
    ''' INPUT: a string
        OUTPUT: the right side of the string after splitting it
            on the first double line break, or an empty string when
            the string contains no double line break
    '''
    # partition() always yields three parts, so a string without '\n\n'
    # gives an empty right-hand side instead of raising IndexError the
    # way split(...)[1] would.
    _left, _separator, right = x.partition('\n\n')
    return right

# Apply extractComments to every raw document in column 0 and store the
# result as strings.
df['comments'] = df[0].apply(extractComments).astype(str)

In [4]:
# Comment cleaning

def scrubString(x):
    ''' INPUT: a string
        OUTPUT: a string that has had email addresses and links removed, then
            non-letters, then english stopwords (compared case-insensitively);
            the remaining words are lemmatized and joined with single spaces.
            This will produce a blank string if it only consisted of links, numbers, etc
    '''
    # Raw strings keep "\S", "\s" and "\b" as regex escapes instead of
    # invalid Python string escapes.
    x = re.sub(r"\S*@\S*\s?", "", x) #Remove email addresses

    # Replace hashtag-, entity- and link-like tokens, the stand-alone marker
    # "RT" and every run of non-alphanumeric characters with a space.
    # The \b anchors stop "RT" from being cut out of longer words such as
    # "ARTICLE". After this pass only letters, digits and spaces remain, so
    # no second link-removal pass is needed.
    x = re.sub(r"#\S+|&\S+|@\S+|https?:\S+|\bRT\b|[^A-Za-z0-9]+", ' ', x)
    x = re.sub(r"[^A-Za-z]+", ' ', x) #keep only letters

    # Nothing but whitespace left: skip tokenizing altogether.
    if not x.strip():
        return ''

    lemmatizer = WordNetLemmatizer()
    stop_set = set(stop_words) # constant-time membership tests per token

    tokens = word_tokenize(x) # Convert the string into tokens

    # Lemmatize the words, and only keep non-stop words. The comparison is
    # done on the lower-cased token so capitalised stop words ("The", "And")
    # are dropped as well; kept tokens retain their original case.
    kept = [lemmatizer.lemmatize(word) for word in tokens if word.lower() not in stop_set]

    # Joining an empty list yields '', which covers the "only stop words" case.
    return ' '.join(kept)

# Run scrubString over every extracted comment and store the result.
df['cleaned'] = df['comments'].apply(scrubString)

## Topic modelling
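Now we get to use gensim to extract topics. The gensim package must be installed in the kernel's environment first (for example with `%pip install gensim`); otherwise the import below fails with `ModuleNotFoundError`.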

In [6]:
# Extract and save the tokens of the cleaned text

def extract_tokens(x, min_len=5):
    ''' INPUT: a string, and the minimum token length to keep (default 5)
        OUTPUT: a list of the string's tokens that are at least
            min_len characters long, in their original order
    '''
    long_enough = []
    for candidate in word_tokenize(x):
        # Skip anything shorter than the requested minimum length.
        if len(candidate) < min_len:
            continue
        long_enough.append(candidate)
    return long_enough

# Tokenize every cleaned comment with extract_tokens' default minimum
# length and store the resulting token lists.
df['tokens'] = df['cleaned'].apply(extract_tokens)

In [7]:
import gensim

ModuleNotFoundError: No module named 'gensim'