## LDA topic modeling with sklearn

In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
import string
import csv

import warnings
warnings.filterwarnings("ignore")

In [2]:
# stop_words = stopwords.words('english')
# Shared English Snowball stemmer; get_stopwords() and tokenize_and_stem()
# both read this module-level object.
stemmer = SnowballStemmer('english')

def read_in_csv(csv_file):
    """Read a comma-separated file and return its rows.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        string fields of that record.
    """
    # The csv module requires newline='' so that it, rather than the text
    # layer, interprets line endings; otherwise "\r\n" embedded in quoted
    # fields is silently rewritten to "\n".
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        return list(reader)

def get_stopwords(path):
    """Load stopwords from a CSV file and add their stemmed forms.

    Only the first column of each row is used.

    Args:
        path: Path of the stopword CSV file, read with read_in_csv().

    Returns:
        A list holding every stopword followed by the stemmed variants,
        with duplicates removed and first-seen order preserved.
    """
    rows = read_in_csv(path)
    # csv.reader yields an empty list for a blank line; indexing it with [0]
    # would raise IndexError, so such rows are skipped.
    words = [row[0] for row in rows if row]
    stemmed_words = [stemmer.stem(word) for word in words]
    # Many stems equal the original word; dict.fromkeys drops the repeats
    # while keeping the order stable.
    return list(dict.fromkeys(words + stemmed_words))

# Rebinds the module-level name `stopwords` (imported above from nltk.corpus)
# to the list loaded from stopwords.csv.
stopwords = get_stopwords('stopwords.csv')

In [3]:
def tokenize_and_stem(sentence):
    """Tokenize a sentence, drop punctuation tokens and stem the rest.

    Args:
        sentence: Text to split with nltk.word_tokenize.

    Returns:
        A list of stems, one per token that is not made up purely of
        punctuation characters.
    """
    punctuation_chars = set(string.punctuation)
    tokens = nltk.word_tokenize(sentence)
    # `t not in string.punctuation` is a substring test against one long
    # string, so multi-character punctuation tokens such as "``", "''" or
    # "..." were kept. Checking every character removes them as well.
    filtered_token = [
        t for t in tokens
        if not all(ch in punctuation_chars for ch in t)
    ]
    return [stemmer.stem(t) for t in filtered_token]

In [4]:
# Location of the CSV file that is loaded with pd.read_csv further down.
bbc_dataset = '../Chapter04/bbc-text.csv'

In [5]:
def create_count_vectorizer(documents):
    """Fit a CountVectorizer on the documents and vectorize them.

    The vectorizer tokenizes with tokenize_and_stem and filters the
    module-level stopword list.

    Args:
        documents: Iterable of raw text documents.

    Returns:
        A (vectorizer, matrix) tuple: the fitted CountVectorizer and the
        result of fit_transform on the documents.
    """
    fitted_vectorizer = CountVectorizer(tokenizer=tokenize_and_stem, stop_words=stopwords)
    return fitted_vectorizer, fitted_vectorizer.fit_transform(documents)

In [6]:
def clean_data(df):
    """Strip punctuation and digits from the 'text' column in place.

    Every character that is neither a word character nor whitespace is
    replaced by a single space, then all digits are removed.

    Args:
        df: DataFrame with a string column named 'text'.

    Returns:
        The same DataFrame object, with 'text' overwritten.
    """
    def scrub(raw_text):
        without_punctuation = re.sub(r'[^\w\s]', ' ', raw_text)
        return re.sub(r'\d', '', without_punctuation)

    df['text'] = df['text'].apply(scrub)
    return df

In [7]:
def create_and_fit_lda(data, num_topics, random_state=None):
    """Create a LatentDirichletAllocation model and fit it.

    Args:
        data: Document-term matrix to fit on (e.g. the CountVectorizer
            output).
        num_topics: Number of topics (passed as n_components).
        random_state: Optional seed forwarded to the model so that repeated
            runs give the same topics. The default None keeps the previous
            unseeded behaviour.

    Returns:
        The fitted LDA model.
    """
    lda = LDA(n_components=num_topics, n_jobs=-1, random_state=random_state)
    lda.fit(data)
    return lda

In [8]:
def get_most_common_words_for_topics(model, vectorizer, n_top_words):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        model: Fitted topic model exposing `components_`, one row of term
            weights per topic.
        vectorizer: Fitted vectorizer whose feature names line up with the
            columns of `model.components_`.
        n_top_words: Number of terms to report per topic.

    Returns:
        A dict mapping topic index to a list of its `n_top_words` terms,
        ordered from highest to lowest weight.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only for old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    word_dict = {}
    for topic_index, topic in enumerate(model.components_):
        # argsort is ascending, so walk the tail backwards to get the
        # largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        # str() keeps the result made of plain Python strings even when the
        # feature names come back as a NumPy array.
        word_dict[topic_index] = [str(words[i]) for i in top_indices]
    return word_dict

In [9]:
def print_topic_words(word_dict):
    """Print each topic id followed by its list of words.

    Args:
        word_dict: Mapping of topic id to the words describing that topic.
    """
    for topic_id, top_words in word_dict.items():
        print(f'topic {topic_id}')
        print("\t", top_words)

In [10]:
# Load the dataset, strip punctuation/digits from the 'text' column, and keep
# that column as the document collection.
df = pd.read_csv(bbc_dataset)
df = clean_data(df)
documents = df['text']

In [11]:
# Number of topics handed to create_and_fit_lda below.
num_topic = 5

In [12]:
# Vectorize the documents, then fit the LDA model on the resulting matrix.
vectorizer, data = create_count_vectorizer(documents)
lda = create_and_fit_lda(data, num_topic)

In [13]:
# Show the ten highest-weighted (stemmed) terms of every topic.
topic_words = get_most_common_words_for_topics(lda, vectorizer, 10)
print_topic_words(topic_words)

topic 0
	 ['game', 'play', 'player', 'time', 'england', 'year', 'first', 'back', 'win', 'get']
topic 1
	 ['elect', 'labour', 'year', 'parti', 'say', 'blair', 'minist', 'm', 'peopl', 'tori']
topic 2
	 ['peopl', 'year', 'use', 'servic', 'uk', 'new', 'mobil', 'phone', 'technolog', 'music']
topic 3
	 ['film', 'best', 'award', 'm', 'star', 'year', 'includ', 'show', 'director', 'actor']
topic 4
	 ['compani', 'bn', 'year', 'firm', 'm', 'share', 'market', 'new', 'sale', 'bank']


In [14]:
import pickle

In [15]:
# File names for the pickled model and vectorizer.
# NOTE(review): presumably meant for save_model(); no call using them is
# visible in this section.
model_path = "lda_sklearn.pkl"
vectorizer_path = "vectorizer.pkl"

In [16]:
# Sample news text passed to test_new_example() below.
# NOTE(review): "firsthalf" looks like "first-half" with the hyphen lost at a
# line break (the gensim section uses "first-half") - confirm before editing,
# since the text is runtime input.
new_example = """Manchester United players slumped
to the turf at full-time in Germany on Tuesday in
acknowledgement of what their latest pedestrian firsthalf display had cost them. The 3-2 loss at RB Leipzig
means United will not be one of the 16 teams in the draw
for the knockout stages of the Champions League. And
this is not the only price for failure. The damage will
be felt in the accounts, in the dealings they have with
current and potentially future players and in the faith
the fans have placed in manager Ole Gunnar Solskjaer.
With Paul Pogba's agent angling for a move for his
client and ex-United defender Phil Neville speaking of a
"witchhunt" against his former team-mate Solskjaer, BBC
Sport looks at the ramifications and reaction to a big
loss for United."""

In [17]:
def save_model(lda, lda_path, vect, vect_path):
    """Pickle the topic model and its vectorizer to disk.

    Args:
        lda: Model object to serialize.
        lda_path: Destination file for the model.
        vect: Vectorizer object to serialize.
        vect_path: Destination file for the vectorizer.
    """
    # Context managers guarantee each file is flushed and closed, even if
    # pickling raises; the bare open() calls leaked the handles.
    with open(lda_path, 'wb') as lda_file:
        pickle.dump(lda, lda_file)
    with open(vect_path, 'wb') as vect_file:
        pickle.dump(vect, vect_file)

In [18]:
def test_new_example(lda, vect, example):
    """Vectorize one text, run it through the model and print the result.

    Args:
        lda: Fitted model exposing transform().
        vect: Fitted vectorizer exposing transform().
        example: Raw text of the document to score.

    Returns:
        Whatever lda.transform returns for the single vectorized document
        (also printed).
    """
    topic_distribution = lda.transform(vect.transform([example]))
    print(topic_distribution)
    return topic_distribution

In [19]:
# Score the sample text with the fitted sklearn model and vectorizer.
test_new_example(lda, vectorizer, new_example)

[[0.67490796 0.09460827 0.00351055 0.00353303 0.22344018]]


array([[0.67490796, 0.09460827, 0.00351055, 0.00353303, 0.22344018]])

## LDA topic modeling with gensim

In [21]:
import re
import pandas as pd
from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt
from pprint import pprint


In [22]:
# Input locations for the gensim section: the dataset CSV and the stopword CSV.
bbc_dataset = '../Chapter04/bbc-text.csv'
stopwords_file_path = "stopwords.csv"

In [23]:
# stop_words = stopwords.words('english')
# English Snowball stemmer read by get_stopwords() below.
# NOTE(review): SnowballStemmer and csv are not imported in this section's
# import cell; this relies on the imports at the top of the file.
stemmer = SnowballStemmer('english')

def read_in_csv(csv_file):
    """Return all records of a comma-separated file as lists of strings.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of rows, each row being a list of field strings.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it "\r\n" inside quoted fields is
    # rewritten to "\n" by the text layer.
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        return list(reader)

def get_stopwords(path):
    """Build the stopword list from the first column of a CSV file.

    Args:
        path: Path of the stopword CSV file, read with read_in_csv().

    Returns:
        A list of the stopwords followed by their stemmed forms, without
        duplicates and in first-seen order.
    """
    rows = read_in_csv(path)
    # Blank lines come back from csv.reader as empty rows; skip them instead
    # of failing with IndexError on row[0].
    words = [row[0] for row in rows if row]
    stemmed_words = [stemmer.stem(word) for word in words]
    # Stems frequently coincide with the original word; dict.fromkeys removes
    # the repeats and keeps insertion order.
    return list(dict.fromkeys(words + stemmed_words))

# Module-level stopword list read by preprocess() and clean_text().
stopwords = get_stopwords(stopwords_file_path)

In [24]:
def clean_data(df):
    """Remove punctuation and digits from the 'text' column in place.

    Characters that are neither word characters nor whitespace become a
    space; digits are then deleted.

    Args:
        df: DataFrame with a string column named 'text'.

    Returns:
        The same DataFrame object, with 'text' overwritten.
    """
    text_column = df['text']
    text_column = text_column.apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
    text_column = text_column.apply(lambda x: re.sub(r'\d', '', x))
    df['text'] = text_column
    return df

In [25]:
def preprocess(df):
    """Clean, tokenize and stopword-filter the 'text' column.

    Args:
        df: DataFrame with a string column named 'text'.

    Returns:
        The DataFrame returned by clean_data(), whose 'text' column now
        holds a list of tokens per row.
    """
    df = clean_data(df)
    # Build the lookup set once: testing membership against the stopword
    # list for every token is a linear scan each time.
    stopword_set = set(stopwords)
    df['text'] = df['text'].apply(lambda x: simple_preprocess(x, deacc=True))
    df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stopword_set])
    return df

In [26]:
def create_lda_model(id_dict, corpus, num_topics):
    """Train a gensim LdaModel with this notebook's fixed settings.

    Args:
        id_dict: Dictionary passed as id2word.
        corpus: Corpus to train on.
        num_topics: Number of topics to learn.

    Returns:
        The trained LdaModel.
    """
    training_options = {
        'corpus': corpus,
        'id2word': id_dict,
        'num_topics': num_topics,
        'random_state': 100,
        'chunksize': 100,
        'passes': 10,
    }
    return LdaModel(**training_options)

In [27]:
# Reload the dataset and turn its 'text' column into stopword-filtered
# token lists.
df = pd.read_csv(bbc_dataset)
df = preprocess(df)

In [30]:
# Build the gensim dictionary from the token lists, then convert every
# document to its bag-of-words form via doc2bow.
texts = df['text'].values
id_dict = corpora.Dictionary(texts)
corpus = [id_dict.doc2bow(text) for text in texts]

In [32]:
# Train the gensim LDA model with five topics.
number_topics = 5
lda_model = create_lda_model(id_dict, corpus, number_topics)

In [33]:
# Pretty-print the weighted top terms of each learned topic.
pprint(lda_model.print_topics())

[(0,
  '0.010*"net" + 0.008*"software" + 0.007*"users" + 0.007*"information" + '
  '0.007*"people" + 0.006*"attacks" + 0.006*"computer" + 0.006*"data" + '
  '0.006*"use" + 0.005*"firms"'),
 (1,
  '0.012*"people" + 0.006*"blair" + 0.005*"labour" + 0.005*"new" + '
  '0.005*"mobile" + 0.005*"party" + 0.004*"get" + 0.004*"government" + '
  '0.004*"uk" + 0.004*"election"'),
 (2,
  '0.012*"film" + 0.009*"best" + 0.006*"music" + 0.006*"year" + 0.005*"show" + '
  '0.005*"new" + 0.004*"uk" + 0.004*"awards" + 0.004*"films" + 0.004*"last"'),
 (3,
  '0.008*"game" + 0.006*"england" + 0.006*"first" + 0.006*"time" + '
  '0.006*"year" + 0.005*"players" + 0.005*"win" + 0.005*"world" + 0.005*"back" '
  '+ 0.005*"last"'),
 (4,
  '0.010*"bn" + 0.010*"year" + 0.007*"sales" + 0.005*"last" + '
  '0.004*"government" + 0.004*"new" + 0.004*"market" + 0.004*"growth" + '
  '0.004*"spending" + 0.004*"economic"')]


In [34]:
# Sample news text passed to test_new_example() at the end of this section.
new_example = """Manchester United players slumped to the
turf
at full-time in Germany on Tuesday in acknowledgement of
what their
latest pedestrian first-half display had cost them. The
3-2 loss at
RB Leipzig means United will not be one of the 16 teams
in the draw
for the knockout stages of the Champions League. And this
is not the
only price for failure. The damage will be felt in the
accounts, in
the dealings they have with current and potentially
future players
and in the faith the fans have placed in manager Ole
Gunnar Solskjaer.
With Paul Pogba's agent angling for a move for his client
and ex-United
defender Phil Neville speaking of a "witchhunt" against
his former team-mate
Solskjaer, BBC Sport looks at the ramifications and
reaction to a big loss for United."""

In [35]:
def save_model(lda, lda_path, id_dict, dict_path):
    """Save the model and its dictionary through their own save() methods.

    Args:
        lda: Model object exposing save(path).
        lda_path: Destination path for the model.
        id_dict: Dictionary object exposing save(path).
        dict_path: Destination path for the dictionary.
    """
    # The model is written first, then the dictionary.
    for artifact, destination in ((lda, lda_path), (id_dict, dict_path)):
        artifact.save(destination)

In [36]:
def load_model(lda_path, dict_path):
    """Load a saved LdaModel and its Dictionary from disk.

    Args:
        lda_path: Path the model was saved to.
        dict_path: Path the dictionary was saved to.

    Returns:
        A (model, dictionary) tuple.
    """
    restored_model = LdaModel.load(lda_path)
    restored_dictionary = corpora.Dictionary.load(dict_path)
    return restored_model, restored_dictionary

In [37]:
def test_new_example(lda, id_dict, input_string):
    """Score a raw text against the model and print its topics.

    The text is tokenized with clean_text(), converted to bag-of-words
    with the dictionary, and looked up in the model.

    Args:
        lda: Trained model supporting lda[bow] lookup.
        id_dict: Dictionary exposing doc2bow().
        input_string: Raw text of the document to score.

    Returns:
        The result of lda[bow] for the document (also printed).
    """
    tokens = clean_text(input_string)
    topic_weights = lda[id_dict.doc2bow(tokens)]
    print(topic_weights)
    return topic_weights

In [44]:
def clean_text(input_string):
    """Turn a raw string into the token list expected by the trained model.

    Applies the same steps as preprocess(): punctuation becomes spaces,
    digits are dropped, the text is tokenized, and stopwords are removed.

    Args:
        input_string: Raw text to clean.

    Returns:
        A list of tokens with stopwords removed.
    """
    input_string = re.sub(r'[^\w\s]', ' ', input_string)
    input_string = re.sub(r'\d', '', input_string)
    # preprocess() tokenizes the training data with deacc=True; use the same
    # setting here so accented words map to the same dictionary entries.
    input_list = simple_preprocess(input_string, deacc=True)
    # Set lookup avoids a linear scan of the stopword list per token.
    stopword_set = set(stopwords)
    return [word for word in input_list if word not in stopword_set]

In [45]:
# Destination files for the gensim model and dictionary passed to save_model().
model_path = "lda_gensim.pkl"
dict_path = "id_dict.pkl"

In [46]:
# Persist the trained model and its dictionary.
save_model(lda_model, model_path, id_dict, dict_path)

In [47]:
# Score the sample text with the trained gensim model.
test_new_example(lda_model, id_dict, new_example)

[(0, 0.023436217), (1, 0.036407087), (3, 0.7584859), (4, 0.17845576)]


[(0, 0.023436217), (1, 0.036407087), (3, 0.7584859), (4, 0.17845576)]

## NMF topic modeling