Following this tutorial: https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925

Let's see if there are any patterns in the data and visualise the results

In [1]:
import os
import pandas as pd
import numpy as np
np.random.seed(42)
# nlp
import string
from nltk.corpus import stopwords
import spacy
# visualisation
import pyLDAvis.sklearn
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Load Data

In [2]:
home_path = os.path.expanduser("~")
fp = f'{home_path}/git/30-days-of-NLP/notebooks/Data/bbc-text.csv'
df = pd.read_csv(fp)

There are 5 target topics in the dataset

In [3]:
print(df['category'].unique())

['tech' 'business' 'sport' 'entertainment' 'politics']


### Preprocess

In [4]:
%%time
mystopwords = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')
def clean(doc):
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()]) # remove punctuation and numbers
    doc = " ".join([token.lower() for token in doc.split() if token not in mystopwords])
    doc = " ".join([w.lemma_ for word in doc.split() for w in nlp(word)])
    return doc

df_preprop_fp = f'{home_path}/git/30-days-of-NLP/notebooks/Data/bbc-text-preprocessed.csv'
if not os.path.exists(df_preprop_fp):
    df['prepro'] = df['text'].apply(clean)
    df.to_csv(df_preprop_fp, index=False)
    
else: # load preprocessed(lemmatisated) dataframe
    df = pd.read_csv(df_preprop_fp)
    
# tokenise documents in dataframe
df['prepro'] = df['prepro'].apply(lambda x: x.split(' '))

CPU times: user 442 ms, sys: 43.9 ms, total: 486 ms
Wall time: 485 ms


Example of original and preprocessed data

### Split data into training and testing

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['prepro'], df['category'], 
                                                    test_size=0.25, random_state=42)
X_train.shape, X_test.shape

((1668,), (557,))

In [6]:
document_num = 50
doc_sample = X_train[document_num]
print(doc_sample)

['lewsey', 'puzzle', 'disallow', 'try', 'england', 'josh', 'lewsey', 'claim', 'deny', 'late', 'try', 'side', 'six', 'nation', 'loss', 'ireland', 'wasp', 'wing', 'insist', 'ground', 'ball', 'bundle', 'line', 'say', 'referee', 'jonathan', 'kaplan', 'make', 'wrong', 'decision', 'positive', 'touch', 'ball', 'line', 'lewsey', 'tell', 'bbc', 'sport', 'certainly', 'turnover', 'drive', 'put', 'ball', 'ground', 'whistle', 'go', 'let', 'go', 'ball', 'lewsey', 'add', 'one', 'irish', 'player', 'scoop', 'back', 'whistle', 'surprise', 'referee', 'give', 'turnover', 'far', 'concerned', 'incident', 'mark', 'cueto', 'effort', 'charlie', 'hodgson', 'crossfield', 'kick', 'lead', 'look', 'like', 'good', 'try', 'two', 'key', 'element', 'game', 'cueto', 'also', 'puzzle', 'try', 'disallow', 'kaplan', 'think', 'could', 'offside', 'without', 'doubt', 'behind', 'ball', 'say', 'sale', 'player', 'move', 'plan', 'technique', 'cuff', 'rehearse', 'time', 'time', 'say', 'rob', 'decision', 'go', 'go', 'today', 'go', '

### Text Vectorisation
Using sklearns implementation

In [7]:
# count_vect = TfidfVectorizer()#max_df=0.9, min_df=2)
count_vect = CountVectorizer()#max_df=0.9, min_df=2)
vec = count_vect.fit_transform(df['prepro'].astype(str))

In [8]:
vec

<2225x23431 sparse matrix of type '<class 'numpy.int64'>'
	with 315944 stored elements in Compressed Sparse Row format>

### Latent Dirichlet Allocation (LDA) modelling
I model to 5 topics:

In [9]:
lda = LatentDirichletAllocation(n_components=5, random_state=42, max_iter=100, n_jobs=-1)
lda.fit(vec)

LatentDirichletAllocation(max_iter=100, n_components=5, n_jobs=-1,
                          random_state=42)

In [10]:
print("Log Likelihood: ", lda.score(vec))
print("Perplexity: ", lda.perplexity(vec))

Log Likelihood:  -3784694.387679375
Perplexity:  2731.906127502113


In [11]:
for index, topic in enumerate(lda.components_):
    print(f'Top 5 words for Topic #{index}')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-5:]])
    print('\n')

Top 5 words for Topic #0
['say', 'year', 'award', 'good', 'film']


Top 5 words for Topic #1
['labour', 'government', 'would', 'mr', 'say']


Top 5 words for Topic #2
['company', 'bn', 'we', 'year', 'say']


Top 5 words for Topic #3
['go', 'win', 'play', 'game', 'say']


Top 5 words for Topic #4
['game', 'technology', 'people', 'use', 'say']




### Visualisation

In [12]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, vec, count_vect, mds='tsne')
panel

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Bag of words makes a lot more sense than tf-idf for topic modelling. tf-idf contains the most obscure words