# Topic Modelling

In [6]:
import numpy as np
import pandas as pd
import spacy
import random
from tqdm import tqdm
from collections import Counter  

In [7]:
# Load the e-mail dataset from emails.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv("emails.csv")

In [8]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [8]:
# Sampler hyperparameters, the spaCy tokenizer, and RNG seeding.

ALPHA = 0.1  # smoothing added to the document-topic counts in the Gibbs update
BETA = 0.1  # smoothing added to the topic-word counts in the Gibbs update
NUM_TOPICS = 20 # number of topics; a coherence score (how similar the words inside a topic are) could be used to tune this value

# spaCy English pipeline; later cells use its tokenizer and its default stop-word list.
sp = spacy.load("en_core_web_sm")

# Seed both generators used by the sampler: np.random draws the initial topic
# assignments and random.choices draws the resampled topics.
np.random.seed(42)
random.seed(42)

In [9]:
def generate_frequencies(data, max_docs = 10000):
    """Count how often each content word occurs in the first ``max_docs`` documents.

    A token is counted when it is purely alphabetic (``token.is_alpha``) and its
    lower-cased text is neither one of spaCy's default stop words nor ``"enron"``.

    Args:
        data: Sliceable sequence of raw document strings.
        max_docs: Maximum number of documents read from the start of ``data``.

    Returns:
        collections.Counter mapping each lower-cased token text to its count.
    """
    # Work on a copy: calling .add() on sp.Defaults.stop_words itself would mutate
    # the set object stored on sp.Defaults and leak into every later use of it.
    stopwords = set(sp.Defaults.stop_words)
    stopwords.add("enron")

    freqs = Counter()
    for doc in data[:max_docs]:
        for token in sp.tokenizer(doc):
            token_text = token.text.lower()
            # Keep alphabetic tokens only (no digits / punctuation) that are not stop words.
            if token.is_alpha and token_text not in stopwords:
                freqs[token_text] += 1
    return freqs



def get_vocab(freqs, freq_thresholds = 3):
    """Build word <-> index lookup tables for the sufficiently frequent words.

    Args:
        freqs: Mapping from word to its occurrence count.
        freq_thresholds: Minimum count a word needs to enter the vocabulary.

    Returns:
        A pair ``(vocab, vocab_idx_str)``: ``vocab`` maps word -> integer id and
        ``vocab_idx_str`` maps integer id -> word. Ids are consecutive from 0,
        assigned in the iteration order of ``freqs``.
    """
    frequent_words = [word for word, count in freqs.items() if count >= freq_thresholds]
    vocab = {word: idx for idx, word in enumerate(frequent_words)}
    vocab_idx_str = dict(enumerate(frequent_words))
    return vocab, vocab_idx_str


def tokenize_dataset(data, vocab, max_docs = 10000):
    """Tokenise documents and encode their in-vocabulary tokens as integer ids.

    Documents for which the tokenizer yields at most one token are skipped.
    Every remaining token is lower-cased and kept only if it is a key of
    ``vocab``. The number of kept documents and tokens is printed.

    Args:
        data: Sliceable sequence of raw document strings.
        vocab: Mapping from word to integer id.
        max_docs: Maximum number of documents read from the start of ``data``.

    Returns:
        A pair ``(docs, corpus)``: ``docs`` is a list of token-string lists and
        ``corpus`` is the parallel list of integer NumPy arrays holding the ids
        of those tokens.
    """
    docs = []
    corpus = []
    nr_tokens = 0

    for raw_doc in data[:max_docs]:
        tokens = sp.tokenizer(raw_doc)
        if len(tokens) <= 1:
            continue

        lowered = (token.text.lower() for token in tokens)
        kept = [text for text in lowered if text in vocab]
        nr_tokens += len(kept)
        docs.append(kept)

        # Explicit integer dtype: np.asarray([]) would otherwise be float64, so a
        # document with no in-vocabulary tokens would yield ids unusable as indices.
        corpus.append(np.asarray([vocab[text] for text in kept], dtype=int))

    print(f"Number of documents : {len(docs)}")
    print(f"Number of tokens : {nr_tokens}")

    return docs, corpus




In [10]:
# Take a reproducible 0.1% random sample of the messages (frac = 0.001) and convert it to an array.
data = df['message'].sample(frac = 0.001, random_state = 42).values

In [11]:
# Count word frequencies, build the vocabulary, then encode the sampled e-mails as arrays of word ids.
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
vocab_size = len(vocab)  # read as a global by LDA_Collapsed_Gibbs
print(f"Vocab size: {vocab_size}")

Number of documents : 517
Number of tokens : 88538
Vocab size: 4966


In [12]:
# Print the vocabulary size again (same value as reported by the previous cell).
print(f"Vocab size: {vocab_size}")

Vocab size: 4966


In [13]:
def LDA_Collapsed_Gibbs(corpus, max_iter = 200):
    """Fit LDA topic assignments to ``corpus`` with a collapsed Gibbs sampler.

    Reads the module-level names ``NUM_TOPICS``, ``ALPHA``, ``BETA`` and
    ``vocab_size``. Initial topics come from ``np.random``; resampled topics
    come from ``random.choices``.

    Args:
        corpus: Sequence of documents, each a sequence of integer word ids.
        max_iter: Number of full sweeps over every token of the corpus.

    Returns:
        A tuple ``(Z, ndk, nkw, nk)``:
            Z   - list with one array per document holding each token's topic,
            ndk - (documents x topics) count of tokens per topic in each document,
            nkw - (topics x vocabulary) count of each word assigned to each topic,
            nk  - total number of tokens assigned to each topic.
    """
    n_docs = len(corpus)
    topic_ids = list(range(NUM_TOPICS))

    # Step 1: give every token a uniformly random starting topic.
    Z = [
        np.random.randint(low=0, high=NUM_TOPICS, size=len(words))
        for words in corpus
    ]

    # Step 2: tabulate the counts implied by that initial assignment.
    ndk = np.zeros((n_docs, NUM_TOPICS))
    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for d, words in enumerate(corpus):
        for k in topic_ids:
            ndk[d, k] = np.sum(Z[d] == k)
        for w, k in zip(words, Z[d]):
            nkw[k, w] += 1
    nk = np.sum(nkw, axis=1)

    # Step 3: repeatedly resample each token's topic given all other assignments.
    beta_total = BETA * vocab_size
    for _ in tqdm(range(max_iter)):
        for d, words in enumerate(corpus):
            z_d = Z[d]
            for pos, w in enumerate(words):
                # Take the current token out of the counts (condition on z_{-i}).
                old_k = z_d[pos]
                ndk[d, old_k] -= 1
                nkw[old_k, w] -= 1
                nk[old_k] -= 1

                # Unnormalised conditional probability of each topic for this token.
                weights = (ndk[d, :] + ALPHA) * (nkw[:, w] + BETA) / (nk[:] + beta_total)
                new_k = random.choices(topic_ids, weights=weights, k=1)[0]

                # Put the token back under its newly drawn topic.
                z_d[pos] = new_k
                ndk[d, new_k] += 1
                nkw[new_k, w] += 1
                nk[new_k] += 1

    return Z, ndk, nkw, nk

# Run the sampler on the encoded corpus with the default max_iter (200 sweeps).
Z, ndk, nkw, nk = LDA_Collapsed_Gibbs(corpus)  


100%|██████████| 200/200 [05:14<00:00,  1.57s/it]


In [14]:
# Topic-word distributions estimated from the final counts:
#   phi[k, w] = (nkw[k, w] + BETA) / (nk[k] + BETA * vocab_size)
# Dividing by the smoothed per-topic totals (rather than by the constant
# NUM_TOPICS) makes every row of phi sum to 1, i.e. a probability distribution.
phi = (nkw + BETA) / np.reshape(nk + BETA * vocab_size, (NUM_TOPICS, 1))

num_words = 10
for k in range(NUM_TOPICS):
    # Indices of the num_words highest-probability words of topic k, best first.
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most common words")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print("\n")

Topic 0 most common words
program
going
day
time
way
school
good
florida
national
people


Topic 1 most common words
cost
energy
month
customers
price
customer
rate
renewable
database
generation


Topic 2 most common words
subject
sent
intended
e
recipient
mail
use
data
information
confidential


Topic 3 most common words
houston
pm
beck
december
tuesday
vince
sally
tickets
available
chris


Topic 4 most common words
jeff
dasovich
james
richard
california
steffes
e
ferc
market
mail


Topic 5 most common words
said
riordan
section
business
freeman
working
meeting
meetings
nemec
staff


Topic 6 most common words
iso
ferc
market
data
ufe
settlement
fee
report
project
december


Topic 7 most common words
x
cc
agreement
content
subject
attached
jones
tana
bcc
charset


Topic 8 most common words
x
content
subject
cc
bcc
date
message
version
origin
mime


Topic 9 most common words
said
company
credit
financial
billion
gas
rating
markets
dow
million


Topic 10 most common words
image
communica