# "Text Classification"

In [1]:
# Packages 

import os
import xml.etree.ElementTree as ET
import numpy as np
import spacy
import random
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
# Extra installations

!python3.10 -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.7.0
    Uninstalling en-core-web-sm-3.7.0:
      Successfully uninstalled en-core-web-sm-3.7.0
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Importing the data

In [3]:
# Specify the directory containing your text files
directory_path = './data/'

# Initialize an empty list to store the data
data_list = []

# Iterate through each file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.xml'):
        file_path = os.path.join(directory_path, filename)
        
        # Read the text file and append its content to the list
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            data_list.append({'Filename': filename, 'Content': content})

# Create a DataFrame from the list
df = pd.DataFrame(data_list)

## Creating the model

We will use **Latent Dirichlet Allocation (LDA)** is an unsupervised technique for topic modeling.

In [4]:
# Setting up hyperparams

ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 10 # This number is a personal assumption may variate according to the results
sp = spacy.load("en_core_web_sm") # tokenizer

# reproducibility

np.random.seed(12)
random.seed(12)

In [5]:
def generate_frequencies(data, max_docs=10000):
    """This function allows us to know the frequency of a word in a text"""
    freqs = Counter()
    all_stopwords = sp.Defaults.stop_words # Words that don't add any special value to the classification process
    # all_stopwords.add() --> to add words that we consider do not add value
    nr_tokens = 0

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(doc)
        for token in tokens:
            token_text = token.text.lower()
            if token_text not in all_stopwords and token.is_alpha:
                nr_tokens += 1
                freqs[token_text] += 1
                
    return freqs
    
def get_vocab(freqs, freq_threshold=3):
    """
    Select the words with more than 3 appearances on a text and 
    add the to the vocabulary
    """
    vocab = {}
    vocab_idx_str = {}
    vocab_idx = 0

    for word in freqs:
        if freqs[word] >= freq_threshold:
            vocab[word] = vocab_idx
            vocab_idx_str[vocab_idx] = word
            vocab_idx += 1

    return vocab, vocab_idx_str

def tokenize_dataset(data, vocab, max_docs=10000):
    nr_tokens = 0
    nr_docs = 0
    docs = []

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(doc)

        if len(tokens) > 1:
            doc = []
            for token in tokens:
                token_text = token.text.lower()
                if token_text in vocab:
                    doc.append(token_text)
                    nr_tokens += 1
            nr_docs += 1
            docs.append(doc)

    print(f"Number of text: {nr_docs}")
    print(f"Number of tokens: {nr_tokens}")

    # Numericalize
    corpus = []
    for doc in docs:
        corpus_d = []

        for token in doc:
            corpus_d.append(vocab[token])

        corpus.append(np.asarray(corpus_d))

    return docs, corpus

In [6]:
data = df["Content"].sample(frac=0.3, random_state=12).values
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Number of text: 3990
Number of tokens: 1827365
Vocab size: 22282


In [7]:
def LDA_Collapsed_Gibbs(corpus, num_iter=200):
    # Initialize counts and Z
    Z = []
    num_docs = len(corpus)
    for _, doc in enumerate(corpus):
        Zd = np.random.randint(low=0, high=NUM_TOPICS, size=(len(doc)))
        Z.append(Zd)

    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d] == k)

    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic, word] += 1        

    nk = np.sum(nkw, axis=1)
    topic_list = [i for i in range(NUM_TOPICS)]
    
    # Loop
    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                # remove z_i because conditioned on z_(-i)
                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                p_z = (ndk[doc_idx, :] + ALPHA) * (nkw[:, word] + BETA) / (nk[:] + BETA*vocab_size)
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                # update n parametersdoc
                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

Z, ndk, nkw, nk = LDA_Collapsed_Gibbs(corpus)    

100%|█████████████████████████████████████████| 200/200 [53:56<00:00, 16.18s/it]


In [9]:
phi = nkw / nk.reshape(NUM_TOPICS,1) # To get the probability distribution

num_words = 10
for k in range(NUM_TOPICS):
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most common words: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print("\n")

Topic 0 most common words: 
research
materials
text
energy
project
programreference
high
related
amp
new


Topic 1 most common words: 
research
related
amp
award
quantum
text
mathematical
project
division
longname


Topic 2 most common words: 
data
research
project
text
amp
learning
related
programreference
systems
nsf


Topic 3 most common words: 
research
ocean
project
related
text
nsf
earth
united
directorate
award


Topic 4 most common words: 
research
water
project
text
environmental
related
climate
programreference
united
change


Topic 5 most common words: 
project
sars
phase
virus
div
p
technology
modified
system
health


Topic 6 most common words: 
div
p
images
research
reports
modified
conference
students
workshop
li


Topic 7 most common words: 
research
text
biological
project
related
programreference
species
award
font
cell


Topic 8 most common words: 
students
research
stem
project
education
science
text
learning
investigator
lastname


Topic 9 most common words: 
social

## Analysis