# BERT Clustering

Unsupervised clustering of forum posts using the text body. A simple approach to try and find similar posts without strict classification.

Right now the model doesn't do anything, which is very sad.

References

1. [Topic Modeling with BERT](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6)
2. [Multi-label Text Classification using BERT - The Mighty Transformer](https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d)
3. [BERT Fine-Tuning Tutorial with PyTorch](https://mccormickml.com/2019/07/22/BERT-fine-tuning)
4. [Using BERT For Classifying Documents with Long Texts](https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d)

In [None]:
!pip install transformers -q

import re
import math
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from tokenizers.normalizers import BertNormalizer

In [None]:
# Seed NumPy's global RNG so later np.random calls are reproducible.
# (Building np.random.RandomState(123) and discarding it seeds nothing.)
np.random.seed(123)
# registers the progress_apply methods on pandas objects
tqdm.pandas()

# Always bind `device`, so code that refers to it works with or without a GPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'using {torch.cuda.get_device_name(0)}')
else:
    device = torch.device('cpu')
    print('no GPU avaiable')

## Load Data

In [None]:
# Read the SitePoint posts JSON from the project's GitHub repository.
SITEPOINT_JSON_URL = (
    'https://github.com/charlotte-zhuang/forum-recommender'
    '/blob/master/data/sitepoint.json?raw=true'
)
df_ori = pd.read_json(SITEPOINT_JSON_URL)
df_ori.head()

## Preprocessing

In [None]:
def pre_tokenize_post(text_l: list) -> str:
    '''Joins the pieces of a post and normalizes the result so it is
        ready for tokenization.

    Args:
        text_l: Strings that together make up one post.

    Returns:
        One lowercased string with links removed, punctuation replaced
        by spaces and every whitespace run collapsed to a single space.
    '''

    # (pattern, replacement) pairs; the order matters and is applied as listed
    substitutions = (
        # drop links
        (r'https?:\/\/[^\s]*', r''),
        # unwrap text enclosed in a ‘…’ pair, keeping the inner text
        (r"‘([^‘’]*)’", r'\1'),
        # anything that is not a word char, whitespace or ’ becomes a space
        (r"[^\w\s’]+", r' '),
        # squeeze whitespace runs down to one space
        (r'\s+', r' '),
    )

    cleaned = ' '.join(text_l)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower()

In [None]:
# keep everything except the title and tags columns, then clean each post
df = df_ori.drop(columns=['title', 'tags'])
df['post'] = df['post'].map(pre_tokenize_post)

# print the second column of the first five rows, separated by blank lines
for row_idx in range(5):
    print(df.iat[row_idx, 1], end='\n\n')

In [None]:
# optionally drop data for convenience
# df = df[:1000]

## Tokenize

In [None]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# NOTE(review): this binds a `normalizer` attribute on the tokenizer object.
# Presumably the intent is to control text normalization, but whether the
# tokenizer reads this attribute is not visible here (fast tokenizers appear
# to keep their normalizer on `backend_tokenizer`) — TODO confirm.
tokenizer.normalizer = BertNormalizer()

Split samples up to avoid truncation, then tokenize each sample.

In [None]:
def tokenize_text(text: str) -> tuple:
    '''Tokenizes text using chunks of up to 300 words. This does the exact
        same thing as encode_text(), but step by step.

    Consecutive chunks overlap by 50 words. Every chunk is wrapped in
    [CLS] ... [SEP], truncated to at most 512 tokens and padded with 0s
    to exactly 512, so all rows have the same length.

    Args:
        text: Whitespace-separated text of one post.

    Returns:
        numpy.ndarray: 2D array of IDs: axis0=chunks, axis1=tokens.
        numpy.ndarray: 2D array of attention masks: axis0=chunks, axis1=mask.
    '''

    chunk_words = 300   # words per chunk
    stride = 250        # spacing of chunk starts -> 50 overlapping words
    max_len = 512       # fixed row length, in tokens
    pad_id = 0          # padding ID; the attention mask keys on it

    words = text.split()

    # A chunk starting at `start` only contributes new words if the previous
    # chunk (which ends at start + overlap) stops short of the end of the
    # text. The first chunk is always kept, so empty text still yields a row.
    stop = max(len(words) - (chunk_words - stride), 1)
    chunks = [' '.join(words[start:start + chunk_words])
              for start in range(0, stop, stride)]

    rows = []
    for chunk in chunks:
        # reserve two slots for the special tokens so no row exceeds max_len
        pieces = tokenizer.tokenize(chunk)[:max_len - 2]
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + pieces + ['[SEP]'])
        # pad rows to be max_len tokens
        rows.append(ids + [pad_id] * (max_len - len(ids)))

    res = np.array(rows)

    # attention mask: 1 for real tokens, 0 for padding
    attention_mask = np.where(res != pad_id, 1, 0)

    return res, attention_mask

In [None]:
def encode_text(text):
    '''Tokenizes text using chunks of up to 300 words. This does the exact
        same thing as tokenize_text(), using the encode() method instead.

    Consecutive chunks overlap by 50 words. Every chunk is encoded with
    its special tokens, truncated to at most 512 tokens and padded with
    0s to exactly 512, so all rows have the same length.

    Args:
        text: Whitespace-separated text of one post.

    Returns:
        numpy.ndarray: 2D array of IDs: axis0=chunks, axis1=tokens.
        numpy.ndarray: 2D array of attention masks: axis0=chunks, axis1=mask.
    '''

    chunk_words = 300   # words per chunk
    stride = 250        # spacing of chunk starts -> 50 overlapping words
    max_len = 512       # fixed row length, in tokens
    pad_id = 0          # padding ID; the attention mask keys on it

    words = text.split()

    # A chunk starting at `start` only contributes new words if the previous
    # chunk (which ends at start + overlap) stops short of the end of the
    # text. The first chunk is always kept, so empty text still yields a row.
    stop = max(len(words) - (chunk_words - stride), 1)
    chunks = [' '.join(words[start:start + chunk_words])
              for start in range(0, stop, stride)]

    rows = []
    for chunk in chunks:
        # truncate here: 300 words can expand to more than max_len word pieces
        ids = tokenizer.encode(chunk, add_special_tokens=True,
                               truncation=True, max_length=max_len)
        # pad rows to be max_len tokens
        rows.append(ids + [pad_id] * (max_len - len(ids)))

    res = np.array(rows)

    # attention mask: 1 for real tokens, 0 for padding
    attention_mask = np.where(res != pad_id, 1, 0)

    return res, attention_mask

In [None]:
# encode every post, then split the (ids, mask) pairs into two columns
tokens = [encode_text(post) for post in df['post']]
df['tokens'] = [ids for ids, _ in tokens]
df['mask'] = [mask for _, mask in tokens]
df.head()

## Classification

I'm actually not using the BERT classifier anymore.

In [None]:
# Load the pretrained 'distilbert-base-uncased' model; embed_tokens() reads
# this module-level name.
bert_model = AutoModel.from_pretrained('distilbert-base-uncased')

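Get the class (`[CLS]`) embeddings. Each chunk from a sample gets one 768-dimensional embedding vector (768 is the model's hidden size, not the number of layers), taken from the first token position of the model output.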

In [None]:
def embed_tokens(sample):
    '''Embeds every chunk of one sample with bert_model.

    Args:
        sample: Row-like object with 'tokens' and 'mask' entries, each a
            2D array: axis0=chunks, axis1=tokens.

    Returns:
        numpy.ndarray: 2D array with one row per chunk, holding the
        first-position slice of the model's first output.
    '''

    # build the inputs on whatever device the model lives on, so this keeps
    # working if bert_model is moved to a GPU
    model_device = next(bert_model.parameters()).device

    tokens = torch.tensor(sample['tokens'], device=model_device)
    attention_mask = torch.tensor(sample['mask'], device=model_device)

    # inference only: no gradients needed
    with torch.no_grad():
        outputs = bert_model(tokens, attention_mask=attention_mask)

    # first output, first token position of every chunk;
    # .cpu() is a no-op on CPU and required before .numpy() on CUDA tensors
    embeddings = outputs[0][:, 0, :].cpu().numpy()

    return embeddings

This step takes a while.

In [None]:
# Call embed_tokens on each row (axis=1) and store the results in a new
# 'embed' column.
df['embed'] = df.progress_apply(embed_tokens, axis=1)

In [None]:
# preview the first rows of the frame, which now includes the 'embed' column
df.head()