### Keyword Clustering Feature Engineering
This notebook demonstrates how keyword clustering can be used as a form of feature engineering to improve classification results.

In [19]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from tqdm.notebook import tqdm
from sklearn.cluster import AffinityPropagation
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import fasttext as ft
import re
sw = stopwords.words('english')

In [2]:
ft_model = ft.load_model('C:/Users/Christian/Desktop/crawl-300d-2M-subword.bin')



In [3]:
df = pd.read_csv('res/bbc.csv')

In [4]:
df = df.groupby('label').sample(25)

In [5]:
def preprocess(text: str) -> list:
    tokens = []
    text = re.sub('["—;/()•,.\[\]\-\'0123456789]', ' ', text)
    for token in text.split():
        token = token.lower()
        if len(token) > 2 and token not in sw:
            tokens.append(token)
    
    return tokens

In [6]:
tokens = set([token for text in df.text.tolist() for token in preprocess(text)])

In [7]:
embeddings = np.array([ft_model[token] for token in tokens])

In [8]:
af = AffinityPropagation()
labels = af.fit_predict(embeddings)



In [9]:
max(labels)

1021

In [10]:
token_to_idx = {token: i for token, i in zip(tokens, labels)}

In [12]:
bag_of_ids = [
    ' '.join([str(token_to_idx[token]) for token in preprocess(text)]) for text in df.text.tolist()
]

In [26]:
# replace words with a cluster label so we can count vectorize
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(bag_of_ids)

In [27]:
y = df.label.values

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [29]:
clf = LinearSVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)



0.75

In [30]:
# with mean embedding vectors
X, y = tuple(
    map(
        np.array, 
        list(
            zip(
                *[
                    (
                        np.mean(
                            [ft_model[token] for token in preprocess(x.text)], 
                            axis=0
                        ),
                        x.label
                    ) for x in df.itertuples()
                ]
            )
        )
    )
)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
clf = LinearSVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6875

### NOTE
It is demonstrated that by first performing keyword clustering on the raw documents, the clusters can be used to induce features which improve the training objective as compared with the mean token embeddings.