In [9]:
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re
import nltk
nltk.download('popular')
from nltk.tokenize import word_tokenize
from language_detector import detect_language

import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
if sym_spell.word_count:
    pass
else:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# cohernece
from collections import Counter
from sklearn.metrics import silhouette_score
from gensim.models.coherencemodel import CoherenceModel    

# lda
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim import corpora
import gensim
import numpy as np
from datetime import datetime

# autoencoder
import keras
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to C:\Users\Devdatta
[nltk_data]    |     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to C:\Users\Devdatta
[nltk_data]    |     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to C:\Users\Devdatta
[nltk_data]    |     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to C:\Users\Devdatta
[nltk_data]    |     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to C:\Users\Devdatta
[nltk_data]    |     Supnekar\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_da

### Preprocessing

In [3]:
# preprocessing.py

###################################
#### sentence level preprocess ####
###################################

# lowercase + base filter
# some basic normalization
def f_base(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter)
    s = re.sub(r'([a-z])([A-Z])', r'\1\. \2', s)  # before lower case
    # normalization 2: lower case
    s = s.lower()
    # normalization 3: "&gt", "&lt"
    s = re.sub(r'&gt|&lt', ' ', s)
    # normalization 4: letter repetition (if more than 2)
    s = re.sub(r'([a-z])\1{2,}', r'\1', s)
    # normalization 5: non-word repetition (if more than 1)
    s = re.sub(r'([\W+])\1{1,}', r'\1', s)
    # normalization 6: string * as delimiter
    s = re.sub(r'\*|\W\*|\*\W', '. ', s)
    # normalization 7: stuff in parenthesis, assumed to be less informal
    s = re.sub(r'\(.*?\)', '. ', s)
    # normalization 8: xxx[?!]. -- > xxx.
    s = re.sub(r'\W+?\.', '.', s)
    # normalization 9: [.?!] --> [.?!] xxx
    s = re.sub(r'(\.|\?|!)(\w)', r'\1 \2', s)
    # normalization 10: ' ing ', noise text
    s = re.sub(r' ing ', ' ', s)
    # normalization 11: noise text
    s = re.sub(r'product received for free[.| ]', ' ', s)
    # normalization 12: phrase repetition
    s = re.sub(r'(.{2,}?)\1{1,}', r'\1', s)

    return s.strip()


# language detection
def f_lan(s):
    """
    :param s: string to be processed
    :return: boolean (s is English)
    """

    # some reviews are actually english but biased toward french
    return detect_language(s) in {'English', 'French','Spanish','Chinese'}


###############################
#### word level preprocess ####
###############################

# filtering out punctuations and numbers
def f_punct(w_list):
    """
    :param w_list: word list to be processed
    :return: w_list with punct and number filter out
    """
    return [word for word in w_list if word.isalpha()]


# selecting nouns
def f_noun(w_list):
    """
    :param w_list: word list to be processed
    :return: w_list with only nouns selected
    """
    return [word for (word, pos) in nltk.pos_tag(w_list) if pos[:2] == 'NN']


# typo correction
def f_typo(w_list):
    """
    :param w_list: word list to be processed
    :return: w_list with typo fixed by symspell. words with no match up will be dropped
    """
    w_list_fixed = []
    for word in w_list:
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3)
        if suggestions:
            w_list_fixed.append(suggestions[0].term)
        else:
            pass
            # do word segmentation, deprecated for inefficiency
            # w_seg = sym_spell.word_segmentation(phrase=word)
            # w_list_fixed.extend(w_seg.corrected_string.split())
    return w_list_fixed


# stemming if doing word-wise
p_stemmer = PorterStemmer()


def f_stem(w_list):
    """
    :param w_list: word list to be processed
    :return: w_list with stemming
    """
    return [p_stemmer.stem(word) for word in w_list]


# filtering out stop words
# create English stop words list

stop_words = (list(
    set(get_stop_words('en'))
    |set(get_stop_words('es'))
    |set(get_stop_words('de'))
    |set(get_stop_words('it'))
    |set(get_stop_words('ca'))
    #|set(get_stop_words('cy'))
    |set(get_stop_words('pt'))
    #|set(get_stop_words('tl'))
    |set(get_stop_words('pl'))
    #|set(get_stop_words('et'))
    |set(get_stop_words('da'))
    |set(get_stop_words('ru'))
    #|set(get_stop_words('so'))
    |set(get_stop_words('sv'))
    |set(get_stop_words('sk'))
    #|set(get_stop_words('cs'))
    |set(get_stop_words('nl'))
    #|set(get_stop_words('sl'))
    #|set(get_stop_words('no'))
    #|set(get_stop_words('zh-cn'))
))

def f_stopw(w_list):
    """
    filtering out stop words
    """
    return [word for word in w_list if word not in stop_words]


def preprocess_sent(rw):
    """
    Get sentence level preprocessed data from raw review texts
    :param rw: review to be processed
    :return: sentence level pre-processed review
    """
    s = f_base(rw)
    if not f_lan(s):
        return None
    return s


def preprocess_word(s):
    """
    Get word level preprocessed data from preprocessed sentences
    including: remove punctuation, select noun, fix typo, stem, stop_words
    :param s: sentence to be processed
    :return: word level pre-processed review
    """
    if not s:
        return None
    w_list = word_tokenize(s)
    w_list = f_punct(w_list)
    w_list = f_noun(w_list)
    w_list = f_typo(w_list)
    w_list = f_stem(w_list)
    w_list = f_stopw(w_list)

    return w_list

def preprocess(docs, samp_size=None):
    """
    Preprocess the data
    """
    if not samp_size:
        samp_size = 1000

    print('Preprocessing raw texts ...')
    n_docs = len(docs)
    sentences = []  # sentence level preprocessed
    token_lists = []  # word level preprocessed
    idx_in = []  # index of sample selected
    #     samp = list(range(100))
    samp = np.random.choice(n_docs, samp_size)
    for i, idx in enumerate(samp):
        sentence = preprocess_sent(docs[idx])
        token_list = preprocess_word(sentence)
        if token_list:
            idx_in.append(idx)
            sentences.append(sentence)
            token_lists.append(token_list)
        print('{} %'.format(str(np.round((i + 1) / len(samp) * 100, 2))), end='\r')
    print('Preprocessing raw texts. Done!')
    return sentences, token_lists, idx_in


### Utuility 

In [4]:
# utils.py

def get_topic_words(token_lists, labels, k=None):
    """
    get top words within each topic from clustering results
    """
    if k is None:
        k = len(np.unique(labels))
    topics = ['' for _ in range(k)]
    for i, c in enumerate(token_lists):
        topics[labels[i]] += (' ' + ' '.join(c))
    word_counts = list(map(lambda x: Counter(x.split()).items(), topics))
    # get sorted word counts
    word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts))
    # get topics
    topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts))
    return topics

def get_coherence(model, token_lists, measure='c_v'):
    """
    Get model coherence from gensim.models.coherencemodel
    :param model: Topic_Model object
    :param token_lists: token lists of docs
    :param topics: topics as top words
    :param measure: coherence metrics
    :return: coherence score
    """
    if model.method == 'LDA':
        cm = CoherenceModel(model=model.ldamodel, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary,
                            coherence=measure)
    else:
        topics = get_topic_words(token_lists, model.cluster_model.labels_)
        cm = CoherenceModel(topics=topics, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary,
                            coherence=measure)
    return cm.get_coherence()

def get_silhouette(model):
    """
    Get silhouette score from model
    :param model: Topic_Model object
    :return: silhouette score
    """
    if model.method == 'LDA':
        return
    lbs = model.cluster_model.labels_
    vec = model.vec[model.method]
    return silhouette_score(vec, lbs)

### Autoencoder

In [5]:


class Autoencoder:
    """
    Autoencoder for learning latent space representation
    architecture simplified for only one hidden layer
    """

    def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128):
        self.latent_dim = latent_dim
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.autoencoder = None
        self.encoder = None
        self.decoder = None
        self.his = None

    def _compile(self, input_dim):
        """
        compile the computational graph
        """
        input_vec = Input(shape=(input_dim,))
        encoded = Dense(self.latent_dim, activation=self.activation)(input_vec)
        decoded = Dense(input_dim, activation=self.activation)(encoded)
        self.autoencoder = Model(input_vec, decoded)
        self.encoder = Model(input_vec, encoded)
        encoded_input = Input(shape=(self.latent_dim,))
        decoder_layer = self.autoencoder.layers[-1]
        self.decoder = Model(encoded_input, self.autoencoder.layers[-1](encoded_input))
        self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error)

    def fit(self, X):
        if not self.autoencoder:
            self._compile(X.shape[1])
        X_train, X_test = train_test_split(X)
        self.his = self.autoencoder.fit(X_train, X_train,
                                        epochs=self.epochs,
                                        batch_size=self.batch_size,
                                        shuffle=True,
                                        validation_data=(X_test, X_test), verbose=0)
 

### BASE TM

In [6]:
### base tm


# define model object
class Topic_Model:
    def __init__(self, k=10, method='TFIDF'):
        """
        :param k: number of topics
        :param method: method chosen for the topic model
        """
        if method not in {'TFIDF', 'LDA', 'BERT', 'LDA_BERT'}:
            raise Exception('Invalid method!')
        self.k = k
        self.dictionary = None
        self.corpus = None
        #self.stopwords = None
        self.cluster_model = None
        self.ldamodel = None
        self.vec = {}
        self.gamma = 15  # parameter for relative importance of lda
        self.method = method
        self.AE = None
        self.id = method + '_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    def vectorize(self, sentences, token_lists, method=None):
        """
        Get vector representations from selected methods
        """
        # Default method
        if method is None:
            method = self.method

        # turn tokenized documents into a id <-> term dictionary
        self.dictionary = corpora.Dictionary(token_lists)
        # convert tokenized documents into a document-term matrix
        self.corpus = [self.dictionary.doc2bow(text) for text in token_lists]

        if method == 'TFIDF':
            print('Getting vector representations for TF-IDF ...')
            tfidf = TfidfVectorizer()
            vec = tfidf.fit_transform(sentences)
            print('Getting vector representations for TF-IDF. Done!')
            return vec

        elif method == 'LDA':
            print('Getting vector representations for LDA ...')
            if not self.ldamodel:
                self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary,
                                                                passes=20)

            def get_vec_lda(model, corpus, k):
                """
                Get the LDA vector representation (probabilistic topic assignments for all documents)
                :return: vec_lda with dimension: (n_doc * n_topic)
                """
                n_doc = len(corpus)
                vec_lda = np.zeros((n_doc, k))
                for i in range(n_doc):
                    # get the distribution for the i-th document in corpus
                    for topic, prob in model.get_document_topics(corpus[i]):
                        vec_lda[i, topic] = prob

                return vec_lda

            vec = get_vec_lda(self.ldamodel, self.corpus, self.k)
            print('Getting vector representations for LDA. Done!')
            return vec

        elif method == 'BERT':

            print('Getting vector representations for BERT ...')
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('bert-base-nli-max-tokens')
            vec = np.array(model.encode(sentences, show_progress_bar=True))
            print('Getting vector representations for BERT. Done!')
            return vec


        elif method == 'LDA_BERT':
            # else:
            vec_lda = self.vectorize(sentences, token_lists, method='LDA')
            vec_bert = self.vectorize(sentences, token_lists, method='BERT')
            vec_ldabert = np.c_[vec_lda * self.gamma, vec_bert]
            self.vec['LDA_BERT_FULL'] = vec_ldabert
            if not self.AE:
                self.AE = Autoencoder()
                print('Fitting Autoencoder ...')
                self.AE.fit(vec_ldabert)
                print('Fitting Autoencoder Done!')
            vec = self.AE.encoder.predict(vec_ldabert)
            return vec

    def fit(self, sentences, token_lists, method=None, m_clustering=None):
        """
        Fit the topic model for selected method given the preprocessed data
        :docs: list of documents, each doc is preprocessed as tokens
        :return:
        """
        # Default method
        if method is None:
            method = self.method
        # Default clustering method
        if m_clustering is None:
            m_clustering = KMeans

        # turn tokenized documents into a id <-> term dictionary
        if not self.dictionary:
            self.dictionary = corpora.Dictionary(token_lists)
            # convert tokenized documents into a document-term matrix
            self.corpus = [self.dictionary.doc2bow(text) for text in token_lists]

        ####################################################
        #### Getting ldamodel or vector representations ####
        ####################################################

        if method == 'LDA':
            if not self.ldamodel:
                print('Fitting LDA ...')
                self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary,
                                                                passes=20)
                print('Fitting LDA Done!')
        else:
            print('Clustering embeddings ...')
            self.cluster_model = m_clustering(self.k)
            self.vec[method] = self.vectorize(sentences, token_lists, method)
            self.cluster_model.fit(self.vec[method])
            print('Clustering embeddings. Done!')

    def predict(self, sentences, token_lists, out_of_sample=None):
        """
        Predict topics for new_documents
        """
        # Default as False
        out_of_sample = out_of_sample is not None

        if out_of_sample:
            corpus = [self.dictionary.doc2bow(text) for text in token_lists]
            if self.method != 'LDA':
                vec = self.vectorize(sentences, token_lists)
                print(vec)
        else:
            corpus = self.corpus
            vec = self.vec.get(self.method, None)

        if self.method == "LDA":
            lbs = np.array(list(map(lambda x: sorted(self.ldamodel.get_document_topics(x),
                                                     key=lambda x: x[1], reverse=True)[0][0],
                                    corpus)))
        else:
            lbs = self.cluster_model.predict(vec)
        return lbs

### main

In [10]:
input_data_path = "data/bbc-text.csv"
method ='LDA_BERT'
ntopic = 10
save=True, 
output_data_path = "output/bbc-text_reviews.csv"

In [11]:
documents = pd.read_csv(input_data_path)
# keeping only the first two columns
documents = documents.iloc[:100, 0:2]

In [12]:
documents.head()

Unnamed: 0,index,text
0,1,tv future in the hands of viewers with home th...
1,2,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,4,yeading face newcastle in fa cup premiership s...
4,5,ocean s twelve raids box office ocean s twelve...


In [13]:
documents.isna().sum()

index    0
text     0
dtype: int64

In [14]:
documents.dropna(inplace=True)
samp_size = len(documents)

In [15]:
samp_size

100

In [16]:
print(documents.columns)
documents.columns = ['index', 'abstract']
print(documents.columns)
documents.abstract = documents.abstract.astype(str)

Index(['index', 'text'], dtype='object')
Index(['index', 'abstract'], dtype='object')


In [17]:
rws = documents.abstract

In [18]:
rws[:2]

0    tv future in the hands of viewers with home th...
1    worldcom boss  left books alone  former worldc...
Name: abstract, dtype: object

In [19]:
sentences, token_lists, idx_in = preprocess(rws, samp_size=samp_size)

Preprocessing raw texts ...
Preprocessing raw texts. Done!


In [22]:
print(sentences[0])
print()
print(token_lists[0])
print()
print(idx_in[0])

labour targets hardcore truants a fresh crackdown on persistent truants in england has been launched by education secretary ruth kelly. serial truants make up one in 13 pupils. previous initiatives brought 40 000 pupils back to school since 1997 according to official statistics. parenting contracts penalty notices and fast track prosecution systems have been used to tackle what has been a stubborn problem. it is thought that nearly half a million children skip school each day. tories say labour s previous success regarding the issue came because it tackled the easy part of the problem by reducing authorised absence where parents are permitted to take children out of school. such absences are often due to family holidays. however serial truants avoid the classroom despite government schemes costing ? 885m. those missing classes are more likely to become involved in crime as well as failing academically. measures such as parenting contracts and penalty notices were adopted by most local 

In [23]:
 tm = Topic_Model(k=ntopic, method=method)

In [24]:
# Fit the topic model by chosen method
tm.fit(sentences, token_lists)

Clustering embeddings ...
Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
Getting vector representations for BERT ...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 13/13 [00:25<00:00,  1.97s/it]


Getting vector representations for BERT. Done!
Fitting Autoencoder ...






Fitting Autoencoder Done!
Clustering embeddings. Done!


In [25]:
print(tm.k)

10


In [26]:
print(tm.cluster_model)

KMeans(n_clusters=10)


In [27]:
print(tm.method)

LDA_BERT


In [28]:
print(tm.ldamodel)

LdaModel(num_terms=1957, num_topics=10, decay=0.5, chunksize=2000)


In [31]:
print(tm.corpus[0])

[(0, 2), (1, 1), (2, 3), (3, 3), (4, 2), (5, 1), (6, 2), (7, 2), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 6), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2), (30, 1), (31, 3), (32, 1), (33, 1), (34, 3), (35, 1), (36, 1), (37, 3), (38, 1), (39, 1), (40, 3), (41, 1), (42, 1), (43, 5), (44, 1), (45, 1), (46, 1), (47, 9), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 1), (60, 1), (61, 1), (62, 1), (63, 3), (64, 5), (65, 1), (66, 1), (67, 1)]


In [29]:
def get_vec_lda(model, corpus, k):
    """
    Get the LDA vector representation (probabilistic topic assignments for all documents)
    :return: vec_lda with dimension: (n_doc * n_topic)
    """
    n_doc = len(corpus)
    vec_lda = np.zeros((n_doc, k))
    for i in range(n_doc):
        # get the distribution for the i-th document in corpus
        for topic, prob in model.get_document_topics(corpus[i]):
            vec_lda[i, topic] = prob

    return vec_lda

In [38]:
lda_vec = get_vec_lda(tm.ldamodel, tm.corpus, tm.k)
print('Getting vector representations for LDA. Done!')

Getting vector representations for LDA. Done!


In [39]:
lda_vec[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.99203366, 0.        , 0.        ])

In [41]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-max-tokens')
bert_vec  = np.array(model.encode(sentences, show_progress_bar=True))

Batches: 100%|█████████████████████████████████████████████████████████████████████████| 13/13 [00:24<00:00,  1.89s/it]


In [45]:
len(bert_vec[2])

768

In [42]:
bert_vec[0]

array([ 1.61987841e-01,  6.14772618e-01,  7.95406044e-01,  2.89551347e-01,
        4.30810064e-01,  3.85483772e-01,  1.79240155e+00,  4.46105599e-01,
        9.03564095e-01,  2.38364935e-01,  5.34198403e-01,  5.93950748e-01,
        1.26073563e+00,  2.42247999e-01, -9.56908166e-02,  3.09282899e-01,
        1.45288455e+00,  6.46177351e-01,  6.98070168e-01,  6.96181595e-01,
        5.99174261e-01, -1.50477827e-01,  1.62503028e+00,  8.16035628e-01,
        2.07537675e+00,  1.52503550e+00,  7.04982162e-01,  1.07531130e+00,
        1.99210346e-01,  1.08706087e-01,  3.83886695e-01,  1.38526356e+00,
        7.06467927e-01,  9.07281339e-01,  1.74387181e+00,  1.10470212e+00,
        5.97168744e-01,  5.31288207e-01,  8.49426985e-01,  2.58426331e-02,
        1.86572206e+00,  4.22031850e-01, -1.29834026e-01, -3.32735777e-01,
        4.11015689e-01, -2.63165627e-02,  1.07907951e-02,  1.12351251e+00,
        9.35332000e-01,  2.25399375e-01,  3.45867909e-02,  1.51082242e+00,
        1.39099312e+00, -

In [46]:
len(bert_vec) # 100 * 768

100

In [48]:
len(lda_vec)# 100 * 10 (topic)

100

In [50]:
vec_ldabert = np.c_[lda_vec * tm.gamma, bert_vec]

In [51]:
vec_ldabert[0] 

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.48805049e+01,
        0.00000000e+00,  0.00000000e+00,  1.61987841e-01,  6.14772618e-01,
        7.95406044e-01,  2.89551347e-01,  4.30810064e-01,  3.85483772e-01,
        1.79240155e+00,  4.46105599e-01,  9.03564095e-01,  2.38364935e-01,
        5.34198403e-01,  5.93950748e-01,  1.26073563e+00,  2.42247999e-01,
       -9.56908166e-02,  3.09282899e-01,  1.45288455e+00,  6.46177351e-01,
        6.98070168e-01,  6.96181595e-01,  5.99174261e-01, -1.50477827e-01,
        1.62503028e+00,  8.16035628e-01,  2.07537675e+00,  1.52503550e+00,
        7.04982162e-01,  1.07531130e+00,  1.99210346e-01,  1.08706087e-01,
        3.83886695e-01,  1.38526356e+00,  7.06467927e-01,  9.07281339e-01,
        1.74387181e+00,  1.10470212e+00,  5.97168744e-01,  5.31288207e-01,
        8.49426985e-01,  2.58426331e-02,  1.86572206e+00,  4.22031850e-01,
       -1.29834026e-01, -

In [52]:
len(vec_ldabert[0]) 

778

In [53]:
tm.vec.keys()

dict_keys(['LDA_BERT_FULL', 'LDA_BERT'])

In [55]:
len(tm.vec["LDA_BERT_FULL"][0])

778

In [56]:
# autoencoder output
len(tm.vec["LDA_BERT"][0])

32

In [57]:
len(tm.vec["LDA_BERT"])

100

In [58]:
tm.vec["LDA_BERT"][0]

array([3.453227 , 0.       , 7.9297504, 4.522833 , 0.       , 2.7703547,
       3.0376356, 5.244283 , 3.9629917, 5.554574 , 1.8028933, 1.9948114,
       5.202819 , 0.       , 4.646662 , 0.       , 0.       , 2.657138 ,
       2.3739424, 0.       , 0.       , 5.9313416, 0.       , 2.3671377,
       3.8265424, 2.8723917, 8.546598 , 3.8753705, 2.2664616, 0.       ,
       4.145443 , 2.8545773], dtype=float32)

In [59]:
tm.method

'LDA_BERT'

In [61]:
 # Evaluate using metrics
if tm.method == "LDA":
    topics = []
    for topic in tm.ldamodel.print_topics(num_topics=tm.ntopic):
        topic_list = [re.search(r"[a-zA-z]+", w).group(0) for w in topic[1].split('+')]
        topics.append(topic_list)
else:
    topics = get_topic_words(token_lists, tm.cluster_model.labels_)
    
    

In [62]:
topics

[['year',
  'govern',
  'film',
  'life',
  'club',
  'music',
  'share',
  'parti',
  'money',
  'drug'],
 ['technolog',
  'network',
  'panda',
  'sound',
  'audio',
  'phone',
  'world',
  'research',
  'game',
  'reserv'],
 ['peopl',
  'law',
  'terror',
  'power',
  'right',
  'blog',
  'clark',
  'hous',
  'evid',
  'viru'],
 ['game',
  'ireland',
  'england',
  'tri',
  'ball',
  'compani',
  'jersey',
  'coach',
  'way',
  'refere'],
 ['price',
  'growth',
  'quarter',
  'year',
  'economi',
  'number',
  'sequel',
  'singapor',
  'hous',
  'effect'],
 ['peopl',
  'film',
  'ranger',
  'gorg',
  'project',
  'children',
  'wale',
  'health',
  'ferguson',
  'construct'],
 ['player',
  'coach',
  'connor',
  'relationship',
  'kid',
  'week',
  'wilkinson',
  'number',
  'jimmi',
  'practic'],
 ['break',
  'robinson',
  'leagu',
  'dodgson',
  'england',
  'winter',
  'player',
  'australia',
  'britain',
  'premiership'],
 ['parti',
  'answer',
  'lord',
  'hagu',
  'goldsmith'

In [63]:
tm.cluster_model.labels_

array([0, 3, 3, 5, 0, 4, 1, 0, 6, 3, 4, 8, 0, 0, 4, 0, 7, 0, 1, 3, 7, 0,
       0, 3, 9, 9, 6, 5, 2, 2, 0, 1, 0, 0, 1, 4, 4, 6, 0, 0, 4, 3, 4, 6,
       2, 2, 0, 2, 3, 5, 4, 5, 4, 6, 0, 3, 0, 5, 9, 7, 8, 6, 0, 6, 7, 8,
       7, 0, 3, 8, 0, 5, 4, 5, 0, 2, 5, 0, 1, 5, 9, 0, 0, 0, 6, 0, 1, 9,
       1, 0, 2, 9, 7, 5, 5, 2, 7, 0, 8, 8])

In [67]:
len(tm.cluster_model.labels_)

100

In [None]:
def get_topic_words(token_lists, labels, k=None):
    """
    get top words within each topic from clustering results
    """
    if k is None:
        k = len(np.unique(labels))
    topics = ['' for _ in range(k)]
    for i, c in enumerate(token_lists):
        topics[labels[i]] += (' ' + ' '.join(c))
    word_counts = list(map(lambda x: Counter(x.split()).items(), topics))
    # get sorted word counts
    word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts))
    # get topics
    topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts))
    return topics

In [64]:
k = len(np.unique(tm.cluster_model.labels_))
print(k)

10


In [65]:
a = ['' for _ in range(k)]

In [66]:
a

['', '', '', '', '', '', '', '', '', '']

In [71]:
token_lists[:2]

[['target',
  'truant',
  'crackdown',
  'truant',
  'england',
  'educ',
  'secretari',
  'ruth',
  'truant',
  'pupil',
  'initi',
  'pupil',
  'school',
  'statist',
  'contract',
  'penalti',
  'notic',
  'track',
  'prosecut',
  'system',
  'problem',
  'children',
  'school',
  'day',
  'tori',
  'success',
  'issu',
  'part',
  'problem',
  'absenc',
  'parent',
  'children',
  'school',
  'absenc',
  'famili',
  'holiday',
  'truant',
  'classroom',
  'govern',
  'scheme',
  'class',
  'crime',
  'measur',
  'contract',
  'penalti',
  'notic',
  'educ',
  'author',
  'term',
  'forc',
  'remaind',
  'term',
  'educ',
  'author',
  'parent',
  'penalti',
  'notic',
  'child',
  'attend',
  'stanc',
  'attend',
  'case',
  'truanci',
  'hit',
  'school',
  'govern',
  'improv',
  'programm',
  'equival',
  'pupil',
  'class',
  'measur',
  'top',
  'truanci',
  'sixth',
  'place',
  'polic',
  'educ',
  'welfar',
  'offic',
  'problem',
  'hotshot',
  'truant',
  'school',
  'yea

In [75]:
a = ['' for _ in range(k)]
for i, c in enumerate(token_lists):
        a[tm.cluster_model.labels_[i]] += (' ' + ' '.join(c))

In [77]:
type(a)

list

In [79]:
len(a)

10

In [80]:
word_counts = list(map(lambda x: Counter(x.split()).items(), a))

In [81]:
word_counts

[dict_items([('target', 1), ('truant', 5), ('crackdown', 1), ('england', 4), ('educ', 10), ('secretari', 4), ('ruth', 1), ('pupil', 5), ('initi', 1), ('school', 13), ('statist', 1), ('contract', 2), ('penalti', 3), ('notic', 3), ('track', 1), ('prosecut', 1), ('system', 4), ('problem', 12), ('children', 3), ('day', 7), ('tori', 2), ('success', 8), ('issu', 2), ('part', 16), ('absenc', 2), ('parent', 5), ('famili', 6), ('holiday', 1), ('classroom', 1), ('govern', 37), ('scheme', 3), ('class', 4), ('crime', 2), ('measur', 4), ('author', 4), ('term', 9), ('forc', 2), ('remaind', 1), ('child', 4), ('attend', 3), ('stanc', 2), ('case', 7), ('truanci', 3), ('hit', 3), ('improv', 1), ('programm', 5), ('equival', 1), ('top', 6), ('sixth', 1), ('place', 7), ('polic', 1), ('welfar', 1), ('offic', 4), ('hotshot', 1), ('year', 40), ('addit', 4), ('patrol', 2), ('sweep', 1), ('youngster', 1), ('depart', 2), ('skill', 1), ('sourc', 1), ('count', 1), ('figur', 5), ('tool', 1), ('number', 7), ('half',

In [82]:
word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts))
word_counts

[[('year', 40),
  ('govern', 37),
  ('film', 36),
  ('life', 27),
  ('club', 26),
  ('music', 26),
  ('share', 24),
  ('parti', 21),
  ('money', 20),
  ('drug', 20),
  ('compani', 19),
  ('invest', 19),
  ('market', 19),
  ('firm', 18),
  ('peopl', 18),
  ('sioux', 18),
  ('part', 16),
  ('stock', 16),
  ('time', 15),
  ('fund', 15),
  ('award', 15),
  ('polici', 15),
  ('takeov', 14),
  ('pm', 14),
  ('school', 13),
  ('deal', 13),
  ('heart', 13),
  ('rate', 13),
  ('problem', 12),
  ('bate', 12),
  ('leed', 12),
  ('stake', 12),
  ('director', 12),
  ('power', 12),
  ('elect', 12),
  ('risk', 12),
  ('side', 12),
  ('week', 11),
  ('countri', 11),
  ('herman', 11),
  ('howard', 11),
  ('educ', 10),
  ('analyst', 10),
  ('car', 10),
  ('merck', 10),
  ('term', 9),
  ('board', 9),
  ('month', 9),
  ('debut', 9),
  ('sale', 9),
  ('world', 9),
  ('way', 9),
  ('match', 9),
  ('pension', 9),
  ('festiv', 9),
  ('success', 8),
  ('chelsea', 8),
  ('peter', 8),
  ('need', 8),
  ('donat', 

In [83]:
len(word_counts)

10

In [84]:
word_counts[0]

[('year', 40),
 ('govern', 37),
 ('film', 36),
 ('life', 27),
 ('club', 26),
 ('music', 26),
 ('share', 24),
 ('parti', 21),
 ('money', 20),
 ('drug', 20),
 ('compani', 19),
 ('invest', 19),
 ('market', 19),
 ('firm', 18),
 ('peopl', 18),
 ('sioux', 18),
 ('part', 16),
 ('stock', 16),
 ('time', 15),
 ('fund', 15),
 ('award', 15),
 ('polici', 15),
 ('takeov', 14),
 ('pm', 14),
 ('school', 13),
 ('deal', 13),
 ('heart', 13),
 ('rate', 13),
 ('problem', 12),
 ('bate', 12),
 ('leed', 12),
 ('stake', 12),
 ('director', 12),
 ('power', 12),
 ('elect', 12),
 ('risk', 12),
 ('side', 12),
 ('week', 11),
 ('countri', 11),
 ('herman', 11),
 ('howard', 11),
 ('educ', 10),
 ('analyst', 10),
 ('car', 10),
 ('merck', 10),
 ('term', 9),
 ('board', 9),
 ('month', 9),
 ('debut', 9),
 ('sale', 9),
 ('world', 9),
 ('way', 9),
 ('match', 9),
 ('pension', 9),
 ('festiv', 9),
 ('success', 8),
 ('chelsea', 8),
 ('peter', 8),
 ('need', 8),
 ('donat', 8),
 ('report', 8),
 ('state', 8),
 ('level', 8),
 ('seed', 

In [85]:
topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts))
topics

[['year',
  'govern',
  'film',
  'life',
  'club',
  'music',
  'share',
  'parti',
  'money',
  'drug'],
 ['technolog',
  'network',
  'panda',
  'sound',
  'audio',
  'phone',
  'world',
  'research',
  'game',
  'reserv'],
 ['peopl',
  'law',
  'terror',
  'power',
  'right',
  'blog',
  'clark',
  'hous',
  'evid',
  'viru'],
 ['game',
  'ireland',
  'england',
  'tri',
  'ball',
  'compani',
  'jersey',
  'coach',
  'way',
  'refere'],
 ['price',
  'growth',
  'quarter',
  'year',
  'economi',
  'number',
  'sequel',
  'singapor',
  'hous',
  'effect'],
 ['peopl',
  'film',
  'ranger',
  'gorg',
  'project',
  'children',
  'wale',
  'health',
  'ferguson',
  'construct'],
 ['player',
  'coach',
  'connor',
  'relationship',
  'kid',
  'week',
  'wilkinson',
  'number',
  'jimmi',
  'practic'],
 ['break',
  'robinson',
  'leagu',
  'dodgson',
  'england',
  'winter',
  'player',
  'australia',
  'britain',
  'premiership'],
 ['parti',
  'answer',
  'lord',
  'hagu',
  'goldsmith'

In [86]:
labels = tm.predict(sentences, token_lists)

In [87]:
labels

array([0, 3, 3, 5, 0, 4, 1, 0, 6, 3, 4, 8, 0, 0, 4, 0, 7, 0, 1, 3, 7, 0,
       0, 3, 9, 9, 6, 5, 2, 2, 0, 1, 0, 0, 1, 4, 4, 6, 0, 0, 4, 3, 4, 6,
       2, 2, 0, 2, 3, 5, 4, 5, 4, 6, 0, 3, 0, 5, 9, 7, 8, 6, 0, 6, 7, 8,
       7, 0, 3, 8, 0, 5, 4, 5, 0, 2, 5, 0, 1, 5, 9, 0, 0, 0, 6, 0, 1, 9,
       1, 0, 2, 9, 7, 5, 5, 2, 7, 0, 8, 8])

In [None]:
def predict(self, sentences, token_lists, out_of_sample=None):
        """
        Predict topics for new_documents
        """
        # Default as False
        out_of_sample = out_of_sample is not None

        if out_of_sample:
            corpus = [self.dictionary.doc2bow(text) for text in token_lists]
            if self.method != 'LDA':
                vec = self.vectorize(sentences, token_lists)
                print(vec)
        else:
            corpus = self.corpus
            vec = self.vec.get(self.method, None)

        if self.method == "LDA":
            lbs = np.array(list(map(lambda x: sorted(self.ldamodel.get_document_topics(x),
                                                     key=lambda x: x[1], reverse=True)[0][0],
                                    corpus)))
        else:
            lbs = self.cluster_model.predict(vec)
        return lbs

In [89]:
out_of_sample=None

In [90]:
out_of_sample = out_of_sample is not None

In [91]:
out_of_sample

False

In [92]:
tm.method

'LDA_BERT'

In [93]:
vec = tm.vec.get(tm.method, None)

In [95]:
len(vec)

100

In [96]:
len(vec[0])

32

In [None]:

def get_coherence(model, token_lists, measure='c_v'):
    """
    Get model coherence from gensim.models.coherencemodel
    :param model: Topic_Model object
    :param token_lists: token lists of docs
    :param topics: topics as top words
    :param measure: coherence metrics
    :return: coherence score
    """
    if model.method == 'LDA':
        cm = CoherenceModel(model=model.ldamodel, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary,
                            coherence=measure)
    else:
        topics = get_topic_words(token_lists, model.cluster_model.labels_)
        cm = CoherenceModel(topics=topics, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary,
                            coherence=measure)
    return cm.get_coherence()

def get_silhouette(model):
    """
    Get silhouette score from model
    :param model: Topic_Model object
    :return: silhouette score
    """
    if model.method == 'LDA':
        return
    lbs = model.cluster_model.labels_
    vec = model.vec[model.method]
    return silhouette_score(vec, lbs)

In [98]:

c = get_coherence(tm, token_lists, 'c_v')
s = get_silhouette(tm)
print('Number of Topics:', ntopic)
print('Coherence:', c)
print('Silhouette Score:', s)

Number of Topics: 10
Coherence: 0.45509118401342763
Silhouette Score: 0.54595


In [99]:
documents.head()

Unnamed: 0,index,abstract
0,1,tv future in the hands of viewers with home th...
1,2,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,4,yeading face newcastle in fa cup premiership s...
4,5,ocean s twelve raids box office ocean s twelve...


In [100]:
documents.shape

(100, 2)

In [101]:
predictions = documents.copy()
predictions['topic_no'] = np.NaN

In [102]:
predictions.head()

Unnamed: 0,index,abstract,topic_no
0,1,tv future in the hands of viewers with home th...,
1,2,worldcom boss left books alone former worldc...,
2,3,tigers wary of farrell gamble leicester say ...,
3,4,yeading face newcastle in fa cup premiership s...,
4,5,ocean s twelve raids box office ocean s twelve...,


In [103]:
idx_in

[40,
 97,
 97,
 54,
 22,
 71,
 44,
 64,
 2,
 19,
 11,
 92,
 38,
 7,
 62,
 43,
 77,
 7,
 44,
 50,
 61,
 45,
 66,
 95,
 60,
 78,
 57,
 87,
 28,
 16,
 3,
 24,
 74,
 22,
 24,
 23,
 71,
 17,
 84,
 74,
 4,
 80,
 42,
 17,
 76,
 68,
 41,
 20,
 50,
 9,
 4,
 87,
 71,
 8,
 84,
 80,
 93,
 91,
 78,
 49,
 85,
 37,
 75,
 17,
 61,
 92,
 49,
 52,
 97,
 55,
 59,
 65,
 96,
 54,
 84,
 20,
 10,
 86,
 44,
 30,
 60,
 64,
 52,
 66,
 8,
 5,
 44,
 83,
 24,
 59,
 28,
 78,
 77,
 48,
 70,
 68,
 61,
 3,
 47,
 85]

In [104]:
labels

array([0, 3, 3, 5, 0, 4, 1, 0, 6, 3, 4, 8, 0, 0, 4, 0, 7, 0, 1, 3, 7, 0,
       0, 3, 9, 9, 6, 5, 2, 2, 0, 1, 0, 0, 1, 4, 4, 6, 0, 0, 4, 3, 4, 6,
       2, 2, 0, 2, 3, 5, 4, 5, 4, 6, 0, 3, 0, 5, 9, 7, 8, 6, 0, 6, 7, 8,
       7, 0, 3, 8, 0, 5, 4, 5, 0, 2, 5, 0, 1, 5, 9, 0, 0, 0, 6, 0, 1, 9,
       1, 0, 2, 9, 7, 5, 5, 2, 7, 0, 8, 8])

In [106]:
count = 0
for idx in idx_in:
    predictions['topic_no'][idx] = labels[count]
    count = count + 1

In [107]:
predictions.head()

Unnamed: 0,index,abstract,topic_no
0,1,tv future in the hands of viewers with home th...,
1,2,worldcom boss left books alone former worldc...,
2,3,tigers wary of farrell gamble leicester say ...,6.0
3,4,yeading face newcastle in fa cup premiership s...,0.0
4,5,ocean s twelve raids box office ocean s twelve...,4.0


In [108]:
predictions["topic_no"].value_counts()

0.0    18
5.0     9
4.0     7
3.0     5
2.0     5
6.0     5
8.0     4
7.0     3
9.0     3
1.0     2
Name: topic_no, dtype: int64

In [109]:
predictions.isna().sum()

index        0
abstract     0
topic_no    39
dtype: int64

In [116]:
mapped_predictions = predictions.groupby('topic_no')['abstract'].apply(list)

In [117]:
mapped_predictions

topic_no
0.0    [yeading face newcastle in fa cup premiership ...
1.0    [mobile audio enters new dimension as mobile p...
2.0    [howard backs stem cell research michael howar...
3.0    [games maker fights for survival one of britai...
4.0    [ocean s twelve raids box office ocean s twelv...
5.0    [last star wars  not for children  the sixth a...
6.0    [tigers wary of farrell  gamble  leicester say...
7.0    [stock market eyes japan recovery japanese sha...
8.0    [campaign  cold calls  questioned labour and t...
9.0    [telegraph newspapers axe 90 jobs the daily an...
Name: abstract, dtype: object

In [118]:
mapped_predictions = mapped_predictions.reset_index()
mapped_predictions['keywords_list'] = topics

In [119]:
mapped_predictions.head()

Unnamed: 0,topic_no,abstract,keywords_list
0,0.0,[yeading face newcastle in fa cup premiership ...,"[year, govern, film, life, club, music, share,..."
1,1.0,[mobile audio enters new dimension as mobile p...,"[technolog, network, panda, sound, audio, phon..."
2,2.0,[howard backs stem cell research michael howar...,"[peopl, law, terror, power, right, blog, clark..."
3,3.0,[games maker fights for survival one of britai...,"[game, ireland, england, tri, ball, compani, j..."
4,4.0,[ocean s twelve raids box office ocean s twelv...,"[price, growth, quarter, year, economi, number..."


In [121]:
len(topics)

10