# Topic modelling with KMeans

In [None]:
import numpy  as np
import pandas as pd
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS


import nltk
# to update the package
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import matplotlib.pyplot as plt
%matplotlib inline


# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from the internal `IPython.core.display` module is deprecated.
from IPython.display import display, HTML

# Stretch the notebook's `.container` element to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
# watermark is optional - it shows the versions of installed libraries
# so it is useful to confirm your library versions when you submit bug reports to projects
# install watermark using
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
# show a watermark for this environment
%watermark -d -m -v -p numpy,matplotlib,sklearn -g

In [None]:
def load_dfs(word_list, date):
    """Load one CSV file per word and stack them into a single DataFrame.

    Every file is read from ``<date>/<word>.csv`` using ISO-8859-1 encoding
    and receives an extra ``word`` column holding the word it was loaded for.

    Parameters
    ----------
    word_list : iterable of str
        File stems (without the ``.csv`` extension) to load.
    date : str
        Directory that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files in ``word_list`` order, re-indexed 0..n-1.
        An empty DataFrame when ``word_list`` is empty.
    """
    frames = []
    for word in word_list:
        # A plain '/' is accepted as a path separator on every platform; the
        # previous '\/' literal inserted a stray backslash into the path.
        frame = pd.read_csv(date + '/' + word + '.csv', encoding='ISO-8859-1')
        frame['word'] = word
        frames.append(frame)

    # pd.concat refuses an empty list, so keep returning an empty frame.
    if not frames:
        return pd.DataFrame()

    # One concat replaces repeated DataFrame.append calls (removed in
    # pandas 2.0); ignore_index renumbers the rows like reset_index(drop=True).
    return pd.concat(frames, ignore_index=True)

def lemmatize_stemming(text):
    """Return the stem of the verb lemma of ``text``.

    The token is first lemmatized as a verb (``pos='v'``) with a
    ``WordNetLemmatizer``; the lemma is then stemmed with the module-level
    ``stemmer``, which must be defined before this function is called.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)


def preprocess(text):
    """Turn a raw text into a list of lemmatized and stemmed tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stopwords or have three characters or fewer are dropped, and
    every remaining token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]


# Load data

In [None]:
date               = '2019-10-15'
example            = 3450
word_list          = ['furniture', 'homedecor', 'interiordesign']

# Keep only the tweet text. Missing texts become empty strings rather than the
# number 0, so every value stays a string for the `.split(' ')` call and the
# text preprocessing applied later in this notebook. `fillna` also returns a
# fresh frame, so the column assignment below does not write into a slice.
df_tweets          = load_dfs(word_list, date)[['tweet.text']].fillna('')
# Expose the row label as a regular column; examples are looked up through it.
df_tweets['index'] = df_tweets.index


print('Number of tweets:',len(df_tweets))
df_tweets.head()

# Clean Text in Tweets

In [None]:
# Tokenize, remove stopwords, remove short words, lemmatize, stem

stemmer = SnowballStemmer('english')
processed_docs = df_tweets['tweet.text'].map(preprocess)


print('original document: ')
print(df_tweets[df_tweets['index'] == example]['tweet.text'].values)
print('\n\n words in original document:')
# The first value of the selected row is the tweet text.
words = df_tweets[df_tweets['index'] == example].values[0][0].split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(processed_docs[example])


# TF-IDF over the cleaned tokens; max_df / min_df are document-frequency
# bounds passed to the vectorizer.
vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=0.500,
                             min_df=0.005,
                             lowercase=True)
X = [' '.join(x) for x in processed_docs]
X = vectorizer.fit_transform(X)
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
# `.tolist()` keeps `dictionary` a plain list of str, which the
# feature-selection helpers in this notebook copy and index as a list.
if hasattr(vectorizer, 'get_feature_names_out'):
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()



print('\n\nDictionary:',len(dictionary))
# Show the first 11 terms (positions 0..10) of the vocabulary.
for position, term in enumerate(dictionary[:11]):
    print(position, term)

# Dimensionality reduction

In [None]:
def to_dense(sparse_matrix):
    """Return the contents of a SciPy sparse matrix as a dense DataFrame.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        Any object exposing ``toarray()``.

    Returns
    -------
    pandas.DataFrame
        One row per matrix row and one column per matrix column, labelled
        with default integer indices. The shape is preserved even when the
        matrix has no rows.
    """
    # toarray() densifies in one step, replacing a Python-level loop over
    # the rows of the np.matrix returned by todense().
    return pd.DataFrame(sparse_matrix.toarray())

def drop_low_variance(df, dictin, threshold=1/1000):
    """Drop the columns of ``df`` whose variance is below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, one column per feature.
    dictin : sequence
        Feature names; ``dictin[i]`` names the i-th column of ``df``.
    threshold : float, optional
        Minimum variance (as computed by ``DataFrame.var``) a column needs to
        be kept. Defaults to 1/1000.

    Returns
    -------
    dfout : pandas.DataFrame
        The kept columns, relabelled 0..k-1.
    dict_out : list
        Names of the kept columns, in column order.
    dict_dis : list
        Names of the discarded columns, in column order.
    """
    # Work on positions so that the result does not depend on how the
    # columns of `df` happen to be labelled.
    variances = df.var().to_numpy()
    # A NaN variance compares False and is therefore discarded.
    keep_mask = variances >= threshold

    dict_out = []
    dict_dis = []
    for position, keep in enumerate(keep_mask):
        (dict_out if keep else dict_dis).append(dictin[position])

    dfout         = df.iloc[:, np.flatnonzero(keep_mask)].copy()
    dfout.columns = np.arange(dfout.shape[1])
    return dfout, dict_out, dict_dis

def drop_correlated(df, dictin, threshold=0.5):
    """Drop one column out of every strongly correlated pair of columns.

    For each column the most strongly correlated other column (largest
    absolute correlation from ``DataFrame.corr``) is looked up. If that
    absolute correlation exceeds ``threshold``, the later of the two columns
    (by position) is marked for removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, one column per feature.
    dictin : sequence
        Feature names; ``dictin[i]`` names the i-th column of ``df``.
    threshold : float, optional
        Absolute correlation above which two columns count as correlated.
        Defaults to 0.5.

    Returns
    -------
    df_out : pandas.DataFrame
        Copy of ``df`` without the removed columns; the remaining columns
        keep their original labels.
    dict_out : list
        Names of the remaining columns, in column order.
    dict_dis : list
        Names of the removed columns, in column order.
    """
    corr = df.corr().to_numpy(dtype=float, copy=True)
    # A column must not be matched with itself.
    np.fill_diagonal(corr, np.nan)
    # Use the magnitude so strong negative correlations are found as well.
    strength = np.abs(corr)

    drop_positions = set()
    for position, row in enumerate(strength):
        # Nothing to compare with (single column, or undefined correlations).
        if np.isnan(row).all():
            continue
        partner = int(np.nanargmax(row))
        if row[partner] > threshold:
            drop_positions.add(max(position, partner))

    # Build every output from the original positions. Deleting names one by
    # one would shift the positions of the names that follow.
    keep_positions = [p for p in range(df.shape[1]) if p not in drop_positions]
    df_out   = df.iloc[:, keep_positions].copy()
    dict_out = [word for p, word in enumerate(dictin) if p not in drop_positions]
    dict_dis = [dictin[p] for p in sorted(drop_positions)]
    return df_out, dict_out, dict_dis


In [None]:
example   = 300

# Densify the vectorized tweets, standardise them with StandardScaler and
# wrap the scaled values in a DataFrame again.
df_dense  = to_dense(X)
df_scaled = StandardScaler().fit_transform(df_dense.values)
df_dense  = pd.DataFrame(df_scaled)

n_rows, n_cols = df_dense.shape
example_text   = df_tweets.loc[df_tweets['index'] == example, 'tweet.text']

print('Number of tweets                      :', n_rows)
print('Number of columns (i.e. feature words):', n_cols)
print('\nTweet example:')
print('Tweet                                 :', example_text.values)
print('Processed                             :', processed_docs[example])
print('Vectorized                            :', df_dense.loc[example].tolist())
print()
df_dense.head()

In [None]:
# First remove low-variance features, then one feature of each correlated pair.
df_reduced, dict_reduced, dict_discarted = drop_low_variance(df_dense, dictionary)
df_reduced, dict_reduced, dict_aux       = drop_correlated(df_reduced, dict_reduced)
# `extend` keeps dict_discarted a flat list of words; `append` would nest the
# second list of discarded words inside it as a single element.
dict_discarted.extend(dict_aux)

print(len(df_reduced.columns),len(dict_reduced))
print(dict_reduced)
print()
print(dict_discarted)
df_reduced.head()


# Cluster tweets with KMeans

In [None]:
true_k = 3


# Fit KMeans on the reduced feature table, wrapped as a CSR matrix.
newX     = sparse.csr_matrix(df_reduced.values)
model_tw = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
Xtrans   = model_tw.fit_transform(newX)
y        = model_tw.predict(newX)

# Per centroid: feature positions sorted by descending centroid coordinate.
order_centroids = np.argsort(model_tw.cluster_centers_)[:, ::-1]
unique, counts  = np.unique(y, return_counts=True)

print('Top terms per cluster:')
for i, idx in enumerate(order_centroids[:true_k, :5]):
    top_terms = [dict_reduced[j] for j in idx]
    print("Cluster %d" % i,'size:',counts[i],top_terms )

# Row positions of the tweets assigned to each cluster.
indices_0 = np.flatnonzero(y == 0).tolist()
indices_1 = np.flatnonzero(y == 1).tolist()
indices_2 = np.flatnonzero(y == 2).tolist()

In [None]:
example = 300
# Show one tweet: raw text, cleaned tokens and the cluster it is assigned to.
print('\nEXAMPLE')
print('original document:')
print(df_tweets.loc[df_tweets['index'] == example, 'tweet.text'].values)
print('\n\nTokenized and lemmatized document: ')
print(processed_docs[example])
predicted = model_tw.predict(newX[example])
print('\n\nExample predicted in cluster:', predicted[0])

In [None]:
# Print the first five tweets that were assigned to cluster 1.
for i in indices_1[:5]:
    print('\n\nEXAMPLE:', i)
    print('Predicted cluster:', y[i])
    print('original: ')
    print(df_tweets.loc[df_tweets['index'] == i, 'tweet.text'].values)
    print('Tokenized and lemmatized: ')
    print(processed_docs[i])


# Check stability of clustering

In [None]:
# First, cluster the tweets nrnd times;
# then cluster the resulting true_k x nrnd centroids.
# Well-separated centroid groups in a 2D plot built with TSNE are used to infer the repeatability, i.e. the quality, of the tweet clustering process.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
%matplotlib inline

import seaborn as sns
# Global seaborn appearance settings: grid style, colour palette, and the
# "notebook" context with enlarged fonts and thicker lines.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})

from sklearn.manifold import TSNE

In [None]:
def fashion_scatter(x, colors, title):
    """Draw a 2D scatter plot of ``x`` coloured and annotated by class label.

    Parameters
    ----------
    x : array-like
        Point coordinates; columns 0 and 1 are plotted.
    colors : array-like
        Class label of every point. The labels index the colour palette and
        are compared against 0..num_classes-1 when placing the annotations,
        so they are expected to be the integers 0..num_classes-1.
    title : str
        Title of the plot.

    Returns
    -------
    tuple
        ``(figure, axes, scatter collection, list of text annotations)``.
    """
    x = np.asarray(x)
    # The builtin `int` replaces the `np.int` alias, which was removed in
    # NumPy 1.24; asarray additionally allows list-like labels.
    colors = np.asarray(colors).astype(int)

    # choose a color palette with seaborn.
    num_classes = len(np.unique(colors))
    palette = np.array(sns.color_palette("dark", num_classes))

    # create a scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    ax.set_title(title)

    sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40, c=palette[colors])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.set_yticks([])
    ax.set_xticks([])
    ax.axis('tight')

    # add one text label per class
    txts = []

    for i in range(num_classes):
        # Position of each label at median of data points.
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        # White outline so the label stays readable on top of the points.
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts


def reorder_centroids(czero,ci):
    """Reorder the centroids ``ci`` to follow the order of ``czero``.

    For every reference centroid ``czero[i]`` the closest (Euclidean norm)
    of the first ``len(czero)`` centroids in ``ci`` is selected.

    Returns
    -------
    cout : numpy.ndarray
        The selected centroids, one per reference centroid.
    order0 : list of int
        Position in ``ci`` of the centroid chosen for each reference.
    order1 : list of int
        Position in ``cout`` of the centroid closest to each reference;
        serves as a check that the reordering worked.
    """
    count = len(czero)

    def closest(reference, candidates):
        # Position of the first candidate at minimal distance.
        distances = [np.linalg.norm(reference - candidates[k]) for k in range(count)]
        return min(range(count), key=distances.__getitem__)

    # order centroids
    order0 = [closest(czero[i], ci) for i in range(count)]
    cout   = [ci[k] for k in order0]

    # check order worked
    order1 = [closest(czero[i], cout) for i in range(count)]
    return np.array(cout), order0, order1

In [None]:
# Stability check of the clustering:
#   1. cluster the tweets nrnd times and collect the centroids of every run,
#   2. cluster the pooled centroids,
#   3. plot the pooled centroids in 2D with TSNE.
# NOTE(review): this cell rebinds `model_tw` and `y`, which the clustering
# cells above also assign; re-running those cells afterwards sees new values.
nrnd = 50

clist = []
for j in range(nrnd):
    print('Rnd #:',j+1)
    model_tw  = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
#    model_tw  = KMeans(n_clusters=true_k)
    # NOTE(review): the model is fitted on X here, whereas the clustering cell
    # above fits on newX (built from df_reduced) - confirm this is intended.
    # `Xtrans` receives the return value of fit(), presumably the fitted
    # estimator itself rather than transformed data - verify.
    Xtrans    = model_tw.fit(X)
    centroids = model_tw.cluster_centers_
    clist.append(centroids)
# One flat list holding every centroid of every run.
flattened_list = [y for x in clist for y in x]



# Cluster the centroids
model_cn   = KMeans(n_clusters=true_k)
y          = model_cn.fit_predict(flattened_list)

# Use TSNE to plot centroids in a 2D space
# (random_state is fixed for TSNE only; the KMeans runs above are unseeded)
fashion_tsne = TSNE(random_state=123).fit_transform(flattened_list)
fashion_scatter(fashion_tsne, y, 'TSNE Plot')
plt.show()