In [1]:
import pandas as pd
import numpy as np

# set seed for reproducibility
np.random.seed(493)

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
ps = nltk.porter.PorterStemmer()

import unicodedata
import re

from gsdmm import MovieGroupProcess

# Have IPython display the result of every expression statement in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits (columns, rows, column width) so frames and
# long text cells are shown without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Read the tweets into a dataframe.
df = pd.read_csv('cs.csv')

# Keep only the rows whose `cleaned` text is present (drop null values).
df = df[df['cleaned'].notna()]

In [4]:
def basic_clean(original):
    """Normalize raw text to lowercase ASCII.

    Steps, in order: lowercase the input, NFKD-decompose it and drop every
    character that cannot be encoded as ASCII, delete anything that is not
    a lowercase letter, digit, apostrophe or whitespace, and finally turn
    newlines and tabs into single spaces.

    Parameters
    ----------
    original : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = original.lower()
    # Decompose accented characters so their base letters survive the
    # ASCII round-trip while the combining marks are discarded.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    text = ascii_bytes.decode('utf-8', 'ignore')
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    for whitespace_char in ('\n', '\t'):
        text = text.replace(whitespace_char, ' ')
    return text
  
def remove_stopwords(original, extra_words=None, exclude_words=None):
    """Drop English stopwords from a whitespace-separated string.

    Parameters
    ----------
    original : str
        Text whose tokens are separated by whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must NOT be treated as stopwords. Words that are not in
        the stopword list to begin with are ignored instead of raising
        ``ValueError``; a word listed in both arguments is kept in the text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership checks for every token, instead
    # of scanning the whole stopword list once per word.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    stopword_set.difference_update(exclude_words or ())

    return ' '.join(w for w in original.split() if w not in stopword_set)
 
def stem(original):
    """Return `original` with every whitespace-separated token Porter-stemmed.

    The stemmed tokens are joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in original.split())

# Build the corpus: one token list per row of `df.cleaned`.
docs = []
for sentence in df.cleaned:
    # clean -> drop stopwords -> stem -> tokenize, one stage per line
    words = basic_clean(sentence)
    words = remove_stopwords(words)
    words = stem(words)
    words = word_tokenize(words)
    docs.append(words)

In [33]:
# Vocabulary = every distinct token in the corpus; the model is given its size.
vocab = {token for doc in docs for token in doc}
n_terms = len(vocab)

# Fit the short-text clustering model (at most K=15 clusters, 5 iterations).
mgp = MovieGroupProcess(K=15, alpha=0.1, beta=1, n_iters=5)
y = mgp.fit(docs, n_terms)

In stage 0: transferred 1388 clusters with 15 clusters populated
In stage 1: transferred 650 clusters with 6 clusters populated
In stage 2: transferred 132 clusters with 4 clusters populated
In stage 3: transferred 52 clusters with 4 clusters populated
In stage 4: transferred 48 clusters with 4 clusters populated


In [34]:
# Number of documents assigned to each cluster after fitting; the position in
# the array is the cluster id used to index the model's per-cluster data.
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [   0    3    0    0  325    0    0    0    2    0    0    0    0    0
 1203]


In [35]:
# Cluster ids ordered from most to least populated, keeping at most 15.
top_index = np.argsort(doc_count)[::-1][:15]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [14  4  1  8 13 12 11 10  9  7  6  5  3  2  0]


In [36]:
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the most frequent words of each requested cluster.

    Parameters
    ----------
    cluster_word_distribution : sequence of dict
        Indexable by cluster id; each entry maps a word to its count in
        that cluster.
    top_cluster : iterable
        Cluster ids to report, in the order they should be printed.
    values : int
        Maximum number of (word, count) pairs to show per cluster.

    Returns
    -------
    None
        Results are printed, one line per cluster followed by a separator.
    """
    for cluster in top_cluster:
        # Use the distribution passed in by the caller (not the global model)
        # and rank its (word, count) pairs by count, highest first.
        ranked = sorted(
            cluster_word_distribution[cluster].items(),
            key=lambda item: item[1],
            reverse=True,
        )[:values]
        print('Cluster %s : %s' % (cluster, ranked))
        print('-' * 120)

In [37]:
# Show the 7 most frequent terms of each cluster, largest cluster first.
top_words(
    cluster_word_distribution=mgp.cluster_word_distribution,
    top_cluster=top_index,
    values=7,
)

Cluster 14 : [('citect', 1330), ('server', 1109), ('alarm', 802), ('scada', 438), ('tag', 425), ('driver', 349), ('run', 345)]
------------------------------------------------------------------------------------------------------------------------
Cluster 4 : [('licens', 582), ('citect', 315), ('server', 186), ('key', 151), ('activ', 147), ('scada', 115), ('updat', 103)]
------------------------------------------------------------------------------------------------------------------------
Cluster 1 : [('test', 8), ('prod', 8), ('ignor', 8)]
------------------------------------------------------------------------------------------------------------------------
Cluster 8 : [('de', 6), ('plc', 6), ('en', 5), ('se', 5), ('la', 4), ('modnet', 4), ('intermitencia', 4)]
------------------------------------------------------------------------------------------------------------------------
Cluster 13 : []
----------------------------------------------------------------------------------------

In [38]:
# Human-readable labels, assigned in order of cluster size
# ('Topic #1' goes to the most populated cluster).
topic_names = [
    'Topic #1', 'Topic #2', 'Topic #3', 'Topic #4', 'Topic #5',
    'Topic #6', 'Topic #7', 'Topic #8', 'Topic #9', 'Topic #10',
    'Topic #11', 'Topic #12', 'Topic #13', 'Topic #14', 'Topic #15',
]

# Map each cluster id to the label matching its rank in `top_index`.
topic_dict = {}
for i, topic_num in enumerate(top_index):
    topic_dict[topic_num] = topic_names[i]

In [39]:
def create_topics_dataframe(data_text=None, mgp=None, threshold=0.3, topic_dict=None, stem_text=None):
    """Label every document with its best-matching topic.

    Parameters
    ----------
    data_text : iterable of str, optional
        Original texts, one per document. Defaults to ``df.cleaned``.
    mgp : object, optional
        Fitted model exposing ``choose_best_label(doc)``, which returns an
        indexable ``(cluster_id, probability)`` pair. Defaults to the
        notebook-level ``mgp``.
    threshold : float, optional
        Minimum probability required to accept the best cluster; documents
        scoring below it are labelled ``'Other'``.
    topic_dict : mapping, optional
        Maps a cluster id to its topic name. Defaults to the notebook-level
        ``topic_dict``.
    stem_text : sequence, optional
        Tokenized/stemmed documents, positionally aligned with
        ``data_text``. Defaults to the notebook-level ``docs``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``text``, ``topic`` and ``stems``.

    Raises
    ------
    IndexError
        If ``stem_text`` is shorter than ``data_text``.
    KeyError
        If an accepted cluster id is missing from ``topic_dict``.
    """
    # Resolve omitted arguments when the function is CALLED rather than when
    # it is defined, so re-fitting the model or reloading the data is picked
    # up without re-running the cell that defines this function.
    if data_text is None:
        data_text = df.cleaned
    if mgp is None:
        mgp = globals()['mgp']  # the parameter shadows the notebook global
    if topic_dict is None:
        topic_dict = globals()['topic_dict']
    if stem_text is None:
        stem_text = docs

    # Collect plain records and build the frame once; writing cell by cell
    # with `.at` re-allocates the frame for every new row.
    records = []
    for i, text in enumerate(data_text):
        stems = stem_text[i]
        best = mgp.choose_best_label(stems)
        # Only look the cluster up when it is accepted, so an unmapped
        # low-confidence cluster cannot raise.
        topic = topic_dict[best[0]] if best[1] >= threshold else 'Other'
        records.append({'text': text, 'topic': topic, 'stems': stems})

    return pd.DataFrame(records, columns=['text', 'topic', 'stems'])

In [40]:
# Assign a topic label to every cleaned text using the fitted model.
dfx = create_topics_dataframe(
    data_text=df.cleaned,
    mgp=mgp,
    threshold=0.3,
    topic_dict=topic_dict,
    stem_text=docs,
)

In [41]:
# How many documents landed in each topic (missing labels included).
dfx['topic'].value_counts(dropna=False)

Topic #1    1162
Topic #2     320
Other         46
Topic #3       3
Topic #4       2
Name: topic, dtype: int64