In [1]:
import pandas as pd
import umap
import hdbscan
import numpy as np
from collections import OrderedDict
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def word_clouds(top_words):
    """Convert per-topic (word, weight) pairs into word-cloud records.

    Parameters
    ----------
    top_words : dict
        Maps a topic label to a sequence of indexable pairs whose first item
        is the word and whose second item is its weight.

    Returns
    -------
    list of list of dict
        One inner list per topic, in the iteration order of ``top_words``.
        Every record has the keys ``"text"`` and ``"weight"``. The topic
        labels themselves are not part of the output.
    """
    return [
        [{"text": pair[0], "weight": pair[1]} for pair in pairs]
        for pairs in top_words.values()
    ]

In [3]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute TF-IDF style scores with one column per entry of ``documents``.

    Parameters
    ----------
    documents : iterable of str
        Texts to score. ``build_model`` passes one string per topic (all
        documents of the topic joined together).
    m : int
        Numerator of the IDF ratio. ``build_model`` passes the number of
        original, un-joined documents.
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tf_idf : numpy.ndarray
        Array of shape ``(n_terms, n_documents)``.
    count : CountVectorizer
        The fitted vectorizer; its vocabulary gives the meaning of the rows.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()

    # Total number of counted terms in each document.
    w = t.sum(axis=1)
    # A document with no counted terms (e.g. only stop words) has w == 0, which
    # used to turn its whole column into NaN via 0/0. All its counts are zero,
    # so dividing by 1 instead yields the intended all-zero column.
    safe_w = np.where(w == 0, 1, w)
    tf = np.divide(t.T, safe_w)

    # Total frequency of each term across all documents. The vocabulary is
    # fitted on these same documents, so each term should occur at least once.
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

In [4]:
def extract_topic_sizes(df):
    """Count the documents assigned to each topic, largest topic first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Topic`` and ``Doc``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Topic`` and ``Size``, where ``Size`` is the number of
        non-null ``Doc`` values in the topic, sorted by ``Size`` descending.
    """
    doc_counts = df.groupby(['Topic'])['Doc'].count()
    sizes = doc_counts.reset_index(name="Size")
    return sizes.sort_values(by="Size", ascending=False)

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, docs, n=20):
    """Pick the ``n`` highest-scoring words of every topic as word-cloud data.

    Parameters
    ----------
    tf_idf : numpy.ndarray
        Scores of shape ``(n_terms, n_topics)`` as returned by ``c_tf_idf``.
    count : CountVectorizer
        The fitted vectorizer returned alongside ``tf_idf``.
    docs_per_topic : pandas.DataFrame
        Has a ``Topic`` column; row ``i`` is taken as the label of column
        ``i`` of ``tf_idf``.
    docs : pandas.DataFrame
        Per-document frame with ``Topic`` and ``Doc`` columns, used only to
        order the topics by size.
    n : int, default 20
        Number of words kept per topic.

    Returns
    -------
    list of list of dict
        The output of ``word_clouds``: one list of
        ``{"text": ..., "weight": ...}`` records per topic, largest topic
        first, each topic's words in descending score order.

    Raises
    ------
    KeyError
        If ``docs`` contains a topic that is absent from ``docs_per_topic``.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older installations.
    if hasattr(count, "get_feature_names_out"):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T

    # Reverse the ascending argsort and keep the first n columns. Unlike
    # slicing with [-n:], this returns no words (rather than all) for n == 0.
    indices = tf_idf_transposed.argsort()[:, ::-1][:, :n]

    top_n_words = OrderedDict(
        (label, [(words[j], tf_idf_transposed[i][j]) for j in indices[i]])
        for i, label in enumerate(labels)
    )

    # Moving every topic to the end in size order leaves the mapping ordered
    # from the largest topic to the smallest.
    topic_sizes = extract_topic_sizes(docs)
    for topic in topic_sizes["Topic"].values:
        top_n_words.move_to_end(topic)

    return word_clouds(dict(top_n_words))
    

In [5]:
def insert_space(title):
    """Return ``title`` with one space added before and one after it.

    ``build_model`` applies this to titles before concatenating them to the
    body text, so the two strings do not run into each other.
    """
    return " " + title + " "

In [6]:
def build_model(csv_file, random_state=None):
    """Cluster the posts in a CSV file into topics and return word-cloud data.

    The file is read with ``;`` as delimiter and must provide the columns
    ``body`` and ``title``. Each post (body followed by its space-padded
    title) is embedded with a sentence-transformers model, reduced with UMAP,
    clustered with HDBSCAN, and the top 10 words per cluster are extracted.

    Parameters
    ----------
    csv_file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``umap.UMAP``. ``None`` keeps UMAP's default unseeded
        behaviour; pass a value to make the reduction reproducible.

    Returns
    -------
    list of list of dict
        The result of ``extract_top_n_words_per_topic`` with ``n=10``.
    """
    # Function-scope import: sentence-transformers is only required when a
    # model is actually built.
    from sentence_transformers import SentenceTransformer

    df = pd.read_csv(csv_file, delimiter=";")
    titles = df["title"].apply(insert_space)
    data = df["body"].values + titles.values

    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    embeddings = model.encode(data, show_progress_bar=True)

    # UMAP is a dimensionality reduction algorithm.
    umap_embeddings = umap.UMAP(n_neighbors=2,
                                n_components=3,
                                metric='cosine',
                                random_state=random_state).fit_transform(embeddings)

    cluster = hdbscan.HDBSCAN(min_cluster_size=2,
                              metric='euclidean',
                              cluster_selection_method='eom').fit(umap_embeddings)

    docs_df = pd.DataFrame(data, columns=["Doc"])
    docs_df['Topic'] = cluster.labels_
    docs_df['Doc_ID'] = range(len(docs_df))

    # One concatenated pseudo-document per topic is the input of c_tf_idf.
    docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))

    return extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, docs_df, n=10)


In [7]:
# Build the per-topic word clouds from the semicolon-delimited posts file.
top_n_words = build_model(csv_file="posts.csv")

  return torch._C._cuda_getDeviceCount() > 0


HBox(children=(FloatProgress(value=0.0, description='Batches', max=2.0, style=ProgressStyle(description_width=…




In [9]:
import datetime

# Today's local date as an ISO 8601 string (YYYY-MM-DD); str() of a date
# object is the same text that isoformat() produces.
current_date = str(datetime.date.today())

In [10]:
# Attach the run date to the word-cloud list as a record of its own.
# Both steps are guarded so that re-running this cell neither wraps the date
# dict inside another dict nor appends a duplicate record.
if not isinstance(current_date, dict):
    current_date = {"current_date": current_date}
if current_date not in top_n_words:
    top_n_words.append(current_date)

In [11]:
import os

import pymongo as pm

# Connection settings are read from the environment so that no credentials
# are stored in the notebook. MONGO_USERNAME and MONGO_PASSWORD are required;
# the other settings fall back to the previous local defaults.
myclient = pm.MongoClient(host=os.environ.get("MONGO_HOST", "localhost"),
                          port=int(os.environ.get("MONGO_PORT", "27017")),
                          username=os.environ["MONGO_USERNAME"],
                          password=os.environ["MONGO_PASSWORD"],
                          authSource=os.environ.get("MONGO_AUTH_SOURCE", "admin"))

mydb = myclient["topic_detection"]
mycol = mydb["word_clouds"]

# Collection.update() was removed in PyMongo 4; update_one(..., upsert=True)
# is the single-document equivalent. $each appends every record in one
# request instead of one round trip per record. Nothing is written when the
# list is empty, matching the old loop.
if top_n_words:
    mycol.update_one({},
                     {"$push": {"word-clouds": {"$each": top_n_words}}},
                     upsert=True)

  
