In [1]:
import collections
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from collections import OrderedDict
import pandas as pd
from pymongo import MongoClient
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
import re
import emoji
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language
from collections import OrderedDict
from sklearn.feature_extraction.text import CountVectorizer
import datetime

# Collect data

In [2]:
host = "localhost"
port = "27017"
username = "accretioadmin"
password = "adminaccretio&2017"
authSource = "admin"

def _connect_mongo(db):
    if username and password:
        conn = MongoClient(host=host,
                            port=int(port),
                            username=username,
                            password=password,
                           authSource=authSource)
    else:
        conn = MongoClient(host, port)
        
    return conn[db]    


def read_mongo(db, collection, query={}):
    db = _connect_mongo(db=db)
    cursor = db[collection].find(query)
    df =  pd.DataFrame(list(cursor))
    return df

In [3]:
df = read_mongo("connectTimeline", "post")
df["post_id"] = df["_id"].astype(str)
df = df[["post_id","registrationNumber","body"]]
df["registrationNumber"] = df["registrationNumber"].astype(str)

df = df.dropna()
df = df[:300]

# Data preparation

In [4]:
@Language.factory('french_lemmatizer')
def create_lemmatizer(nlp, name):
    return LefffLemmatizer()   

nlp = spacy.load('fr_core_news_md')
nlp.add_pipe('french_lemmatizer', name='lefff')  
def replace_lemma(text):  
    doc = nlp(text)
    for d in doc:
        if(d.pos_ != "PROPN" and d.pos_ != "NUM" and d.pos_ != "AUX" and d.pos_ !='DET' and d.pos_ !='ADJ' and d.pos_ !='ADV'):
            text = text.replace(d.text,  d.lemma_ )
        else:
            text = text.replace(d.text, '')
    return text   

In [5]:
def _remove_emoji(string):
    return emoji.get_emoji_regexp().sub(u'', string)

final_stopwords_list = list(fr_stop) + list(en_stop)
def custom_stopwords(text):
    #common_words    
    text = " ".join([w for w in text.split() if w.lower() not in final_stopwords_list])
    #HTML TAGS
    text = re.sub(r'<.*?>', '', text)
    #links
    text = re.sub("https?:\/\/.*[\r\n]*", "", text)  
    #alpha_numeric
    text = re.sub(r'[^a-zA-Z0-9 àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔ\'ŒÙÛÜŸÇ]+', '', text)
    # remove emojis
    text = _remove_emoji(text)
    #remove duplicate space
    clear_text = " ".join(text.split())
    
    return clear_text

In [6]:
def prepare():
    global df
    df["body"] = df["body"].apply(custom_stopwords)
    df = df[df['body'].map(len)>1]
    df["body"] = df["body"].apply(replace_lemma)
prepare()    

In [7]:
def word_clouds(top_words, df):
    current_date = datetime.date.today().isoformat()
    df["Topic"] = df["Topic"].astype(str)     
    topics = list()
    for topic, value in top_words.items():
        if str(topic ) != "-1":
            topic_dict = dict()  
            topic_dict["idtopic"] = topic
            topic_dict["assigned_name"] = value[1][0]
            topic_dict["date"] = current_date
            topic_dict["importance"] = len(list(df[df["Topic"] == str(topic)]["posts"])[0])
            topic_dict["posts"] =  list(df[df["Topic"] == str(topic)]["posts"])[0]
            words = list()
            word_dict = dict()
            for i in value:
                word_dict["text"] = i[0]
                word_dict["weight"] = i[1]   
                words.append(word_dict.copy())
            words.append({"idtopic":topic, "assigned_name": value[1][0]})    
            topic_dict["word-cloud"]  = words
            topics.append(topic_dict.copy())
    return topics

In [8]:
def order_people(ch):
    l = ch.split()
    temp = l[:]
    l.sort(key=lambda x:temp.count(x), reverse= True)
    return list(dict.fromkeys(l))  

In [9]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words=final_stopwords_list).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

In [10]:
def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, docs, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    topic_sizes = extract_topic_sizes(docs)
    # sort the dict to get the HOT topics ordered
    top_n_words = OrderedDict(top_n_words)
    key_order = list(topic_sizes["Topic"].values)
    for k in key_order:
        top_n_words.move_to_end(k)
    top_n_words = dict(top_n_words)    
    return word_clouds(top_n_words, docs_per_topic)
    

In [11]:
def build_model():
    data = df["body"].values  
    #embeddings
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    embeddings = model.encode(data)  
    # umap is a  dimensionality reduction algorithm
    umap_embeddings = umap.UMAP(n_neighbors=2, 
                            n_components=3, 
                            metric='cosine').fit_transform(embeddings)
    #Clustering
    cluster = hdbscan.HDBSCAN(min_cluster_size=17,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)
    
    docs_df = pd.DataFrame(data, columns=["Doc"])
    docs_df["posts"] = df["post_id"].astype("str").values
    docs_df['Topic'] = cluster.labels_
    docs_df['Doc_ID'] = range(len(docs_df))
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join, 'posts': ' '.join})#
    # concatenate docs with same topic (with space) ---->  'Doc': ' '.join
    docs_per_topic["posts"] =docs_per_topic["posts"].apply(lambda x: order_people(x))
    #TF-IDF
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, docs_df, n=10)  
       
    return top_n_words


In [12]:
top_n_words = build_model()

  return torch._C._cuda_getDeviceCount() > 0
  'stop_words.' % sorted(inconsistent))


In [13]:
top_n_words

[{'idtopic': 0,
  'assigned_name': 'année',
  'date': '2021-06-10',
  'importance': 33,
  'posts': ['5bb35438a5b16a0001ef4ef4',
   '5bb629efa5b16a0001ef4f9c',
   '5bbe1010a5b16a0001ef50dc',
   '5bbf0694a5b16a0001ef50ee',
   '5bbf7b4fa5b16a0001ef5116',
   '5bc06672a5b16a0001ef5134',
   '5bc09ecba5b16a0001ef514a',
   '5bc0bd91a5b16a0001ef515e',
   '5bc0fec1a5b16a0001ef5179',
   '5bc84bc804031a0001cf64eb',
   '5bcd8f4104031a0001cf6541',
   '5bcde4bb04031a0001cf655b',
   '5bd318af04031a0001cf65e3',
   '5bd8866a04031a0001cf662c',
   '5bd9692f04031a0001cf6641',
   '5be465c604031a0001cf670b',
   '5be69cca04031a0001cf6763',
   '5be6e57b04031a0001cf676f',
   '5bea9d2e04031a0001cf6792',
   '5bedde2c04031a0001cf67d0',
   '5beeed8d04031a0001cf67e1',
   '5bf2c5af04031a0001cf67fe',
   '5bf3c79004031a0001cf680c',
   '5c0662b004031a0001cf68e3',
   '5c08efe604031a0001cf6903',
   '5c1a1cc9a4791700013bfd9b',
   '5c1a2869a4791700013bfda6',
   '5c1b6737a4791700013bfdd5',
   '5c1bb7e9a4791700013bfded',
   '

In [14]:
doc = nlp(u"Apple mieux vaut tard que jamais cherche a acheter une startup anglaise pour 1 milliard de dollard")
for d in doc:
    print(d.text, d.pos_, d._.lefff_lemma, d.tag_, d.lemma_)

Apple PROPN None PROPN Apple
mieux ADV mieux ADV mieux
vaut VERB valoir VERB valoir
tard ADV tard ADV tard
que SCONJ None SCONJ que
jamais ADV jamais ADV jamais
cherche VERB chercher VERB cherche
a AUX None AUX avoir
acheter VERB acheter VERB acheter
une DET un DET un
startup NOUN None NOUN startup
anglaise ADJ anglais ADJ anglais
pour ADP None ADP pour
1 NUM None NUM 1
milliard NOUN milliard NOUN milliard
de ADP un ADP de
dollard NOUN None NOUN dollard


In [15]:
import pymongo as pm

myclient = pm.MongoClient(host="localhost",
                    port=27017,
                    username="accretioadmin",
                    password="adminaccretio&2017",
                   authSource="admin")

mydb = myclient["topic_detection"]
mycol = mydb["topic"]
mycol.insert_many(top_n_words)

# for i in top_n_words:
#     d = i
#     mycol.update({}, {"$push": {"word-clouds":d }}, True)

<pymongo.results.InsertManyResult at 0x7f516c1a1a00>

In [16]:
len(top_n_words)

5

In [17]:
top_n_words[2]

{'idtopic': 2,
 'assigned_name': 'venir',
 'date': '2021-06-10',
 'importance': 27,
 'posts': ['5bae96f8a5b16a0001ef4e52',
  '5bb341aea5b16a0001ef4ee9',
  '5bb34bb4a5b16a0001ef4ef0',
  '5bbbc99ca5b16a0001ef5045',
  '5bbbcaefa5b16a0001ef5046',
  '5bbc4d12a5b16a0001ef5053',
  '5bbc6861a5b16a0001ef5069',
  '5bbdb881a5b16a0001ef50b2',
  '5bbf3b79a5b16a0001ef50fb',
  '5bbf6976a5b16a0001ef5109',
  '5bc142eaa5b16a0001ef518d',
  '5bc6faaea5b16a0001ef5215',
  '5bc8413204031a0001cf64e5',
  '5bc85b5b04031a0001cf64ed',
  '5bc8843804031a0001cf64f5',
  '5bc98c0004031a0001cf6501',
  '5bcd9cda04031a0001cf6542',
  '5be4758804031a0001cf6715',
  '5be55c5b04031a0001cf6729',
  '5be5888d04031a0001cf674c',
  '5bead8eb04031a0001cf67a2',
  '5bf2937d04031a0001cf67f2',
  '5bf7e22204031a0001cf6865',
  '5c0954da04031a0001cf6906',
  '5c0e9272a4791700013bfcf6',
  '5c1ba863a4791700013bfdeb',
  '5c2353d6a4791700013bfe5d'],
 'word-cloud': [{'text': 'day', 'weight': 0.18199389874164668},
  {'text': 'venir', 'weight': 0.