In [None]:
import pandas as pd
import numpy as np
import glob
import re
from pprint import pprint

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

import scipy.cluster.hierarchy as shc
import scipy.spatial.distance as spd
from scipy.sparse.csr import csr_matrix

#!pip install tensorflow_hub
#!pip install tensorflow_text

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

import itertools

# Display settings: print floats with two decimals and no scientific
# notation, and let pandas show up to 1000 rows/columns and 100 characters
# per cell before truncating.
np.set_printoptions(suppress=True, precision=2)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", 100)

#HC
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering

#kmeans
from sklearn.cluster import KMeans

#LDA
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Read the sentence-level comments; the first CSV column becomes the index.
path = "leetchi-lepot-sentence.csv"
data = pd.read_csv(path, index_col=0)
print(data.shape)

# Drop rows that have no sentence text and report the shape again.
data.dropna(subset=["comment_sentence"], inplace=True)
print(data.shape)

# Sentence column used as input by the TF-IDF cell.
data1 = data["comment_sentence"]

### Feature extraction

##### TFIDF

In [None]:
# TF-IDF features: unigrams only, sub-linear term-frequency scaling and a
# vocabulary capped at 600 features.
# The vectorizer is bound to `tfidf_vectorizer` rather than `tf`, because
# `tf` is the alias of the `tensorflow` import in the first cell and
# rebinding it would silently replace the module for the rest of the session.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 1), max_features=600)
tfidf_matrix = tfidf_vectorizer.fit_transform(data1)
print(tfidf_matrix.shape)
print(type(tfidf_matrix))

# `X` is the generic feature-matrix name read by the elbow cell.
X = tfidf_matrix
X.shape
#no problem with 100K data

##### TFHUB

In [None]:
# use grouper for 100K sentences (20K sentences)

def grouper(lst, n):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items.

    Parameters
    ----------
    lst : sequence
        Any object supporting ``len()`` and slicing (list, tuple, str, ...).
    n : int
        Maximum size of each chunk; must be at least 1.

    Yields
    ------
    Slices ``lst[0:n]``, ``lst[n:2n]``, ... in order. The last slice may be
    shorter than ``n``. Nothing is yielded for an empty ``lst``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Because this is a generator, the error
        surfaces when iteration starts. Previously such a value made the
        loop run forever because the start offset never advanced.
    """
    if n < 1:
        raise ValueError("chunk size n must be >= 1, got %r" % (n,))
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

# Load the multilingual Universal Sentence Encoder (v3) from TF-Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
reviews = data["comment_sentence"].values.tolist()

# Encode 1000 sentences per call; the author found this chunking necessary
# to keep the kernel from dying.
embeddings = [np.array(embed(chunk)) for chunk in grouper(reviews, 1000)]

# Stack the per-chunk arrays into one matrix; this rebinds the feature matrix X.
X = np.vstack(embeddings)
X.shape

### Clustering

##### k-means + elbow

In [None]:
#choose optimal k by elbow method
# `X` is whichever feature matrix was assigned last (TF-IDF or TF-Hub
# embeddings). KMeans initialisation is random, so the seed is fixed to make
# the curve identical across "Restart & Run All" executions.
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
    print(k)
    km = KMeans(n_clusters=k, random_state=100)
    km = km.fit(X)
    # inertia_ = within-cluster sum of squared distances to the centroids
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# k = 3
# Single source of truth for the number of clusters (it was hard-coded twice).
n_clusters = 3

# Fixed seed so cluster ids and the exported CSV are reproducible.
kmeans = KMeans(n_clusters=n_clusters, random_state=100).fit(tfidf_matrix)
data['Kmeans_TFIDF'] = kmeans.labels_
print(data['Kmeans_TFIDF'].value_counts())

#sentence samples in each cluster
for cluster_id in range(n_clusters):
    members = data.loc[data["Kmeans_TFIDF"] == cluster_id, "comment_sentence"]
    print(cluster_id)
    # DataFrame/Series.sample(5) raises on groups with fewer than 5 rows,
    # so cap the sample size at the cluster size.
    print(members.sample(min(5, len(members))))
    print("")

data.to_csv('Kmeans_TFIDF_3.csv')

##### hierarchical clustering (HC)

In [None]:
#cosine distance

#with 20K data, use the following codes to calc cos dist 
# The pairwise matrix is dense (n_docs x n_docs), so it is converted from
# similarity to distance in place instead of allocating extra copies.
# It is stored under its own name so the feature matrix `X` used by the
# elbow cell is no longer overwritten by this cell.
cos_dist = cosine_similarity(tfidf_matrix)
np.subtract(1.0, cos_dist, out=cos_dist)   # distance = 1 - similarity
np.fill_diagonal(cos_dist, 0.)             # exact zero self-distance
np.maximum(cos_dist, 0., out=cos_dist)     # clip tiny negative round-off
print(cos_dist[:5, :5])

# Condensed (upper-triangle) form expected by scipy's linkage().
X_flat = spd.squareform(cos_dist)

In [None]:
#HC: choose thres according to the plot
# Build the hierarchical-clustering linkage from the condensed distance
# vector X_flat and draw its dendrogram.
# NOTE(review): X_flat is derived from cosine distances, while SciPy
# documents the 'ward' method as correctly defined only for Euclidean
# distances -- confirm this combination is intended.
Z = shc.linkage(X_flat, method='ward')
plt.figure(figsize=(10, 7))  
plt.title("Dendrograms")  
dend = shc.dendrogram(Z)
#plt.axhline(y=150, color='black', linestyle='--')

# Distance at which the dendrogram is cut by the results cell; picked by
# eye from the plot above, so revisit it whenever the input data changes.
thres = 27

In [None]:
#results
# Cut the dendrogram at distance `thres`; fcluster labels start at 1.
C = shc.fcluster(Z, thres , criterion="distance") # 600
data['HC_TFHUB_3'] = C
data.to_csv('HC_TFHUB_3.csv')

#print examples
# The original loop read from an undefined `data2` frame and from columns
# ("cluster", "comment_text") that this notebook never creates; the labels
# live in data['HC_TFHUB_3'] and the sentences in data['comment_sentence'].
for cluster_id in range(1, C.max()+1):
    members = data.loc[data["HC_TFHUB_3"] == cluster_id, "comment_sentence"]
    print(cluster_id)
    # Hierarchical clusters can be tiny: never ask for more rows than exist.
    print(members.sample(min(5, len(members))))
    print("")

### LDA

In [None]:
# Sentences used as the LDA input documents.
text = data["comment_sentence"]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` first, so
    non-string values are accepted. Yields one token list per input item,
    in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw in sentences:
        tokens = tokenize(str(raw))
        yield tokens

# Materialize the tokenized sentences: one list of tokens per document.
data_words = list(sent_to_words(text))

# Sanity check: show the tokens of one document.
print(data_words[10])

# Build the bigram and trigram models
# The trigram model is trained on the corpus after the bigram model has been
# applied to it, so the two statements must stay in this order.
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
# (these two objects are the ones used by make_bigrams / make_trigrams)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[1]]])

nlp = spacy.load('fr', disable=['parser', 'ner'])

# `remove_stopwords` reads a module-level `stop_words` that nothing in the
# visible notebook defines, so the call failed with NameError on a fresh
# kernel. Use the stop-word set that ships with the loaded spaCy language.
# NOTE(review): if a different list (e.g. NLTK's French list) was intended,
# assign it here instead.
stop_words = nlp.Defaults.stop_words

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize every document and drop tokens found in ``stop_words``.

    Each item of ``texts`` is passed through ``str()`` and gensim's
    ``simple_preprocess`` before filtering. Returns one list of kept tokens
    per input document, in input order.
    """
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    """Run every tokenized document through the module-level ``bigram_mod``.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags):
    """Return the lemmas of the tokens whose POS tag is in ``allowed_postags``.

    Each document (a list of tokens) is joined with single spaces, parsed
    with the module-level spaCy pipeline ``nlp``, and reduced to the
    ``lemma_`` of every token whose ``pos_`` is allowed. The result holds
    one list per input document.

    Tag reference: https://spacy.io/api/annotation
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

def filter_by_pos(texts, allowed_postags):
    """Keep the surface form of tokens whose POS tag is in ``allowed_postags``.

    Each document (a list of tokens) is joined with single spaces and parsed
    with the module-level spaCy pipeline ``nlp``. Unlike ``lemmatization``,
    the original token ``text`` is kept rather than its lemma. The result
    holds one list per input document.

    Tag reference: https://spacy.io/api/annotation
    """
    joined = (" ".join(tokens) for tokens in texts)
    return [
        [tok.text for tok in nlp(sentence) if tok.pos_ in allowed_postags]
        for sentence in joined
    ]

# Remove stop words (each document is re-tokenized by remove_stopwords)
data_words_nostops = remove_stopwords(data_words)

# Merge detected bigrams into single tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# The spaCy pipeline `nlp` used below is loaded earlier in this cell.
# Model install hint from the original author: python3 -m spacy download fr


# Keep only nouns, adjectives and proper nouns. No lemmatization happens
# here despite the variable name: filter_by_pos keeps the token text as-is.
data_lemmatized = filter_by_pos(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'PROPN'])  # other tags tried by the author: 'VERB', 'ADV', 'X'

# Create the token <-> id dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Documents that make up the corpus
texts = data_lemmatized

# Bag-of-words representation of every document
corpus = [id2word.doc2bow(text) for text in texts]

# Show the first document's bag-of-words entry
print(corpus[:1])

# Fit a 3-topic LDA model on the bag-of-words corpus (fixed seed, 10 passes,
# online updates every chunk of 100 documents, learned asymmetric alpha).
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Print the 30 highest-weighted words of each of the 3 topics
pprint(lda_model.print_topics(num_words=30))

# Lazily-evaluated model output for every document of the corpus
doc_lda = lda_model[corpus]

# Build the interactive pyLDAvis view of the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

vis #n_topic = 3

In [None]:
#save results
pyLDAvis.save_html(vis, 'lda.html')


def _dominant_topic(topic_probs):
    """Return the id of the most probable topic in ``[(topic_id, prob), ...]``.

    Returns -1 when the list is empty. On ties the first listed topic wins.
    """
    if not topic_probs:
        return -1
    best_id, _best_prob = max(topic_probs, key=lambda pair: pair[1])
    return int(best_id)


#cluster label = the topic with the highest probability
# The previous expression, np.array(...).argmax(axis=0)[1], returned the
# *row position* of the best entry, not its topic id. gensim omits topics
# whose probability is below the model's minimum_probability, so positions
# and ids diverge (e.g. [(1, 0.3), (2, 0.7)] gave label 1 instead of 2).
# Each item of doc_lda is indexed with [0] exactly as before to obtain the
# document's (topic_id, probability) list.
data['LDA_S12345'] = [_dominant_topic(doc_result[0]) for doc_result in doc_lda]

data.to_csv("LDA-3.csv")