In [2]:

from text_preprocessing import text_processing

import string
import sys

from pathlib import Path
from gensim import corpora, models
import pandas as pd
from nltk import WordNetLemmatizer
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import scipy.cluster.hierarchy as sch
from nltk.corpus import *
from nltk.tokenize import *
import contractions
from sklearn.feature_extraction import *
from sklearn.metrics import *
import re
import nltk



def return_dict_corpus(train_df):
    texts = list(train_df.lemmatize_title_w_pos.values)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print(f'number of unique tokens: {len(dictionary)}')
    print(f'number of documents : {len(corpus)}')

    return texts, dictionary, corpus


path = Path(r'../Tawosi_Dataset')

train, valid, test = pd.read_csv(path / 'DM-train.csv'), pd.read_csv(path / 'DM-valid.csv'), pd.read_csv(
    path / 'DM-test.csv')

data = pd.concat([train, valid])

texts = []

# print(train)

train = text_processing(train)

texts, dictionary, corpus = return_dict_corpus(train)

num_topics = 20

lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    chunksize=2000,
    iterations=400,
    passes=20,
    per_word_topics=True,
    alpha='auto',
    eta='auto',
    eval_every=True
)

topics = lda_model[corpus]

train_ik = train.issuekey.values


test_prob = {}

topics_number = set(f'topic_{i}' for i in range(num_topics))

for key, prob in zip(train_ik, topics):
    top_preds = {}

    for (topic_no, value) in prob[0]:
        top_preds[f'topic_{topic_no}'] = value


    for tn in topics_number:
        if not top_preds.get(tn, None):
            top_preds[tn] = 0.0

    test_prob[key] = top_preds


prob_df_cols = sorted(list(topics_number), key=lambda x: int(x.split('_')[1]))

prob_df = pd.DataFrame.from_dict(test_prob, orient='index')

prob_df.index.name = 'issuekey'

prob_df.to_csv('prob_df.csv')

dendrogram = sch.dendrogram(sch.linkage(prob_df.values, method='ward'), no_plot=True)

cn = len(set(dendrogram['color_list'])) - 1

print(f"cluster no: {cn}")

ac_m = AgglomerativeClustering(n_clusters=cn, affinity='euclidean', linkage='ward')

preds = ac_m.fit_predict(prob_df.values)

prob_df['labels'] = preds

print(prob_df.labels.value_counts())



number of unique tokens: 4183
number of documents : 3227
cluster no: 9
0    1508
1     327
7     246
4     222
3     211
2     194
5     190
6     187
8     142
Name: labels, dtype: int64


In [4]:
test_df = text_processing(test)
t_texts, t_dictionary, t_corpus = return_dict_corpus(test)

number of unique tokens: 1936
number of documents : 1078


In [5]:
t_topics = lda_model[t_corpus[0]]

In [13]:
t_topics

([(3, 0.17525354),
  (4, 0.5068754),
  (6, 0.010642398),
  (11, 0.010100877),
  (16, 0.010015527),
  (18, 0.17296453)],
 [(0, [4]), (1, [4]), (2, [4]), (3, [18]), (4, [3])],
 [(0, [(4, 0.9999258)]),
  (1, [(4, 0.9999708)]),
  (2, [(4, 0.9997431)]),
  (3, [(18, 0.9996705)]),
  (4, [(3, 0.99963593)])])

In [14]:
def return_prob_list(probs):
    ind_prob = {}

    for (topic_no, prob) in probs[0]:
        ind_prob[f'topic_{topic_no}'] = prob

    for tn in topics_number:
        if not ind_prob.get(tn, None): ind_prob[tn] = 0.0

    return ind_prob


In [15]:
return_prob_list(t_topics)

{'topic_3': 0.17525354,
 'topic_4': 0.5068754,
 'topic_6': 0.010642398,
 'topic_11': 0.010100877,
 'topic_16': 0.010015527,
 'topic_18': 0.17296453,
 'topic_12': 0.0,
 'topic_19': 0.0,
 'topic_15': 0.0,
 'topic_9': 0.0,
 'topic_14': 0.0,
 'topic_10': 0.0,
 'topic_0': 0.0,
 'topic_17': 0.0,
 'topic_13': 0.0,
 'topic_1': 0.0,
 'topic_2': 0.0,
 'topic_5': 0.0,
 'topic_7': 0.0,
 'topic_8': 0.0}

In [26]:
inp = list(dict(sorted(return_prob_list(t_topics).items(), key= lambda item: int(item[0].split('_')[1]))).values())

In [47]:
inps = {
    'inp-6767': return_prob_list(t_topics)
}

inp_df = pd.DataFrame.from_dict(inps, orient='index')
inp_df.index.name = 'issue_key'

In [48]:
# ac_m.fit_predict(inp_df.values)

In [49]:
inp_df.values

array([[0.17525354, 0.5068754 , 0.0106424 , 0.01010088, 0.01001553,
        0.17296453, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [97]:
prob_df['index_number'] = [i for i in range(len(prob_df))]
src_df = prob_df.drop(['labels', 'index_number'], axis=1)

In [98]:
src_df.values

array([[0.17525356, 0.50687534, 0.01064244, ..., 0.        , 0.        ,
        0.        ],
       [0.01109526, 0.0104981 , 0.01275935, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.42361313, 0.01064141, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01843554, 0.01744332, 0.02120055, ..., 0.01389301, 0.01601211,
        0.67558736],
       [0.34927103, 0.01744479, 0.35192987, ..., 0.01389418, 0.01601346,
        0.01399854]])

In [99]:
arr2 = inp_df.values[0]

In [100]:
euclidean_distance = {ind: np.linalg.norm(arr1 - arr2) for ind, arr1 in enumerate(src_df.values)}

In [101]:
dict(sorted(euclidean_distance.items(), key=lambda x: float(x[1]), reverse=True))

{1429: 1.0880053150253521,
 1619: 1.0829137379533318,
 1430: 1.0776496469345949,
 1624: 1.0718388243096886,
 1807: 1.0718080541668644,
 1809: 1.0718060264922202,
 378: 1.0651420104184561,
 1588: 1.064393424171033,
 1530: 1.064348480130638,
 860: 1.063690430520844,
 906: 1.0568276720726424,
 1332: 1.05681204270801,
 1929: 1.0557050823428136,
 1067: 1.0554981266080672,
 1313: 1.055497320384136,
 62: 1.055490568268364,
 188: 1.0553737694728629,
 1331: 1.0552220596770958,
 1529: 1.05521238601683,
 228: 1.0552112775788538,
 1307: 1.0550267795238037,
 1601: 1.0478321693302182,
 1705: 1.0478319683530113,
 223: 1.0478262405092154,
 130: 1.0478040706557559,
 403: 1.047799397985654,
 1832: 1.044846442251425,
 339: 1.0447408605582071,
 14: 1.0443160690386204,
 1071: 1.044175089684111,
 1688: 1.0440423956328317,
 67: 1.0440356733212135,
 413: 1.0439176846038256,
 479: 1.043869126216511,
 1967: 1.043853575689962,
 766: 1.043852572433461,
 1306: 1.0438524721078328,
 1010: 1.0438080282451114,
 282: 1

In [102]:
prob_df[prob_df.index_number==1429]

Unnamed: 0_level_0,topic_3,topic_4,topic_6,topic_11,topic_16,topic_18,topic_12,topic_19,topic_15,topic_9,...,topic_0,topic_17,topic_13,topic_1,topic_2,topic_5,topic_7,topic_8,labels,index_number
issuekey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DM-7275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.93053,0.0,0.0,0.0,0.0,0.0,0.0,0,1429
