In [25]:
import csv
import os
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from mlxtend.frequent_patterns import fpgrowth
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Load only the two text columns from the JIRA export, then display (rows, columns).
df = pd.read_csv(
    "../data/JIRA_Operation_issue.csv",
    usecols=['Summary', 'Description'],
)
df.shape

(290, 2)

In [27]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Summary,Description
0,5718391 - [Prod]| E38 |[Measurement/Activity M...,"Hello team,\n\nDetails:\nDB: mes_e38_01\nServe..."
1,Machine Gantt Report V2 - Remove completed SN ...,"Hello Team,\n\nBelieve a misunderstanding happ..."
2,[External Non Medical PROD] - Query on MFG Lin...,Site: Bedford PROD\n\nMESR Server: [42qrpt1.42...
3,5672686 - [PROD [S13 MES16] [LABEL-ENGINE] -SC...,"Hello Antonio,\n\nDetails:\nLabel name: Scrap_..."
4,Issue with split and removal of components com...,


In [28]:
class LemmaTokenizer(object):
    """Callable that cleans, tokenizes and lemmatizes a single document.

    Instances are meant to be used as a tokenizer callable: they take one
    document (a string) and return a list of lemmas.

    Parameters
    ----------
    stop_words : iterable of str, optional
        Words to discard after lemmatization (exact, case-sensitive match).
        When omitted, NLTK's English stopword list is used; it is loaded on
        the first call, so the ``stopwords`` corpus only has to be available
        by then, not when the tokenizer is constructed.
    """

    # Any run of characters that are not ASCII letters (punctuation, digits,
    # symbols, non-ASCII text) is treated as a separator.
    _NON_LETTERS = re.compile(r"[^a-zA-Z]+")

    def __init__(self, stop_words=None):
        self.lemmatizer = WordNetLemmatizer()
        # None means "use NLTK's English list", resolved lazily in __call__.
        self._stop_words = None if stop_words is None else frozenset(stop_words)

    def __call__(self, document):
        """Return the list of lemmas kept from ``document``.

        Parameters
        ----------
        document : str
            Raw text of one document.

        Returns
        -------
        list of str
            Lemmatized tokens longer than two characters that are not
            stopwords, in order of appearance.
        """
        if self._stop_words is None:
            # `stopwords` is the NLTK corpus reader, not a container: the word
            # list has to be requested explicitly before membership tests.
            self._stop_words = frozenset(stopwords.words("english"))

        # Pre-processing of one document at a time: punctuation, numbers and
        # special characters are all replaced by a single space.
        document = self._NON_LETTERS.sub(' ', document)

        lemmas = []
        # The document is a string up to now; from here on we work word by word.
        for token in word_tokenize(document):
            # Lemmatizing
            token = self.lemmatizer.lemmatize(token)

            # Removing stopwords and very short tokens
            if token not in self._stop_words and len(token) > 2:
                lemmas.append(token)
        return lemmas

In [29]:
def generate_wordclouds(X, in_X_tfidf, k, in_word_positions):
    """Cluster ``X`` with k-means and plot one word cloud per cluster.

    The average silhouette score of the clustering is printed, then for each
    cluster the term weights of its rows are summed and the highest-weighted
    terms are drawn as a word cloud.

    Parameters
    ----------
    X : array-like or sparse matrix
        Data passed to ``KMeans.fit_predict`` and ``silhouette_score``.
    in_X_tfidf : matrix
        Term weights. Rows are selected with a boolean mask built from the
        cluster labels, so it needs one row per row of ``X``.
        NOTE(review): presumably the TF-IDF matrix ``X`` was derived from -
        confirm against the caller.
    k : int
        Number of clusters.
    in_word_positions : mapping or sequence
        Indexed by column position to obtain the term shown in the cloud.

    Returns
    -------
    set
        The distinct cluster labels assigned by k-means.
    """
    # Clustering
    # `n_jobs` is not passed: KMeans deprecated it in scikit-learn 0.23 and
    # removed it in 1.0, where it raises TypeError. `n_init=10` pins the
    # historical default so results do not depend on the installed version.
    in_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    in_y_pred = in_model.fit_predict(X)
    in_cluster_ids = set(in_y_pred)
    silhouette_avg = silhouette_score(X, in_y_pred)
    print("For n_clusters =", k, "The average silhouette_score is :", silhouette_avg)

    # Number of words with highest tfidf score to display
    top_count = 100

    # Sorted so the figures always appear in cluster-id order.
    for in_cluster_id in sorted(in_cluster_ids):
        # compute the total tfidf for each term in the cluster
        in_tfidf = in_X_tfidf[in_y_pred == in_cluster_id]
        tfidf_sum = np.sum(in_tfidf, axis=0)
        # flatten the per-term totals to a 1-D array
        tfidf_sum = np.asarray(tfidf_sum).reshape(-1)
        top_indices = tfidf_sum.argsort()[-top_count:]
        term_weights = {in_word_positions[in_idx]: tfidf_sum[in_idx] for in_idx in top_indices}
        wc = WordCloud(width=1200, height=800, background_color="white")
        wordcloud = wc.generate_from_frequencies(term_weights)
        fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        fig.suptitle(f"Cluster {in_cluster_id}")
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    return in_cluster_ids

# Helper: read a custom stopword list from a CSV file
def custom_import_stopwords(filename):
    """Read stopwords from the first column of a UTF-8 CSV file.

    The first row is treated as a header and skipped. Blank rows and rows
    whose first cell is empty are ignored; surrounding whitespace is removed
    from each word. The number of imported words is printed.

    Parameters
    ----------
    filename : str or path-like
        Path to the CSV file.

    Returns
    -------
    list of str
        The stopwords, in file order.
    """
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, encoding="utf8", newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        # `row` is [] for a blank line, so guard before indexing.
        in_stopword_list = [row[0].strip() for row in reader if row and row[0].strip()]

    print(f"{len(in_stopword_list)} stopwords imported")
    return in_stopword_list

In [30]:
# Fetch every NLTK resource this notebook needs, not only the stopword list:
# word_tokenize needs the Punkt models ("punkt" on older NLTK releases,
# "punkt_tab" on newer ones) and WordNetLemmatizer needs "wordnet".
# Resources that are already up to date are not downloaded again.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /home/ryan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [31]:
# Load the project-specific stopwords from the CSV file (first row is skipped by the helper).
custom_stopwords = custom_import_stopwords('../data/custom_stopwords.csv')

9 stopwords imported


In [32]:
# Full stopword list: the custom entries first, followed by NLTK's English list.
all_stopwords = [*custom_stopwords, *stopwords.words("english")]

In [34]:
# Select the 'Summary' column and display it.
# (The previous last line, `contents.to`, was a truncated attribute access
# that raises AttributeError when the cell is re-run.)
contents = df['Summary']
contents

0      5718391 - [Prod]| E38 |[Measurement/Activity M...
1      Machine Gantt Report V2 - Remove completed SN ...
2      [External Non Medical PROD] - Query on MFG Lin...
3      5672686 - [PROD [S13 MES16] [LABEL-ENGINE] -SC...
4      Issue with split and removal of components com...
                             ...                        
285       Tomcat restart when adding new plants into PTS
286          Print queues not registered in Label Engine
287                            Issues restoring database
288    Print queue not configured properly in the pro...
289       Problems when creating or patching MESR DBs.  
Name: Summary, Length: 290, dtype: object

In [None]:
# TF-IDF vectorizer that delegates tokenization to the custom LemmaTokenizer.
# NOTE(review): input='filename' makes the vectorizer treat each item it is
# given as a path to read from disk. If it is meant to be fitted on in-memory
# strings such as `contents`, this should be input='content' - confirm against
# the cell that calls fit/fit_transform.
vectorizer = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    input='filename',
)