# SOMBR: Using SOM to Triage Software Bug Reports

## Setup

### Install specific versions of packages

```
matplotlib                3.6.3                    pypi_0    pypi
scikit-learn              1.2.0                    pypi_0    pypi
torch                     1.13.1                   pypi_0    pypi
torchvision               0.14.1                   pypi_0    pypi
gensim                    4.3.0                    pypi_0    pypi
sentence-transformers     2.2.2                    pypi_0    pypi
transformers              4.25.1                   pypi_0    pypi
minisom                   2.3.0                    pypi_0    pypi
```

In [1]:
# !pip freeze

In [2]:
!python --version

Python 3.10.11


In [3]:
# !pip install scikit-learn==1.2.0
# !pip install matplotlib==3.6.3
# !pip install torch
# !pip install torchvision


In [4]:
# !pip install gensim==4.3.0


In [5]:
# !pip install transformers==4.25.1
# !pip install sentence-transformers==2.2.2
# !pip install minisom==2.3.0

In [6]:
# !pip install nltk

### Import package dependencies

In [7]:
import os
import re
import sys
import sqlite3
import pandas as pd
import numpy as np
# from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import time
import datetime
 

In [8]:

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords


In [9]:
# Make project modules importable: append the parent directory, the current
# working directory and /gcs to sys.path when they are not already listed.
# Each directory is resolved to an absolute path before the membership test.
for _label, _dir in (('parent', '..'), ('cwd', '.'), ('gcs', '/gcs')):
    _abs_dir = os.path.abspath(_dir)
    if _abs_dir not in sys.path:
        sys.path.append(_abs_dir)
        # Report which entry was added (the '.' case used to be mislabelled as "parent").
        print('{} not in path, appending'.format(_label))

parent not in path, appending
gcs not in path, appending


In [10]:
sys.path

['/home/far/work',
 '/opt/conda/lib/python310.zip',
 '/opt/conda/lib/python3.10',
 '/opt/conda/lib/python3.10/lib-dynload',
 '',
 '/home/far/.local/lib/python3.10/site-packages',
 '/opt/conda/lib/python3.10/site-packages',
 '/home/far',
 '/gcs']

In [11]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


In [12]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
lemmatizer = WordNetLemmatizer()

In [13]:
import torch

In [14]:
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel

In [15]:
import gensim
import gensim.downloader as api
from gensim.utils import save_as_line_sentence

from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence

from gensim.models.doc2vec import Doc2Vec
from gensim.models import KeyedVectors
from gensim import models as gensim_models


print(gensim.models.word2vec.CORPUSFILE_VERSION)  # must be >= 0, i.e. optimized compiled version


1


In [16]:
print(gensim.__version__, gensim.__file__)


4.3.1 /opt/conda/lib/python3.10/site-packages/gensim/__init__.py


In [17]:
from minisom import MiniSom

In [18]:
from IPython.display import Audio

sound_file = './../../../../SOUNDS/WAV_Sharp.wav'

In [19]:
def timenow():
    """Return the current local time as an ``HH:MM:SS`` string."""
    now = datetime.datetime.now()
    return f"{now:%H:%M:%S}"

def play_done(s=sound_file):
    """Print the current time to mark that a long-running cell has finished.

    Args:
        s: Path of a sound file. It is only referenced by the disabled audio
            playback line below, so it currently has no effect.

    Returns:
        None.
    """
    finished_at = timenow()
    print(finished_at)
    # display(Audio(s, autoplay=True))
    return None

In [20]:
model_base = "./models/"
csv_base = "./csv/"

### Load pre-trained models

### BERT

In [21]:
## BERT
# Checkpoint name shared by the tokenizer and the model so the vocabulary
# and the weights always come from the same source.
bert_modelname = "nreimers/MiniLM-L6-H384-uncased"
bert_tokzr = BertTokenizer.from_pretrained(bert_modelname)
bert_model = BertModel.from_pretrained(bert_modelname)

### SBERT

In [22]:
## SBERT
sbert_modelname = 'all-MiniLM-L6-v2'
sbert_model = SentenceTransformer(sbert_modelname)

# sbert_model.encode(df['summary'].values)
sbert_model.encode("Duplicate Bug Report").shape


(384,)

### Word2Vec

In [23]:
%%time
## Word2Vec CBOW
cbow_modelpath = model_base + "word2vec_cbow.model"
cbow_model = Word2Vec.load(cbow_modelpath,mmap='r')

CPU times: user 7.53 s, sys: 49.8 ms, total: 7.58 s
Wall time: 7.57 s


In [24]:
## Test it loaded
cbow_model.wv['computer'].shape

(300,)

In [25]:
gcs_base = "./../gcs/"

### Doc2Vec

In [26]:
%%time
## Doc2Vec DBOW
dbow_modelpath = model_base + "doc2vec_dbow.model"
dbow_model = Doc2Vec.load(dbow_modelpath, mmap='r')

CPU times: user 5.48 s, sys: 98.4 ms, total: 5.58 s
Wall time: 5.59 s


In [27]:
# CPU times: user 7.2 s, sys: 202 ms, total: 7.4 s
# Wall time: 7.52 s

In [28]:
## Test it loaded
dbow_model.infer_vector("Duplicate Bug Report".split()).shape

(300,)

### Define variables

In [29]:
# SQL that selects every column of every row of the `issue` table.
BR_QUERY_ALL = """SELECT * from issue"""

# Issue columns that are kept when filtered data is written back to CSV.
KEEP_ISSUE_COLS = ['issue_id', 'type', 'created_date', 'resolved_date', 'summary', 'description', 'priority', 'status', 'resolution', 'assignee_username']
# Resolution values an issue must have to survive filtering.
KEEP_RESOLUTION = ['Done', 'Duplicate', 'Fixed', 'Resolved']
# Status values an issue must have to survive filtering.
KEEP_STATUS = ['Resolved', 'Closed']
# NOTE(review): the active filter code compares against the literal 'Bug'
# rather than reading this list - confirm the two are meant to stay in sync.
KEEP_TYPE = ['Bug']


# NOTE(review): in the code visible here this threshold is only referenced by
# the commented-out get_filtered_db; presumably a minimum filtered-row count.
THRESHOLD_CORPUS_SIZE = 3000

KEEP_DB_COLS_QUERY = """
SELECT 
  {} 
from issue
""".format((",\n").join(KEEP_ISSUE_COLS))

In [30]:
# db_path = "./db"
# db_list = ['railo.sqlite3',
#  'seam2.sqlite3',
#  'kafka.sqlite3',
#  'resteasy.sqlite3',
#  'jboss-tm.sqlite3',
#  'hbase.sqlite3',
#  'hive.sqlite3',
#  'hornetq.sqlite3',
#  'zookeeper.sqlite3',
#  'jbpm.sqlite3',
#  'teiid.sqlite3',
#  'wildfly.sqlite3',
#  'izpack.sqlite3',
#  'weld.sqlite3',
#  'log4j2.sqlite3',
#  'lucene.sqlite3',
#  'infinispan.sqlite3',
#  'hadoop.sqlite3',
#  'flink.sqlite3',
#  'errai.sqlite3',
#  'axis2.sqlite3',
#  'jbehave.sqlite3',
#  'groovy.sqlite3',
#  'pig.sqlite3',
#  'cassandra.sqlite3',
#  'archiva.sqlite3',
#  'drools.sqlite3',
#  'derby.sqlite3',
#  'keycloak.sqlite3',
#  'hibernate.sqlite3',
#  'switchyard.sqlite3',
#  'maven.sqlite3',
#  'spark.sqlite3']

# db_list.sort()
# db_names = [i.split(".")[0] for i in db_list]


## Extract text from DB

### Filter data

In [31]:
# def get_filtered_db(db_list, db_path_base, csv_path_base, csv_filtered_path):
#     f_counter = {}
#     f_counts = {}
    

#     for db_i in tqdm(db_list):
#         db_path = os.path.join(db_path_base, db_i)
        
#         con = sqlite3.connect(db_path)
#         cur = con.cursor()

#         db_name = db_i.split(".")[0]
#         df_star = pd.read_sql_query(BR_QUERY_ALL, con)
#         df_star.sort_values(by="resolved_date", inplace=True)
#         df_star.reset_index(drop=True, inplace=True)
        
#         df_star['isbug']          = df_star.apply(lambda row: (1 if (row['type']=='Bug') else 0), axis=1 )
#         df_star['statusdone']     = df_star.apply(lambda row: (1 if (row['status'] in KEEP_STATUS) else 0), axis=1 )
#         df_star['resolutiondone'] = df_star.apply(lambda row: (1 if (row['resolution'] in KEEP_RESOLUTION) else 0), axis=1 )
#         df_star['hasassigned']    = (~df_star['assignee_username'].isna()).astype(int)
        
#         df_filt = df_star[(df_star['isbug']==1) & 
#                 (df_star['statusdone']==1) &
#                 (df_star['resolutiondone']==1) & 
#                 (df_star['hasassigned']==1)
#                ]
#         df_filt.reset_index(drop=True, inplace=True)

#         if df_filt.shape[0] > THRESHOLD_CORPUS_SIZE:
#             ## Keep all rows
#             csv_name = db_name + "_issue.csv"

#             csv_path_b = os.path.join(csv_path_base, csv_name)
#             df_star[KEEP_ISSUE_COLS].to_csv(csv_path_b, index=False)
            
#             ## Keep filtered rows
#             csv_name = db_name + "_filtered.csv"

#             csv_path_f = os.path.join(csv_filtered_path, csv_name)
#             df_filt[KEEP_ISSUE_COLS].to_csv(csv_path_f, index=False)


#             f_counter[db_name] = df_filt[KEEP_ISSUE_COLS]
    
#     return f_counter






In [32]:
def get_filtered_from_csv(csv_path_base, csv_path_dest):
    """Filter every extracted issue CSV down to resolved, assigned bug reports.

    Each regular, non-hidden file in ``csv_path_base`` is read, sorted by
    ``resolved_date`` and reduced to the rows whose ``type`` is in
    ``KEEP_TYPE``, whose ``status`` is in ``KEEP_STATUS``, whose
    ``resolution`` is in ``KEEP_RESOLUTION`` and whose ``assignee_username``
    is not null. The ``KEEP_ISSUE_COLS`` columns of those rows are written to
    ``<csv_path_dest>/<name>_filtered.csv``, where ``<name>`` is the file name
    up to its first ``.`` or ``_``.

    Args:
        csv_path_base: Directory containing the extracted CSV files.
        csv_path_dest: Existing directory that receives the filtered CSVs.

    Returns:
        dict mapping ``<name>`` to the filtered DataFrame that was written.
    """
    filtered_by_name = {}

    # os.path.join keeps the existence check and the read on the same path,
    # whether or not csv_path_base ends with a separator. Hidden files
    # (e.g. editor/OS artefacts) are skipped, as in get_filtered_csv.
    files = [
        f for f in os.listdir(csv_path_base)
        if not f.startswith('.') and os.path.isfile(os.path.join(csv_path_base, f))
    ]

    tqdm_csv = tqdm(files)
    for f in tqdm_csv:
        tqdm_csv.set_description(f)
        db_name = f.split(".")[0].split("_")[0]

        df_star = (
            pd.read_csv(os.path.join(csv_path_base, f))
            .sort_values(by="resolved_date")
            .reset_index(drop=True)
        )

        # Vectorised row selection; missing values never match isin(), which
        # mirrors the former row-by-row `in` checks.
        keep_mask = (
            df_star['type'].isin(KEEP_TYPE)
            & df_star['status'].isin(KEEP_STATUS)
            & df_star['resolution'].isin(KEEP_RESOLUTION)
            & df_star['assignee_username'].notna()
        )
        df_filt = df_star.loc[keep_mask, KEEP_ISSUE_COLS].reset_index(drop=True)

        ## Keep filtered rows
        csv_path_f = os.path.join(csv_path_dest, db_name + "_filtered.csv")
        df_filt.to_csv(csv_path_f, index=False)

        filtered_by_name[db_name] = df_filt

    return filtered_by_name


extract_base_path = csv_base + "DB_extract/"
csv_path_dest = csv_base + "DB_filtered/"


# get_filtered_from_csv(extract_base_path, csv_path_dest)


In [33]:
gcs_base

'./../gcs/'

In [34]:
filtered_base_path = csv_base + "DB_filtered/"
csv_files = os.listdir(filtered_base_path)
files = [f for f in csv_files if os.path.isfile(filtered_base_path+'/'+f)] #Filtering only the files.
files = [f for f in files if not f.startswith('.')]
files

['lucene_filtered.csv',
 'hive_filtered.csv',
 'infinispan_filtered.csv',
 'spark_filtered.csv',
 'jbpm_filtered.csv',
 'hibernate_filtered.csv',
 'cassandra_filtered.csv',
 'hadoop_filtered.csv',
 'groovy_filtered.csv',
 'hbase_filtered.csv']

In [35]:
corpus_names = [f.split("_")[0] for f in files]
corpus_names

['lucene',
 'hive',
 'infinispan',
 'spark',
 'jbpm',
 'hibernate',
 'cassandra',
 'hadoop',
 'groovy',
 'hbase']

In [36]:
def get_filtered_csv(filtered_base_path):
    """Load every filtered issue CSV in a directory.

    Args:
        filtered_base_path: Directory holding the ``*_filtered.csv`` files.
            A trailing separator is optional.

    Returns:
        dict mapping each file's name up to its first ``.`` or ``_`` to the
        DataFrame read from that file. Hidden files (names starting with
        ``.``) and non-files are ignored.
    """
    frames_by_name = {}

    # Build every path with os.path.join so the isfile check and the read
    # cannot disagree when the base path has no trailing slash.
    files = [
        f for f in os.listdir(filtered_base_path)
        if not f.startswith('.') and os.path.isfile(os.path.join(filtered_base_path, f))
    ]

    tqdm_csv = tqdm(files)
    for f in tqdm_csv:
        tqdm_csv.set_description(f)
        db_name = f.split(".")[0].split("_")[0]
        frames_by_name[db_name] = pd.read_csv(os.path.join(filtered_base_path, f))

    return frames_by_name


In [37]:
filtered_base_path = csv_base + "DB_filtered/"
filtered_csv = get_filtered_csv(filtered_base_path)

  0%|          | 0/10 [00:00<?, ?it/s]

In [38]:
cass_df = filtered_csv['cassandra'].copy()

## Vectorizer functions

In [39]:
# NOTE(review): KEEP_META_COLS is not referenced by any code visible in this
# part of the notebook - confirm it is still needed.
KEEP_META_COLS = ["summary","issue_id", "assignee_username", "resolved_date"]
# Column names passed to the vectorizers as xcol (text) and ycol (label).
VECTORIZE_COL = "summary"
LABEL_COL = "assignee_username"
# TEST_PCT is what split_and_label_df hands to train_test_split as test_size.
TRAIN_PCT = .7
TEST_PCT = 1 - TRAIN_PCT
RANDOM_STATE = 0

# NOTE(review): TFIDF_MIN_DF is defined but vectorize_tfidf builds
# TfidfVectorizer() with default arguments - confirm whether min_df was
# meant to be applied.
TFIDF_MIN_DF = 0.005

In [40]:
def split_and_label_df(df, xcol, ycol):
    """Split a frame into train/test parts in row order and encode the labels.

    The split is not shuffled and uses ``TEST_PCT`` as the test fraction.
    Labels are ordinal-encoded with an encoder fitted on the training labels
    only; labels that occur only in the test part are encoded as ``-1``.

    Args:
        df: Source DataFrame.
        xcol: Name of the feature (text) column.
        ycol: Name of the label column.

    Returns:
        ``(X_train, X_test, y_train_encoded, y_test_encoded)`` where the X
        parts are single-column DataFrames and the y parts are 2-D arrays with
        one column.
    """
    features = df[[xcol]]
    labels = df[ycol]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        shuffle=False,
        test_size=TEST_PCT,
        random_state=RANDOM_STATE,
    )

    def as_column(values):
        # The encoder expects a 2-D (n_samples, 1) array.
        return np.array(values).reshape(-1, 1)

    encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                             unknown_value=-1)
    encoder.fit(as_column(y_train))

    train_codes = encoder.transform(as_column(y_train))
    test_codes = encoder.transform(as_column(y_test))

    return X_train, X_test, train_codes, test_codes



### Vectorizer: TFIDF

In [41]:

def no_pipe_fn(x):
    """Identity preprocessing step: hand the input back unchanged."""
    unchanged = x
    return unchanged


def vectorize_tfidf(df, xcol, ycol, split=True, pp_fn=no_pipe_fn):
    """Turn a text column into dense TF-IDF vectors with encoded labels.

    Args:
        df: Source DataFrame.
        xcol: Name of the text column.
        ycol: Name of the label column.
        split: When true, split via ``split_and_label_df`` and fit the
            vectorizer on the training texts only; otherwise fit on all rows.
        pp_fn: Preprocessing function applied to every text before
            vectorizing.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``. The X values are dense arrays.
    """
    if not split:
        all_texts = df[xcol].apply(lambda text: pp_fn(text))
        X_ = TfidfVectorizer().fit_transform(all_texts).toarray()

        label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1)
        y_ = label_encoder.fit_transform(np.array(df[ycol]).reshape(-1, 1))
        return X_, y_

    X_train, X_test, y_train_encoded, y_test_encoded = split_and_label_df(df, xcol, ycol)

    vectorizer = TfidfVectorizer()

    train_texts = X_train[xcol].apply(lambda text: pp_fn(text))
    test_texts = X_test[xcol].apply(lambda text: pp_fn(text))

    # Vocabulary and IDF weights come from the training texts only.
    vectorizer.fit(train_texts)

    train_matrix = vectorizer.transform(train_texts).toarray()
    test_matrix = vectorizer.transform(test_texts).toarray()

    return train_matrix, test_matrix, y_train_encoded, y_test_encoded

### Vectorizer: Word2Vec

In [42]:
def get_cbow_vectors(w, model):
    """Look up the embedding of one token.

    Args:
        w: Token to look up.
        model: Object exposing ``model.wv[token]`` and
            ``model.wv.vector_size`` (e.g. a gensim ``Word2Vec`` model).

    Returns:
        The token's vector, or a zero vector of length
        ``model.wv.vector_size`` when the lookup raises ``KeyError``.
    """
    try:
        return model.wv[w]
    except KeyError:
        # Size the fallback from the model instead of assuming 300 dimensions.
        return np.zeros((model.wv.vector_size,))

def get_cbow_sentence(s, model):
    """Embed a sentence as the mean of its token vectors.

    The sentence is split on single spaces; tokens the model does not know
    contribute zero vectors to the mean.

    Args:
        s: Sentence text.
        model: Embedding model passed through to ``get_cbow_vectors``.

    Returns:
        1-D array holding the element-wise mean of the token vectors.
    """
    # Use the model supplied by the caller rather than the global cbow_model.
    vec = np.array([get_cbow_vectors(token, model) for token in s.split(" ")])
    return np.mean(vec, axis=0)




In [43]:
def vectorize_w2v(df, xcol, ycol, model, split=True, pp_fn=no_pipe_fn):
    """Embed a text column as averaged word vectors with encoded labels.

    Args:
        df: Source DataFrame.
        xcol: Name of the text column.
        ycol: Name of the label column.
        model: Word-embedding model handed to ``get_cbow_sentence``.
        split: When true, split via ``split_and_label_df`` first.
        pp_fn: Preprocessing function applied to every text before embedding.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``.
    """
    def embed(texts):
        # One averaged sentence vector per text, stacked into an array.
        return np.array([get_cbow_sentence(sentence, model) for sentence in texts])

    if not split:
        all_texts = df[xcol].apply(lambda text: pp_fn(text))
        X_ = embed(all_texts)

        label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1)
        y_ = label_encoder.fit_transform(np.array(df[ycol]).reshape(-1, 1))
        return X_, y_

    X_train, X_test, y_train_encoded, y_test_encoded = split_and_label_df(df, xcol, ycol)

    train_texts = X_train[xcol].apply(lambda text: pp_fn(text))
    test_texts = X_test[xcol].apply(lambda text: pp_fn(text))

    train_vectors = embed(train_texts)
    test_vectors = embed(test_texts)

    return train_vectors, test_vectors, y_train_encoded, y_test_encoded

### Vectorizer: Doc2Vec

In [44]:
# dbow_model.infer_vector("Duplicate Bug Report Willthisbreak".split())


In [45]:
def vectorize_d2v(df, xcol, ycol, model, split=True, pp_fn=no_pipe_fn):
    """Embed a text column with a document-embedding model and encode labels.

    Each preprocessed text is split on whitespace and passed to
    ``model.infer_vector``.

    Args:
        df: Source DataFrame.
        xcol: Name of the text column.
        ycol: Name of the label column.
        model: Model exposing ``infer_vector(tokens)``.
        split: When true, split via ``split_and_label_df`` first.
        pp_fn: Preprocessing function applied to every text before embedding.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``.
    """
    def infer_all(texts):
        return np.array([model.infer_vector(doc.split()) for doc in texts])

    if not split:
        all_texts = df[xcol].apply(lambda text: pp_fn(text))
        X_ = infer_all(all_texts)

        label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1)
        y_ = label_encoder.fit_transform(np.array(df[ycol]).reshape(-1, 1))
        return X_, y_

    X_train, X_test, y_train_encoded, y_test_encoded = split_and_label_df(df, xcol, ycol)

    train_texts = X_train[xcol].apply(lambda text: pp_fn(text))
    test_texts = X_test[xcol].apply(lambda text: pp_fn(text))

    train_vectors = infer_all(train_texts)
    test_vectors = infer_all(test_texts)

    return train_vectors, test_vectors, y_train_encoded, y_test_encoded

### Vectorizer: BERT

In [46]:
# bert_tokenized = bert_tokzr.batch_encode_plus(cass_df['summary'].to_list(), 
#                                   add_special_tokens=True, 
#                                   padding='longest'
#                                 )
# bert_encoded = np.array(bert_tokenized['input_ids'])

# bert_input_ids = torch.tensor(bert_encoded)
# with torch.no_grad():
#     last_hidden_states = bert_model(bert_input_ids)
# bert_features = last_hidden_states[0][:,0,:].numpy()


In [47]:
def vectorize_bert(df, xcol, ycol, tokenizer, model, split=True, pp_fn=no_pipe_fn, batch_size=64):
    """Embed a text column with a BERT-style model and encode the labels.

    Every text is represented by the hidden state at the first token position
    of the model's first output.

    Args:
        df: Source DataFrame.
        xcol: Name of the text column.
        ycol: Name of the label column.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        model: Model called with ``input_ids`` and ``attention_mask``.
        split: When true, split via ``split_and_label_df`` first.
        pp_fn: Preprocessing function applied to every text before encoding.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``.
    """

    def bertify(X):
        texts = X.to_list()
        chunks = []

        # Encode in fixed-size batches so memory use does not grow with the
        # corpus size and padding is only as long as each batch needs.
        for start in range(0, len(texts), batch_size):
            X_tkd = tokenizer.batch_encode_plus(texts[start:start + batch_size],
                                      add_special_tokens=True,
                                      padding='longest',
                                      truncation=True,
                                      return_tensors='pt'
                                    )

            with torch.no_grad():
                # The attention mask stops the model from attending to the
                # padding tokens added by padding='longest'.
                X_lhs = model(input_ids=X_tkd['input_ids'],
                              attention_mask=X_tkd['attention_mask'])

            chunks.append(X_lhs[0][:,0,:].numpy())

        X_bert = np.concatenate(chunks, axis=0)

        return X_bert

    if split:

        X_train, X_test, y_train_encoded, y_test_encoded = split_and_label_df(df, xcol, ycol)

        X_train_ = X_train[xcol].apply(lambda x: pp_fn(x))
        X_test_  = X_test[xcol].apply(lambda x: pp_fn(x))

        X_train_bert = bertify(X_train_)
        X_test_bert  = bertify(X_test_)

        return X_train_bert, X_test_bert, y_train_encoded, y_test_encoded

    else:

        X_ = df[xcol].apply(lambda x: pp_fn(x))
        X_ = bertify(X_)

        oe = OrdinalEncoder(handle_unknown='use_encoded_value',
                    unknown_value=-1)
        y_ = oe.fit_transform(np.array(df[ycol]).reshape(-1,1))
        return X_, y_

In [48]:
# print(bert_encoded.shape)
# print(bert_features.shape)

### Vectorizer: SBERT

In [49]:
# cass_df_small = cass_df.copy()
# cass_df_small = cass_df_small.iloc[:200,:]

# # Test it works
# # dbow_model.infer_vector("Duplicate Bug Report".split()).shape

# x1, x2, y1, y2 = split_and_label_df(cass_df_small, VECTORIZE_COL, LABEL_COL)


# sbert_model.encode(x1.values.flatten()).shape

# sbert_model.encode(cass_df_small['summary'].values).shape

# cass_df_small[['summary']]

In [50]:
def vectorize_sbert(df, xcol, ycol, model, split=True, pp_fn=no_pipe_fn):
    """Embed a text column with a sentence-embedding model and encode labels.

    Args:
        df: Source DataFrame.
        xcol: Name of the text column.
        ycol: Name of the label column.
        model: Model exposing ``encode(array_of_texts)``.
        split: When true, split via ``split_and_label_df`` first.
        pp_fn: Preprocessing function applied to every text before encoding.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``.
    """
    def encode_texts(texts):
        # The model is given a flat array of strings.
        return model.encode(texts.values.flatten())

    if not split:
        all_texts = df[xcol].apply(lambda text: pp_fn(text))
        X_ = encode_texts(all_texts)

        label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1)
        y_ = label_encoder.fit_transform(np.array(df[ycol]).reshape(-1, 1))
        return X_, y_

    X_train, X_test, y_train_encoded, y_test_encoded = split_and_label_df(df, xcol, ycol)

    train_texts = X_train[xcol].apply(lambda text: pp_fn(text))
    test_texts = X_test[xcol].apply(lambda text: pp_fn(text))

    train_vectors = encode_texts(train_texts)
    test_vectors = encode_texts(test_texts)

    return train_vectors, test_vectors, y_train_encoded, y_test_encoded

In [51]:
# v1, v2, = vectorize_sbert(cass_df_small, VECTORIZE_COL, LABEL_COL, sbert_model, split=False)

## Vectorize Corpora

In [52]:
filtered_csv.keys()

dict_keys(['lucene', 'hive', 'infinispan', 'spark', 'jbpm', 'hibernate', 'cassandra', 'hadoop', 'groovy', 'hbase'])

In [53]:
def vectorize_db(base_path, vectorizer_prefix, vectorizer_fn, *args, **kwargs):
    """Vectorize every corpus in ``filtered_csv`` and write the results as CSV.

    For each corpus ``k`` the vectorizer is run and one file per split is
    written to ``<base_path>/<k>_<vectorizer_prefix>_<split>.csv``, holding
    the feature columns followed by a ``LABEL_COL`` column. Rows whose encoded
    label is ``-1`` are dropped from the train/test files.

    Args:
        base_path: Output directory; created if it does not exist.
        vectorizer_prefix: Vectorizer tag used in the output file names.
        vectorizer_fn: Vectorizer called as ``vectorizer_fn(df, *args, **kwargs)``
            and expected to return ``(X_train, X_test, y_train, y_test)``.
        *args: Positional arguments forwarded to ``vectorizer_fn``.
        **kwargs: Keyword arguments forwarded to ``vectorizer_fn``.
    """
    # A fresh pipeline directory would otherwise make to_csv fail.
    os.makedirs(base_path, exist_ok=True)

    tqdm_bar = tqdm(filtered_csv.keys(), position=1, leave=False)

    for k in tqdm_bar:
        tqdm_bar.set_description("Processing DB: %s" % k)

        df_ = filtered_csv[k]
        X_tr, X_te, y_tr, y_te = vectorizer_fn(df_, *args, **kwargs)

        split_mapper = {
            'train': {'X': X_tr, 'y': y_tr},
            'test': {'X': X_te, 'y': y_te},
            # 'all': None,
        }

        for m, part in split_mapper.items():
            if m == 'all':
                # Only reached when the 'all' entry above is enabled:
                # re-run the vectorizer on the whole frame without splitting.
                splitkwargs = kwargs.copy()
                splitkwargs['split'] = False

                X_all, y_all = vectorizer_fn(df_, *args, **splitkwargs)

                to_concat = pd.concat([pd.DataFrame(X_all),
                                       pd.DataFrame(y_all, columns=[LABEL_COL]).astype(int)],
                                      axis=1)
            else:
                to_concat = pd.concat([pd.DataFrame(part['X']),
                                       pd.DataFrame(part['y'],
                                                    columns=[LABEL_COL]).astype(int)],
                                      axis=1)

                # Drop rows whose label was encoded as -1.
                to_concat = to_concat[to_concat[LABEL_COL]>-1]

            # base/cassandra_w2v_train.csv
            wfname = os.path.join(base_path, "{}_{}_{}.csv".format(k, vectorizer_prefix, m))
            to_concat.to_csv(wfname, index=False)




In [54]:
timenow()

'20:43:46'

### Preprocessing pipeline functions

In [55]:
def remove_digits(t):
    """Delete every run of digits, collapse repeated spaces and trim the ends."""
    without_digits = re.sub(r'\d+', '', t)
    single_spaced = re.sub(' +', ' ', without_digits)
    return single_spaced.strip()


def remove_symbols(t):
    """Strip a fixed set of punctuation characters and trim the ends.

    The characters removed are ``. ; : _ , " ' /``; nothing is inserted in
    their place.
    """
    ## TODO:
    ## consider quotation marks in various flavours
    ## consider converting dot to space? "some.example" => "some example"
    ## consider test case: write(s) or write[s] => writes ? write s ? 

    cleaned = re.sub(r'[\.;:\_,\"\'\/]', '', t)
    return cleaned.strip()

def replace_symbols(t):
    """Replace every run of characters other than ASCII letters, digits and
    spaces with one space, collapse repeated spaces and trim the ends."""
    spaced = re.sub(r'[^A-Za-z0-9 ]+', ' ', t)
    return re.sub(' +', ' ', spaced).strip()

def camelcase_splitter(t):
    """
    Split camel-case identifiers into their component words.

    CamelCaseToComponentWords ==> Camel Case To Component Words

    Runs of capitals are kept together as one word, so acronyms and all-caps
    tokens are not broken into single letters:

    parseXMLFile ==> parse XML File
    LICENSE      ==> LICENSE

    Repeated spaces are collapsed and the ends are trimmed. An empty string
    is returned unchanged.
    """
    if t != '':
        # A word boundary sits either between a lowercase letter/digit and a
        # capital, or between a capital run and a capital that starts a
        # lowercase word (the "L|F" in "XMLFile").
        t = re.sub(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', t)
        t = re.sub(' +', ' ', t)
        return t.strip()
    return t


def make_lowercase(t):
    """
    Return the text converted to lowercase; an empty string is returned as is.
    """
    return t.lower() if t != '' else t

def lemmatize_text(s):
    """
    Returns lemmatized sentence

    The sentence is tokenized and POS-tagged with NLTK. The first letter of
    each tag selects the WordNet part of speech (J, N, V, R) passed to the
    lemmatizer; tokens with any other tag are lemmatized with the
    lemmatizer's default. The lemmas are joined with single spaces.
    """

    pos_tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Tokenize once and iterate the (word, tag) pairs directly instead of
    # tokenizing a second time and pairing by index.
    pos_tuples = nltk.pos_tag(nltk.word_tokenize(s))

    new_s = []
    for word, nltk_word_pos in pos_tuples:
        wordnet_word_pos = pos_tag_dict.get(nltk_word_pos[:1].upper(), None)
        if wordnet_word_pos is not None:
            new_word = lemmatizer.lemmatize(word, wordnet_word_pos)
        else:
            new_word = lemmatizer.lemmatize(word)

        new_s.append(new_word)

    return " ".join(new_s)
    
def remove_stopwords(t):
    """Drop English stopwords from a space-separated text.

    The text is split on single spaces, every token found in NLTK's English
    stopword list is removed (case-sensitive match) and the remaining tokens
    are joined with single spaces.
    """
    # Build the stopword set once per call; previously the list was rebuilt
    # and scanned linearly for every single word.
    english_stopwords = set(stopwords.words('english'))

    filtered_words = [word for word in t.split(" ") if word not in english_stopwords]

    return " ".join(filtered_words)

def truncate_sentence(s, l=300):
    """Keep at most the first ``l`` space-separated tokens of ``s``."""
    tokens = s.split(" ")
    kept = tokens[:l]
    return " ".join(kept)

def baseline_prep(t):
    """Baseline preprocessing step; currently returns the text unchanged.

    The steps listed below are disabled here.
    """
    # Baseline prep moved to extraction step
    # tokens = " ".join(word_tokenize(t))    
    # tokens = remove_digits(tokens)
    # tokens = replace_symbols(tokens)
    # tokens = truncate_sentence(tokens)
    prepared = t
    return prepared

In [56]:
def pipe_lowercase(x):
    """Pipeline: baseline prep, then lowercase."""
    return make_lowercase(baseline_prep(x))

def pipe_camelcase(x):
    """Pipeline: baseline prep, then camel-case splitting."""
    return camelcase_splitter(baseline_prep(x))

def pipe_camelcase_lower(x):
    """Pipeline: baseline prep, camel-case splitting, then lowercase."""
    words_split = camelcase_splitter(baseline_prep(x))
    return make_lowercase(words_split)

def pipe_stopwords(x):
    """Pipeline: baseline prep, camel-case splitting, lowercase, then
    stopword removal."""
    text = x
    # Steps are applied in order; each receives the previous step's output.
    for step in (baseline_prep, camelcase_splitter, make_lowercase, remove_stopwords):
        text = step(text)
    return text

def pipe_lemma(x):
    """Pipeline: baseline prep, camel-case splitting, lowercase, stopword
    removal, then lemmatization."""
    text = x
    # Steps are applied in order; each receives the previous step's output.
    for step in (baseline_prep, camelcase_splitter, make_lowercase,
                 remove_stopwords, lemmatize_text):
        text = step(text)
    return text


In [57]:
s = "login request via Thrift PHP fails with Unexpected authentication problem in cassandra log Internal error processing login in Thrift"
" ".join([lemmatizer.lemmatize(t) for t in s.split(" ")])

'login request via Thrift PHP fails with Unexpected authentication problem in cassandra log Internal error processing login in Thrift'

In [58]:
' '.join([lemmatizer.lemmatize(i) for i in s.split(" ")])

'login request via Thrift PHP fails with Unexpected authentication problem in cassandra log Internal error processing login in Thrift'

### Test on subset

In [59]:
cass_294 = cass_df.loc[cass_df['issue_id']=="CASSANDRA-294"]

cass_sub = cass_df.loc[cass_df['issue_id'].isin(["CASSANDRA-294","CASSANDRA-966","CASSANDRA-935","CASSANDRA-5786"]),['issue_id','summary']].copy()
cass_sub.reset_index(inplace=True, drop=True)
cass_sub['Baseline'] = cass_sub.apply(lambda x: baseline_prep(x['summary']), axis=1)
cass_sub['Lowercase'] = cass_sub.apply(lambda x: pipe_lowercase(x['summary']), axis=1)
cass_sub['Camelcase'] = cass_sub.apply(lambda x: pipe_camelcase(x['summary']), axis=1)
cass_sub['Camel Lower'] = cass_sub.apply(lambda x: pipe_camelcase_lower(x['summary']), axis=1)
cass_sub['Stopwords'] = cass_sub.apply(lambda x: pipe_stopwords(x['summary']), axis=1)
cass_sub['Lemmatize'] = cass_sub.apply(lambda x: pipe_lemma(x['summary']), axis=1)

cass_sub.to_csv("./pipeline_example.csv", index=False)


In [60]:
cass_sub.rename(columns={'summary':'Original'}, inplace=True)

In [61]:
cass_sub[['Original','Baseline']]

Unnamed: 0,Original,Baseline
0,missing lib license google collect rc jar LICENSE,missing lib license google collect rc jar LICENSE
1,login request via Thrift PHP fails with Unexpe...,login request via Thrift PHP fails with Unexpe...
2,Thrift cas method crashes if input columns are...,Thrift cas method crashes if input columns are...


In [62]:
cass_long_keys = cass_sub.columns.to_list()*(len(cass_sub.index))

In [63]:
cass_t_melt = cass_sub.T.melt().copy()
cass_t_melt['key'] = cass_long_keys
cass_t_melt.drop(columns=['variable']).to_csv("pipeline_example_long.csv", index=False)

In [64]:
cass_t_melt.shape

(24, 3)

### Run Vectorizer Pipelines


#### NG

In [65]:
# Output directory for the NG vectors.
base_path_ng = csv_base + "NG/base"

# NG_DF = pd.read_csv(csv_base + "NG/ng_all.csv")
NG_XCOL = "description"
NG_YCOL = "target"

NG_META_BASE = {
    'tfidf': {
        'model': vectorize_tfidf,
        'args': [NG_XCOL, NG_YCOL]
        },
    'w2v': {
        'model': vectorize_w2v,
        'args': [NG_XCOL, NG_YCOL, cbow_model]
        },
    'd2v': {
        'model': vectorize_d2v,
        'args': [NG_XCOL, NG_YCOL, dbow_model]
        },
    'bert': {
        'model': vectorize_bert,
        'args': [NG_XCOL, NG_YCOL, bert_tokzr, bert_model]
        },
    'sbert': {
        'model': vectorize_sbert,
        'args': [NG_XCOL, NG_YCOL, sbert_model]
        }
}


In [66]:
def vectorize_ng(base_path, vectorizer_prefix, vectorizer_fn, *args, **kwargs):
    """Vectorize the ``NG_DF`` frame and write one CSV per split.

    Files are written to ``<base_path>/NG_<vectorizer_prefix>_<split>.csv``
    and hold the feature columns followed by an ``NG_YCOL`` column.

    NOTE: ``NG_DF`` must exist as a module-level DataFrame before this is
    called; the only assignment visible in this notebook is commented out, so
    calling this function without defining it raises ``NameError``.

    Args:
        base_path: Output directory; created if it does not exist.
        vectorizer_prefix: Vectorizer tag used in the output file names.
        vectorizer_fn: Vectorizer called as ``vectorizer_fn(df, *args, **kwargs)``
            and expected to return ``(X_train, X_test, y_train, y_test)``.
        *args: Positional arguments forwarded to ``vectorizer_fn``.
        **kwargs: Keyword arguments forwarded to ``vectorizer_fn``.
    """
    # A fresh output directory would otherwise make to_csv fail.
    os.makedirs(base_path, exist_ok=True)

    df_ = NG_DF
    X_tr, X_te, y_tr, y_te = vectorizer_fn(df_, *args, **kwargs)

    split_mapper = {
        'train': {'X': X_tr, 'y': y_tr},
        'test': {'X': X_te, 'y': y_te},
        # 'all': None,
    }

    for m, part in split_mapper.items():
        if m == 'all':
            # Only reached when the 'all' entry above is enabled:
            # re-run the vectorizer on the whole frame without splitting.
            splitkwargs = kwargs.copy()
            splitkwargs['split'] = False

            X_all, y_all = vectorizer_fn(df_, *args, **splitkwargs)

            to_concat = pd.concat([pd.DataFrame(X_all),
                                   pd.DataFrame(y_all, columns=[NG_YCOL]).astype(int)],
                                  axis=1)
        else:
            to_concat = pd.concat([pd.DataFrame(part['X']),
                                   pd.DataFrame(part['y'],
                                                columns=[NG_YCOL]).astype(int)],
                                  axis=1)

        # base/cassandra_w2v_train.csv
        wfname = os.path.join(base_path, "{}_{}_{}.csv".format("NG", vectorizer_prefix, m))
        to_concat.to_csv(wfname, index=False)



def run_ng_vec_pipeline(vec_path=base_path_ng, pipe_fn=None):
    """Run every vectorizer in ``NG_META_BASE`` through ``vectorize_ng``.

    Args:
        vec_path: Output directory handed to ``vectorize_ng``.
        pipe_fn: Optional preprocessing function; when given it is forwarded
            to each vectorizer as ``pp_fn``.
    """
    extra_kwargs = {'pp_fn': pipe_fn} if pipe_fn is not None else {}

    # Build a fresh per-run entry for every vectorizer. A shallow copy of
    # NG_META_BASE would share the inner dicts, so writing 'kwargs' into them
    # would leak into the module-level configuration between runs.
    meta_ = {
        name: dict(spec, kwargs=dict(extra_kwargs))
        for name, spec in NG_META_BASE.items()
    }

    tqdm_model = tqdm(meta_.keys(), position=0)

    for m in tqdm_model:
        tqdm_model.set_description("Processing model: %s" % m)
        vectorize_ng(vec_path, m, meta_[m]['model'], *meta_[m]['args'], **meta_[m]['kwargs'])


In [67]:
base_path_ng

'./csv/NG/base'

#### Baseline

In [68]:
base_path = csv_base + "vectors/base"

MODEL_META_BASE = {
    'tfidf': {
        'model': vectorize_tfidf,
        'args': [VECTORIZE_COL, LABEL_COL]
        },
    'w2v': {
        'model': vectorize_w2v,
        'args': [VECTORIZE_COL, LABEL_COL, cbow_model]
        },
    'd2v': {
        'model': vectorize_d2v,
        'args': [VECTORIZE_COL, LABEL_COL, dbow_model]
        },
    'bert': {
        'model': vectorize_bert,
        'args': [VECTORIZE_COL, LABEL_COL, bert_tokzr, bert_model]
        },
    'sbert': {
        'model': vectorize_sbert,
        'args': [VECTORIZE_COL, LABEL_COL, sbert_model]
        }
}

In [69]:
base_path

'./csv/vectors/base'

In [70]:
vectorizers = ['tfidf', 'w2v', 'd2v', 'bert', 'sbert']
vectorizers

['tfidf', 'w2v', 'd2v', 'bert', 'sbert']

In [71]:
def run_vectorization_pipeline(vec_path=base_path, pipe_fn=None):
    """Run every vectorizer in ``MODEL_META_BASE`` through ``vectorize_db``.

    Args:
        vec_path: Output directory handed to ``vectorize_db``.
        pipe_fn: Optional preprocessing function; when given it is forwarded
            to each vectorizer as ``pp_fn``.
    """
    extra_kwargs = {'pp_fn': pipe_fn} if pipe_fn is not None else {}

    # Build a fresh per-run entry for every vectorizer. A shallow copy of
    # MODEL_META_BASE would share the inner dicts, so writing 'kwargs' into
    # them would leak into the module-level configuration between runs.
    # Every entry gets its kwargs here, so the loop below cannot hit a
    # missing 'kwargs' key.
    meta_ = {
        name: dict(spec, kwargs=dict(extra_kwargs))
        for name, spec in MODEL_META_BASE.items()
    }

    tqdm_model = tqdm(meta_.keys(), position=0)

    for m in tqdm_model:
        tqdm_model.set_description("Processing model: %s" % m)
        vectorize_db(vec_path, m, meta_[m]['model'], *meta_[m]['args'], **meta_[m]['kwargs'])


# Run Before

In [72]:
play_done()


20:43:47


In [73]:
%%time

## BASELINE

# No vec_path is given, so the pipeline uses its default (base_path as bound when
# run_vectorization_pipeline was defined); baseline_prep is forwarded as pp_fn.
run_vectorization_pipeline(pipe_fn=baseline_prep)

#play_done()


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

CPU times: user 25min 22s, sys: 3min 14s, total: 28min 36s
Wall time: 6min 44s


In [74]:
# CPU times: user 29min 26s, sys: 4min 4s, total: 33min 31s
# Wall time: 7min 52s

# CPU times: user 22min 53s, sys: 3min 16s, total: 26min 10s
# Wall time: 6min 41s

## C2D
# CPU times: user 52min 30s, sys: 1min 32s, total: 54min 2s
# Wall time: 41min 58s

## N2D
# CPU times: user 32min 37s, sys: 4min 27s, total: 37min 4s
# Wall time: 22min 53s

## Mac
# CPU times: user 17min 30s, sys: 3min 21s, total: 20min 51s
# Wall time: 7min 53s

In [75]:

# run_ng_vec_pipeline(vec_path=csv_base + "vectors/base", pipe_fn=no_pipe_fn)

play_done()


20:50:31


#### Symbols

In [76]:
# %%time

# removesymbols_path = csv_base + "vectors/removesymbols"

# def pipe_remove_symbols(x):
#     x = baseline_prep(x)
#     x = remove_digits(x)
#     return remove_symbols(x)

# run_vectorization_pipeline(removesymbols_path, pipe_remove_symbols)

# # CPU times: user 17min 15s, sys: 3min 39s, total: 20min 54s
# # Wall time: 7min 1s
# play_done()


In [77]:

# run_ng_vec_pipeline(vec_path=removesymbols_path, pipe_fn=pipe_remove_symbols)
# play_done()


In [78]:
# %%time

# replacesymbols_path = csv_base + "vectors/replacesymbols"

# def pipe_replace_symbols(x):
#     x = baseline_prep(x)
#     x = remove_digits(x)
#     return replace_symbols(x)

# run_vectorization_pipeline(replacesymbols_path, pipe_replace_symbols)

# play_done()


In [79]:

# CPU times: user 16min 20s, sys: 3min 21s, total: 19min 42s
# Wall time: 6min 40s

In [80]:

# run_ng_vec_pipeline(replacesymbols_path, pipe_replace_symbols)
# play_done()


#### Case

In [81]:
%%time

lowercase_path = csv_base + "vectors/lowercase"


run_vectorization_pipeline(lowercase_path, pipe_lowercase)

play_done()

# N2D
# CPU times: user 32min 7s, sys: 4min 17s, total: 36min 25s
# Wall time: 22min 39s

# Mac
# CPU times: user 16min 40s, sys: 3min 24s, total: 20min 5s
# Wall time: 6min 40s

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

20:57:19
CPU times: user 25min 4s, sys: 3min 12s, total: 28min 16s
Wall time: 6min 47s


In [82]:
# CPU times: user 30min 12s, sys: 3min 57s, total: 34min 10s
# Wall time: 8min 6s

## C2D
# CPU times: user 52min 35s, sys: 1min 10s, total: 53min 45s
# Wall time: 30min 28s

In [83]:
# run_ng_vec_pipeline(lowercase_path, pipe_lowercase)

play_done()


20:57:19


In [84]:
%%time

camelcase_path = csv_base + "vectors/camel"



run_vectorization_pipeline(camelcase_path, pipe_camelcase)
play_done()

# CPU times: user 34min 45s, sys: 5min 11s, total: 39min 57s
# Wall time: 23min 27s

# CPU times: user 26min 21s, sys: 6min 47s, total: 33min 8s
# Wall time: 24min 47s

## Mac
# CPU times: user 18min 9s, sys: 3min 51s, total: 22min 1s
# Wall time: 7min 30s

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

21:04:16
CPU times: user 27min 16s, sys: 3min 58s, total: 31min 15s
Wall time: 6min 56s


In [85]:
# CPU times: user 40min 56s, sys: 5min 21s, total: 46min 18s
# Wall time: 10min 31s

In [86]:
# run_ng_vec_pipeline(camelcase_path, pipe_camelcase)
play_done()


21:04:16


#### Camel + Lower

In [87]:
%%time

# NOTE(review): this rebinds `camelcase_path`, which the "Case" section set to
# csv_base + "vectors/camel". After this cell the name points at "vectors/camellower",
# so re-running an earlier cell that reads `camelcase_path` would use this directory.
# Consider a distinct name (e.g. camellower_path) once downstream uses are confirmed.
camelcase_path = csv_base + "vectors/camellower"


# Run all vectorizers with pipe_camelcase_lower forwarded as the preprocessing function.
run_vectorization_pipeline(camelcase_path, pipe_camelcase_lower)
play_done()

# Timings recorded from earlier runs of this cell:
# CPU times: user 33min 19s, sys: 5min, total: 38min 20s
# Wall time: 22min 47s
# CPU times: user 18min 7s, sys: 3min 49s, total: 21min 57s
# Wall time: 7min 27s

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

21:11:07
CPU times: user 26min 53s, sys: 4min 1s, total: 30min 55s
Wall time: 6min 50s


In [88]:
# CPU times: user 38min 26s, sys: 4min 56s, total: 43min 23s
# Wall time: 9min 59s

In [89]:
# run_ng_vec_pipeline(camelcase_path, pipe_camelcase_lower)
play_done()


21:11:07


#### Stopwords

In [90]:
%%time

stopwords_path = csv_base + "vectors/stopwords"


run_vectorization_pipeline(stopwords_path, pipe_stopwords)
play_done()

# CPU times: user 33min 6s, sys: 3min 25s, total: 36min 31s
# Wall time: 23min 55s
# CPU times: user 15min 31s, sys: 3min 9s, total: 18min 41s
# Wall time: 8min 5s

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

21:18:50
CPU times: user 21min 54s, sys: 2min 53s, total: 24min 47s
Wall time: 7min 43s


In [91]:
# CPU times: user 32min 50s, sys: 3min 50s, total: 36min 41s
# Wall time: 11min 1s

In [92]:
# run_ng_vec_pipeline(stopwords_path, pipe_stopwords)
play_done()


21:18:50


#### Lemmatization

In [93]:
timenow()


'21:18:50'

In [94]:
%%time

print(timenow())
lemma_path = csv_base + "vectors/lemma"


run_vectorization_pipeline(lemma_path, pipe_lemma)
play_done()


print(timenow())

# 22:33:54
# 22:42:05

# CPU times: user 34min 42s, sys: 3min 27s, total: 38min 10s
# Wall time: 27min 4s
# CPU times: user 15min 41s, sys: 3min 10s, total: 18min 51s
# Wall time: 8min 10s

21:18:50


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

21:29:05
21:29:05
CPU times: user 26min 32s, sys: 2min 57s, total: 29min 29s
Wall time: 10min 14s


In [95]:
# CPU times: user 34min 13s, sys: 3min 33s, total: 37min 47s
# Wall time: 13min 6s

In [96]:
# run_ng_vec_pipeline(lemma_path, pipe_lemma)
play_done()


21:29:05


In [97]:
# s_final = './../../../../SOUNDS/WAV_Enharpment.wav'
# play_done(s = s_final)
