In [None]:
class InputExample(object):
    """One raw (untokenized) example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Store the raw fields of a single example.

        Args:
            guid: Unique id for the example.
            text_a: string. Untokenized text of the first sequence. For
                single-sequence tasks this is the only text required.
            text_b: (Optional) string. Untokenized text of the second
                sequence; only needed for sequence-pair tasks.
            labels: (Optional) [string]. Labels of the example. Expected for
                train and dev examples, not for test examples.
        """
        # Identification and first sequence.
        self.guid, self.text_a = guid, text_a
        # Optional second sequence and optional labels.
        self.text_b, self.labels = text_b, labels

In [None]:
class InputFeatures(object):
    """Plain holder for the model-ready features of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        """Keep the four feature values exactly as they were passed in.

        Args:
            input_ids: stored as ``self.input_ids``.
            input_mask: stored as ``self.input_mask``.
            segment_ids: stored as ``self.segment_ids``.
            label_ids: stored as ``self.label_ids``.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_ids = segment_ids, label_ids

In [None]:

class BertForMultiLabelSequenceClassification(PreTrainedBertModel):
    """BERT encoder followed by dropout and a linear multi-label head.

    The linear layer is applied to the pooled output returned by the BERT
    model and produces ``num_labels`` logits per example.
    """

    def __init__(self, config, num_labels=2):
        """Build the encoder, the dropout layer and the classification head.

        Args:
            config: model configuration; ``hidden_dropout_prob`` and
                ``hidden_size`` are read from it.
            num_labels: number of output logits per example.
        """
        super().__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return the BCE-with-logits loss when ``labels`` is given, else the logits."""
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        logits = self.classifier(self.dropout(pooled_output))

        # Inference path: no targets supplied, hand back the raw logits.
        if labels is None:
            return logits

        # Training path: both tensors are viewed as (-1, num_labels) before
        # the loss is computed.
        criterion = BCEWithLogitsLoss()
        return criterion(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))

    def _set_encoder_requires_grad(self, flag):
        """Set ``requires_grad`` on every parameter of ``self.bert``."""
        for param in self.bert.parameters():
            param.requires_grad = flag

    def freeze_bert_encoder(self):
        """Stop gradient computation for all parameters of the BERT encoder."""
        self._set_encoder_requires_grad(False)

    def unfreeze_bert_encoder(self):
        """Re-enable gradient computation for all parameters of the BERT encoder."""
        self._set_encoder_requires_grad(True)

In [None]:
BertForMultiLabelSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
#       12 BertLayers
        (11): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1)
  (classifier): Linear(in_features=768, out_features=6, bias=True)
)

In [None]:
def accuracy_thresh(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True):
    """Sum over rows of the per-row fraction of correctly thresholded predictions.

    `y_pred` and `y_true` must be the same size. When `sigmoid` is true the
    predictions are passed through a sigmoid before being compared with
    `thresh`. Note that the per-row accuracies are summed, not averaged.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    # Element-wise agreement between thresholded predictions and targets.
    agreement = (scores > thresh) == y_true.byte()
    per_row_accuracy = np.mean(agreement.float().cpu().numpy(), axis=1)
    return per_row_accuracy.sum()


In [None]:
from sklearn.metrics import roc_curve, auc

# Per-class ROC curves and areas, keyed by the class index.
fpr, tpr, roc_auc = {}, {}, {}

for i in range(num_labels):
    # Column i of the label/logit matrices belongs to class i.
    class_labels = all_labels[:, i]
    class_scores = all_logits[:, i]
    fpr[i], tpr[i], _ = roc_curve(class_labels, class_scores)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: flatten every (example, class) pair into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(all_labels.ravel(), all_logits.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [14]:
import pandas as pd
import numpy as np
import csv 
import string
import nltk 
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

import time
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings
from os import path
# project: name of the analysed project and root folder of the study data
proj_name       = "rxjava_2"
working_dir     = "/Users/fd252/OneDrive/Research4/MSRExt/"


# input: <working_dir>OSSPRMapper/outputs/<proj_name>/<proj_name>_binaryBodyTitle.csv
binaryBodyTitle = "".join([
    working_dir,
    "OSSPRMapper/outputs/",
    proj_name,
    '/',
    proj_name,
    '_',
    "binaryBodyTitle.csv",
])
print("Input Binary File: ", binaryBodyTitle, sep="")


Input Binary File: /Users/fd252/OneDrive/Research4/MSRExt/OSSPRMapper/outputs/rxjava_2/rxjava_2_binaryBodyTitle.csv


In [15]:
#organizing data_frame to issue order

# Label (class) columns the binary file may carry. Both spellings of the
# two-word labels ('Data Structure'/'DataStructure', 'Big Data'/'BigData') are
# listed so that either header layout is recognised.
_KNOWN_LABEL_COLUMNS = frozenset([
    'Util', 'NLP', 'APM', 'Network', 'DB', 'Interpreter', 'Error Handling',
    'Logging', 'Lang', 'Data Structure', 'DataStructure', 'DevOps', 'i18n',
    'Setup', 'Logic', 'Microservices', 'ML', 'Test', 'Search', 'IO', 'UI',
    'Parser', 'Security', 'Cloud', 'Big Data', 'BigData', 'Event Handling',
    'App', 'GIS',
])


def _read_binary_file(path):
    """Read the ';'-separated binary file, skipping malformed lines if needed."""
    try:
        return pd.read_csv(path, header=0, sep=';')
    except pd.errors.ParserError:
        # Only a tokenizing problem triggers the tolerant re-read; any other
        # error (missing file, bad encoding, ...) propagates unchanged.
        print ('number of columns varying. Skipping bad lines!')

    try:
        # pandas >= 1.3: skip malformed lines but still report each of them.
        return pd.read_csv(path, header=0, sep=';', on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy flag.
        return pd.read_csv(path, header=0, sep=';', error_bad_lines=False)


def _coerce_fixed_rows(fixed_rows, column_names):
    """Name the re-parsed rows, drop invalid ones and make label columns numeric."""
    fixed_rows = fixed_rows.copy()
    fixed_rows.columns = column_names
    label_cols = [col for col in column_names if col in _KNOWN_LABEL_COLUMNS]

    # drop data with error after parsing: every label must be exactly '0' or '1'
    is_valid = fixed_rows[label_cols].isin(['0', '1']).all(axis=1)
    fixed_rows = fixed_rows.loc[is_valid].copy()
    print('data fixed after dropping parse fix errors',fixed_rows.shape)

    # back to float
    # NOTE: only the label columns are converted; every other column of the
    # re-parsed rows keeps the string values produced by the split.
    for col in label_cols:
        fixed_rows[col] = pd.to_numeric(fixed_rows[col]).astype('Float64')
    return fixed_rows


def organize():
    """Load the binary CSV named by ``binaryBodyTitle`` and repair parse errors.

    The file is read with ';' as separator; lines with a varying number of
    fields are skipped. Several columns are renamed ('pr' -> 'prNumber',
    'Title' -> 'prTitle', 'Body' -> 'prBody', 'issue' -> 'issueNumber',
    'issueComments' -> 'prComments', 'issueCommentsLink' -> 'issue_Comments',
    'Comments' -> 'prCodeReviewComments') and 'issueNumber' is cast to 'Int64'.

    Rows whose 'Util' value is missing are treated as parse errors: their
    first column is split on ';' again. If that yields as many columns as the
    frame has, the rows with valid '0'/'1' labels are appended back;
    otherwise the broken rows are simply dropped.

    Returns:
        pandas.DataFrame: the organized data. Progress is printed to stdout.
    """
    data_classes = _read_binary_file(binaryBodyTitle)

    # OBS!!!!!
    #line 62 mockito was deleted wrong number of columns!
    #line 1923 rxjava was deleted wrong number of columns!

    # these renames are listed in the order that the original name appears in the list of headers
    # coming in from the binary file
    data_classes.rename(columns={ 'pr': 'prNumber',
                                  'Title': 'prTitle',
                                  'Body': 'prBody',
                                  'issue': 'issueNumber',
                                  'issueComments': 'prComments',
                                  'issueCommentsLink': 'issue_Comments',
                                  'Comments': 'prCodeReviewComments',
                                }, inplace=True)

    categories = data_classes.columns.values.tolist()
    print('categories:',categories)

    data_classes['issueNumber'] = data_classes['issueNumber'].astype('Int64')
    print('before filtering out empty classes',data_classes.shape)

    #find rows with parse error
    missing_util = data_classes['Util'].isnull()
    data_classes_error = data_classes.loc[missing_util]
    print('rows filtered out empty classes (parse error)',data_classes_error.shape)

    if len(data_classes_error) == 0:
        print('no parse errors found')
        return data_classes

    # Re-split the first field of each broken row. `astype(str)` keeps this
    # working even when the first column holds numbers rather than text.
    data_classes_fixed = data_classes_error.iloc[:, 0].astype(str).str.split(';', expand=True)
    print('rows fixed after new parse - empty classes (parse error)',data_classes_fixed.shape)

    #removing rows with problems
    data_classes = data_classes.loc[~missing_util].copy()
    print('after filtering out empty classes',data_classes.shape)

    col_data_classes = len(data_classes.columns)
    col_data_classes_fixed = len(data_classes_fixed.columns)
    print('len columns data_classes:',col_data_classes)
    print('len columns data_classes_fixed:',col_data_classes_fixed)

    if col_data_classes != col_data_classes_fixed:
        print('fixing parse errors failed')
        return data_classes

    # The re-parsed rows take the frame's own header, so the repair works for
    # whatever set of columns the binary file actually has.
    data_classes_fixed = _coerce_fixed_rows(data_classes_fixed, data_classes.columns)

    # appending fixed rows (original row indexes are preserved)
    data_classes_new = pd.concat([data_classes, data_classes_fixed])
    print('after appending fixed rows',data_classes_new.shape)

    return data_classes_new

In [16]:
#Filtering issues with PRs
def filtering(data_classes):
    """Keep the rows whose 'isTrain' value equals 0.

    Prints the frame shape before and after the selection.

    Returns:
        tuple: (list of column names of the selection, the selected rows).
    """
    print('before filtering out isTrain == 0',data_classes.shape)

    # Rows with isTrain == 0 are the ones that are retained.
    keep_row = data_classes["isTrain"] == 0
    IssuePRDataset = data_classes[keep_row]

    print('after filtering out isTrain == 0',IssuePRDataset.shape)

    return IssuePRDataset.columns.tolist(), IssuePRDataset

In [17]:
# RQ1.a - how sensitive is the result to the choice of algorithm?
# several algorithms - BinaryRelevance
# all words, bootstrap, unigram
# title only
def dataset_config(IssuePRDataset):
    """Build the modelling frame: id columns, label columns and a 'corpus'.

    'corpus' is issueTitle + " " + issueBody + " " + prComments. A missing
    text value contributes a single space. The text columns themselves are
    removed from the result and the index is reset.

    Args:
        IssuePRDataset: DataFrame holding the id, text and label columns
            selected below.

    Returns:
        pandas.DataFrame with columns 'issueNumber', 'prNumber', the label
        columns and 'corpus'. The input frame is not modified.
    """
    id_cols = ['issueNumber', 'prNumber']
    text_cols = ['issueTitle', 'issueBody', 'prTitle', 'prBody',
                 'issueTitleLink', 'issueBodyLink', 'commitMessage', 'prComments']
    label_cols = ['Util', 'NLP', 'APM', 'Network', 'DB', 'Interpreter',
                  'Error Handling', 'Logging', 'Lang', 'Data Structure',
                  'DevOps', 'i18n', 'Setup', 'Logic', 'Microservices', 'ML',
                  'Test', 'Search', 'IO', 'UI', 'Parser', 'Security', 'Cloud',
                  'Big Data', 'Event Handling', 'App', 'GIS']

    data_test1 = IssuePRDataset[id_cols + text_cols + label_cols].copy()

    def as_text(value):
        # A missing value becomes one blank. Doing this per value (instead of
        # stringifying NaN to "nan" and replacing "nan" afterwards) keeps
        # words that merely contain "nan" intact, e.g. "maintenance".
        return ' ' if pd.isna(value) else str(value)

    # Term counts observed with this corpus definition:
    # rxjava 2489 terms, mockito 598, presto 4, guava 1140, jabref 740
    # (adding prTitle and prBody gave 3002 terms for rxjava)
    data_test1["corpus"] = (
        data_test1["issueTitle"].map(as_text)
        + " " + data_test1["issueBody"].map(as_text)
        + " " + data_test1["prComments"].map(as_text)
    )

    # Only the combined corpus is kept; the individual text columns go away.
    data_test1 = data_test1.drop(columns=text_cols)

    print('before filtering out empty corpus',data_test1.shape)
    data_test1.dropna(subset = ["corpus"], inplace=True)
    print('after filtering out empty corpus',data_test1.shape)

    # 'Util' is deliberately kept as a label even though it is used in
    # almost every PR.
    return data_test1.reset_index(drop=True)

In [18]:
# Load the binary CSV configured in `binaryBodyTitle` into a DataFrame.
data_classes = organize()

number of columns varying. Skipping bad lines!
categories: ['prNumber', 'Util', 'NLP', 'APM', 'Network', 'DB', 'Interpreter', 'Error Handling', 'Logging', 'Lang', 'Data Structure', 'DevOps', 'i18n', 'Setup', 'Logic', 'Microservices', 'ML', 'Test', 'Search', 'IO', 'UI', 'Parser', 'Security', 'Cloud', 'Big Data', 'Event Handling', 'App', 'GIS', 'prTitle', 'prBody', 'prIssue', 'issueNumber', 'issueTitle', 'issueBody', 'prComments', 'issueTitleLink', 'issueBodyLink', 'issue_Comments', 'isPR', 'isTrain', 'commitMessage', 'prCodeReviewComments']
before filtering out empty classes (1977, 42)
rows filtered out empty classes (parse error) (0, 42)
no parse errors found


b'Skipping line 1923: expected 42 fields, saw 45\n'


In [19]:
# Keep the rows with isTrain == 0 and collect the column names of the result.
categories, IssuePRDataset = filtering(data_classes)

before filtering out isTrain == 0 (1977, 42)
after filtering out isTrain == 0 (625, 42)


In [20]:
#preprocessing text

#We first convert the comments to lower-case 
#then use custom made functions to remove html-tags, punctuation and non-alphabetic characters from the TitleBody.

def clean_data(data_test1):
    """Normalise the 'corpus' column in place and return the same frame.

    Steps, in order: lower-case, replace HTML-like tags with a space, remove
    or blank punctuation, then replace every run of characters other than
    letters and spaces with a space. Warnings are silenced unless warning
    options were given on the command line.
    """
    if not sys.warnoptions:
        warnings.simplefilter("ignore")

    tag_pattern = re.compile('<.*?>')

    def strip_html(sentence):
        # Anything between '<' and the next '>' becomes a single space.
        return re.sub(tag_pattern, ' ', str(sentence))

    def strip_punctuation(sentence):
        # First group of characters is deleted, second group becomes a space.
        without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
        spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
        return spaced.strip().replace("\n", " ")

    def letters_only(sentence):
        # Each whitespace-separated word is cleaned separately, then re-joined.
        cleaned_words = (re.sub('[^a-z A-Z]+', ' ', word) for word in sentence.split())
        return " ".join(cleaned_words).strip()

    # (a step removing words shorter than 3 characters was planned here)

    corpus = data_test1['corpus'].str.lower()
    for step in (strip_html, strip_punctuation, letters_only):
        corpus = corpus.apply(step)
    data_test1['corpus'] = corpus

    return data_test1

In [21]:
#### removing stopwords

def remove_stop_words():
    """Compile the case-insensitive stop-word pattern.

    The pattern matches, at a word boundary, any word from NLTK's English
    stop-word list or from the extra list below, followed by one non-word
    character. The extra entries are used verbatim as regex alternatives.
    """
    extra_stop_words = [
        'nan','pr','zero','one','two','three','four','five','six','seven','eight','nine','ten',
        'may','also','across','among','beside','however','yet','within',
        'jabref','org','github','com','md','https','ad','changelog','','joelparkerhenderson',
        'localizationupd',' localizationupd','localizationupd ',
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
        'you', 'your', 'yours', 'yourself', 'yourselves',
        'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
        'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
        'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
        'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
        'a', 'an', 'the','Mr', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
        'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
        'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
        'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
        'should', 'now',
    ]

    stop_words = set(stopwords.words('english'))
    stop_words.update(extra_stop_words)

    alternatives = "|".join(stop_words)
    return re.compile("\\b({})\\W".format(alternatives), re.I)

In [22]:
def removeStopWords(sentence, re_stop_words):
    """Replace every match of the compiled `re_stop_words` in `sentence` with a space."""
    replacement = " "
    stripped = re_stop_words.sub(replacement, sentence)
    return stripped

#removing words with less than 3 characters
#data_classes['titleBody'] = data_classes['titleBody'].str.findall('\w{3,}').str.join(' ')

In [23]:
def apply_stem(data_test1):
    """Stem every word of the 'corpus' column in place and return the same frame."""
    english_stemmer = SnowballStemmer("english")

    def stem_sentence(sentence):
        # Words are split on whitespace, stemmed one by one and re-joined
        # with single spaces.
        stems = (english_stemmer.stem(word) for word in sentence.split())
        return " ".join(stems).strip()

    data_test1['corpus'] = data_test1['corpus'].apply(stem_sentence)

    return data_test1

In [24]:
# Build the modelling frame, then normalise the text of its 'corpus' column.
data_test1 = dataset_config(IssuePRDataset)
data_test1 = clean_data(data_test1)
#print('################# data_test1 after fixing')
#print(data_test1)

#   print('1',data_test1['corpus'])

# Compile the stop-word pattern once and apply it to every corpus entry.
re_stop_words = remove_stop_words()
data_test1['corpus'] = data_test1['corpus'].apply(removeStopWords, re_stop_words=re_stop_words)
# `data` is a second name for the same DataFrame object (no copy is made);
# it is used again further down when the label names are exported.
data = data_test1
#   print('2',data_test1['corpus'])
# Stem the corpus; apply_stem modifies and returns the frame it receives.
data_test1 = apply_stem(data)


before filtering out empty corpus (625, 30)
after filtering out empty corpus (625, 30)


In [25]:
# Write the preprocessed frame to ./data/<proj_name>binaryNew.csv. The project
# name is prepended directly to 'binaryNew.csv', with no separator in between.
data_test1.to_csv('./data/'+proj_name+'binaryNew.csv', encoding='utf-8', header=True, index=False, sep=',')

In [26]:
# The label names are all columns except the first two and the last one.
cols = data.columns.tolist()
labels = cols[2:-1]
# One label per line, no header, written to ./data/<proj_name>labelsNew.csv.
labelsdf = pd.DataFrame(labels)
labelsdf.to_csv(
    './data/'+proj_name+'labelsNew.csv',
    encoding='utf-8',
    header=False,
    index=False,
    sep=',',
)
labels

['Util',
 'NLP',
 'APM',
 'Network',
 'DB',
 'Interpreter',
 'Error Handling',
 'Logging',
 'Lang',
 'Data Structure',
 'DevOps',
 'i18n',
 'Setup',
 'Logic',
 'Microservices',
 'ML',
 'Test',
 'Search',
 'IO',
 'UI',
 'Parser',
 'Security',
 'Cloud',
 'Big Data',
 'Event Handling',
 'App',
 'GIS']

In [None]:
#labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
DATA_PATH = './data/'
LABEL_PATH = './data/'
from fast_bert.data_cls import BertDataBunch

# The preprocessing cells write './data/<proj_name>binaryNew.csv' and
# './data/<proj_name>labelsNew.csv', so the project name has to be part of
# the file names requested here; otherwise the files are not found.
TRAIN_FILE = proj_name + 'binaryNew.csv'
LABEL_FILE = proj_name + 'labelsNew.csv'

# NOTE(review): the validation file is the same file as the training file, so
# validation metrics are computed on the training data.
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file=TRAIN_FILE,
                          val_file=TRAIN_FILE,
                          label_file=LABEL_FILE,
                          text_col='corpus',      # preprocessed text column
                          label_col=labels,       # list of label column names
                          batch_size_per_gpu=16,
                          max_seq_length=512,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='bert')

In [None]:
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy
import logging

# Root logger is handed to the learner for its progress messages.
logger = logging.getLogger()
#device_cuda = torch.device("cuda")
metrics = [dict(name='accuracy', function=accuracy)]

OUTPUT_DIR = './output/'

# Multi-label learner on top of the pretrained 'bert-base-uncased' weights.
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    #device=device_cuda,
    device=None,
    logger=logger,
    output_dir=OUTPUT_DIR,
    finetuned_wgts_path=None,
    warmup_steps=500,
    multi_gpu=False,
    is_fp16=True,
    multi_label=True,
    logging_steps=50,
)

In [None]:
# Call the learner's lr_find, starting at 1e-5 with optimizer type 'lamb'.
learner.lr_find(start_lr=1e-5,optimizer_type='lamb')

In [None]:
# Train for 6 epochs at lr 6e-5 with schedule 'warmup_cosine' and optimizer 'lamb'.
learner.fit(
    epochs=6,
    lr=6e-5,
    validate=True,  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb",
)