In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
#Installations
!pip install Keras-Preprocessing
!pip install pytorch-pretrained-bert
!pip install transformers

---

In [None]:
%%time
#Library Importation
import random, re, pickle, random, collections
from tabulate import tabulate
from tqdm import trange

#Data Encoding
import nltk, gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import word2vec, KeyedVectors
from gensim.models.word2vec import Word2Vec
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

#Sentiment Analysis
#from tensorflow.keras.models import load_model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, SimpleRNN, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.python.client import device_lib
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification, BertModel
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler, SequentialSampler

In [None]:
%%time
# Get the GPU device name.

device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
%%time
# If there's a GPU available tell PyTorch to use the GPU

if torch.cuda.is_available():  
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
#Function Declaration

#Cleaning Function
def clean_code(code):
    """
    Function to clean a code/comment string. The following steps are applied in order:
    - Removing handles ('@' followed by non-whitespace characters)
    - Replacing newlines with a single space
    - Removing carriage returns
    - Removing links ('http' followed by non-whitespace characters)
    - Changing '&' sign to and
    - Stripping leading and trailing whitespace

    Hashtags and punctuation are left untouched: the corresponding substitutions
    are commented out below.

    Parameters:
        code (string): The text to clean

    Returns:
        string: The cleaned text
    """

    # Raw strings keep the regex escapes (\s, \S) away from Python's own string escapes.
    code = re.sub(r"@[^\s]+", "", code) # removes handles
    code = re.sub(r"\n", " ", code) # remove newlines
    code = re.sub(r"\r", "", code) # remove carriage returns
    code = re.sub(r"http\S+", "", code) # removes links
    #code = re.sub(r"#(\w+)", "", code) # remove hashtags
    # NOTE(review): every '&' is replaced, so a logical '&&' in source code becomes 'andand'.
    code = re.sub(r"&", "and", code) # changes & sign to and
    #code = re.sub(r"[^\w\s\@]","",code) # removes punctuation
    code = code.strip()

    return code


#Vector-Encoding Function

#Gensim Word2Vec-Google-News-300
def sent_vect(series, max_length=60):
    """Tokenize each text and encode every in-vocabulary token with its vector from the
    global gensim KeyedVectors object `wv`.

    Each text is truncated to its first `max_length` tokens; tokens that are missing from
    the vocabulary are skipped, and the remaining rows are zero-padded at the end so that
    every sample is a (max_length, wv.vector_size) matrix.

    Parameters:
        series (pandas.Series): The texts to encode
        max_length (int): Number of token rows per sample (default 60)

    Returns:
        list: One nested list (max_length x vector size) of floats per text. This nested
        list/array will later be converted into a tensor, and fed directly into an RNN
    """

    # key_to_index is a dict, so membership is O(1); index_to_key is a plain list and
    # scanning it once per token is very slow on a large vocabulary.
    vocabulary = wv.key_to_index
    vector_size = wv.vector_size

    array = []
    for text in series.values:
        word_token = word_tokenize(text)[:max_length]
        known_vectors = [wv[word] for word in word_token if word in vocabulary]

        # Start from an all-zero (padding) matrix and overwrite the leading rows, instead of
        # growing the sample one np.vstack call per missing row.
        sample_vector = np.zeros((max_length, vector_size))
        if known_vectors:
            sample_vector[:len(known_vectors)] = np.asarray(known_vectors)
        array.append(sample_vector.tolist())
    return array

#Function for saving the dataset in encoded format to avoid re-encoding every time a new session is started.
def modified_sent_vect(series, max_length=113):
    """Tokenize each text, encode every in-vocabulary token with its vector from the global
    gensim KeyedVectors object `wv`, and return the result as a DataFrame of strings.

    Each text is truncated to its first `max_length` tokens; tokens that are missing from
    the vocabulary are skipped, and the sample is zero-padded at the end to `max_length`
    rows. The returned DataFrame has one row per text and one column per token position
    (named '0' ... str(max_length - 1)); every cell holds the vector of that position as
    space-separated numbers. This function does not write any file itself.

    Parameters:
        series (pandas.Series): The texts to encode
        max_length (int): Number of token positions (columns) per text (default 113)

    Returns:
        pandas.DataFrame: The encoded texts, with a default integer index (the index of
        `series` is not carried over)
    """

    # key_to_index is a dict, so membership is O(1); index_to_key is a plain list and
    # scanning it once per token is very slow on a large vocabulary.
    vocabulary = wv.key_to_index
    vector_size = wv.vector_size

    dictionary = {str(position): [] for position in range(max_length)}
    for text in series.values:
        word_token = word_tokenize(text)[:max_length]
        known_vectors = [wv[word] for word in word_token if word in vocabulary]

        # Start from an all-zero (padding) matrix and overwrite the leading rows, instead of
        # growing the sample one np.vstack call per missing row.
        sample_vector = np.zeros((max_length, vector_size))
        if known_vectors:
            sample_vector[:len(known_vectors)] = np.asarray(known_vectors)

        # Serialise the vector of every token position into one space-separated string.
        for position, vector in enumerate(sample_vector.tolist()):
            dictionary[str(position)].append(" ".join(str(itm) for itm in vector))

    dataset = pd.DataFrame(dictionary)

    return dataset

#Function to read the encoded dataset and convert it into a tensor
def convert_data(csv_file):
    '''Turn an encoded DataFrame (cells are space-separated numbers) into a float64 tensor.

    Every row of `csv_file` becomes one sample and every cell of that row becomes one
    vector of floats, so the result is indexed as [row, column, vector component].
    '''

    samples = []
    row_count = csv_file.shape[0]
    for row_idx in range(row_count):
        # All column entries of the current row, in column order.
        cells = list(csv_file.iloc[row_idx, :])

        # Split each entry on single spaces and parse the pieces as floats.
        vectors = [[float(token) for token in cell.split(" ")] for cell in cells]

        # Same array -> nested list round trip as before adding the row to the input list.
        samples.append(np.array(vectors).tolist())

    # Stack all rows and hand the result to TensorFlow as a float64 tensor.
    return tf.convert_to_tensor(np.array(samples), dtype=tf.float64)


#Bert Tokenizer
def bert_preprocessing(input_text, tokenizer, max_length=113):
    '''
    Encodes `input_text` with `tokenizer.encode_plus` and returns its result unchanged.

    The text is encoded with special tokens added, then padded or truncated to exactly
    `max_length` tokens; an attention mask is requested and PyTorch tensors are requested
    (return_tensors = 'pt'), so the fields are tensors rather than Python lists.

    Parameters:
      - input_text: the text to encode
      - tokenizer: an object exposing `encode_plus` (presumably a Hugging Face BERT tokenizer)
      - max_length: length every encoding is padded/truncated to (default 113)

    Returns (for a Hugging Face tokenizer) a BatchEncoding with fields such as:
      - input_ids: token ids
      - token_type_ids: token type ids (when the tokenizer emits them)
      - attention_mask: 0/1 mask specifying which tokens should be considered by the model
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens = True,
        max_length = max_length,
        padding='max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt')

#CodeBERT Tokenizer
def codebert_preprocessing(input_text, tokenizer, max_length=113):
    '''
    Encodes `input_text` with `tokenizer.encode_plus` and returns its result unchanged.

    The text is encoded with special tokens added, then padded or truncated to exactly
    `max_length` tokens; an attention mask is requested and PyTorch tensors are requested
    (return_tensors = 'pt'), so the fields are tensors rather than Python lists.

    Parameters:
      - input_text: the text to encode
      - tokenizer: an object exposing `encode_plus` (presumably the CodeBERT tokenizer)
      - max_length: length every encoding is padded/truncated to (default 113)

    Returns (for a Hugging Face tokenizer) a BatchEncoding with fields such as:
      - input_ids: token ids
      - token_type_ids: token type ids (only when the tokenizer emits them - TODO confirm for CodeBERT)
      - attention_mask: 0/1 mask specifying which tokens should be considered by the model
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens = True,
        max_length = max_length,
        padding='max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt')


#Defining Evaluation Metrics
def rnn_tp(preds, labels):
    '''Returns True Positives (TP): count of samples predicted 'COHERENT' whose label is 'COHERENT'.'''
    return sum(pred == label and pred == 'COHERENT' for pred, label in zip(preds, labels))

def rnn_fp(preds, labels):
    '''Returns False Positives (FP): count of samples predicted 'COHERENT' whose label differs.'''
    return sum(pred != label and pred == 'COHERENT' for pred, label in zip(preds, labels))

def rnn_tn(preds, labels):
    '''Returns True Negatives (TN): count of samples predicted 'NOT_COHERENT' whose label is 'NOT_COHERENT'.'''
    return sum(pred == label and pred == 'NOT_COHERENT' for pred, label in zip(preds, labels))

def rnn_fn(preds, labels):
    '''Returns False Negatives (FN): count of samples predicted 'NOT_COHERENT' whose label differs.'''
    return sum(pred != label and pred == 'NOT_COHERENT' for pred, label in zip(preds, labels))

def rnn_metrics(preds, labels):
    '''
    Computes classification metrics with 'COHERENT' as the positive class.

    Parameters:
    - preds:  one row of class scores per sample; the index of the highest score is the
              predicted class (index 0 -> 'COHERENT', index 1 -> 'NOT_COHERENT')
    - labels: the true class names ('COHERENT' / 'NOT_COHERENT'), one per sample

    Returns the tuple (accuracy, precision, recall, specificity, f1_score) where:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
    - f1_score    = 2 * precision * recall / (precision + recall)

    A ratio whose denominator is zero is returned as float('nan') (not as the string 'nan'),
    so that every returned value can be formatted with '{:.4f}'. The F1 score is nan when
    precision or recall is undefined, and 0.0 when both of them are 0.
    '''

    stat_dict = {0: 'COHERENT', 1: 'NOT_COHERENT'}
    # Predicted class = position of the (first) highest score of every row.
    prediction_class = [stat_dict[list(row).index(max(list(row)))] for row in preds]

    tp = rnn_tp(prediction_class, labels)
    tn = rnn_tn(prediction_class, labels)
    fp = rnn_fp(prediction_class, labels)
    fn = rnn_fn(prediction_class, labels)

    undefined = float('nan')
    total = len(labels)
    b_accuracy = (tp + tn) / total if total > 0 else undefined
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else undefined
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else undefined
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else undefined

    if (tp + fp) == 0 or (tp + fn) == 0:
        # Precision or recall is undefined, so F1 is undefined as well.
        f1_score = undefined
    elif tp == 0:
        # Precision and recall are both 0: the harmonic mean would divide by zero.
        f1_score = 0.0
    else:
        f1_score = 2*((b_precision*b_recall)/(b_precision+b_recall))
    return b_accuracy, b_precision, b_recall, b_specificity, f1_score

def b_tp(preds, labels):
    '''Returns True Positives (TP): count of samples predicted as class 0 whose label is 0.'''
    return sum(pred == label and pred == 0 for pred, label in zip(preds, labels))

def b_fp(preds, labels):
    '''Returns False Positives (FP): count of samples predicted as class 0 whose label differs.'''
    return sum(pred != label and pred == 0 for pred, label in zip(preds, labels))

def b_tn(preds, labels):
    '''Returns True Negatives (TN): count of samples predicted as class 1 whose label is 1.'''
    return sum(pred == label and pred == 1 for pred, label in zip(preds, labels))

def b_fn(preds, labels):
    '''Returns False Negatives (FN): count of samples predicted as class 1 whose label differs.'''
    return sum(pred != label and pred == 1 for pred, label in zip(preds, labels))

def b_metrics(preds, labels):
    '''
    Computes classification metrics with class 0 as the positive class.

    Parameters:
    - preds:  array of class scores, one row per sample; np.argmax along axis 1 gives the
              predicted class
    - labels: array of true class ids (0 / 1); it is flattened before use

    Returns the tuple (accuracy, precision, recall, specificity, f1_score) where:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
    - f1_score    = 2 * precision * recall / (precision + recall)

    A ratio whose denominator is zero is returned as float('nan') (not as the string 'nan'),
    so that every returned value can be formatted with '{:.4f}'. The F1 score is nan when
    precision or recall is undefined, and 0.0 when both of them are 0.
    '''
    preds = np.argmax(preds, axis = 1).flatten()
    labels = labels.flatten()
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)

    undefined = float('nan')
    total = len(labels)
    b_accuracy = (tp + tn) / total if total > 0 else undefined
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else undefined
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else undefined
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else undefined

    if (tp + fp) == 0 or (tp + fn) == 0:
        # Precision or recall is undefined, so F1 is undefined as well.
        f1_score = undefined
    elif tp == 0:
        # Precision and recall are both 0: the harmonic mean would divide by zero.
        f1_score = 0.0
    else:
        f1_score = 2*((b_precision*b_recall)/(b_precision+b_recall))
    return b_accuracy, b_precision, b_recall, b_specificity, f1_score

---

In [None]:
%%time
#Saving all datasets to in encoded format to reduce experiment time.
files = ['Benchmark_Raw_Data_2.csv','CoffeeMaker_Raw_Data_2.csv','JFreeChart060_Raw_Data_2.csv',
           'JFreeChart071_Raw_Data_2.csv','JHotDraw741_Raw_Data_2.csv']
folders = ['Benchmark','CoffeeMaker','JFreeChart060','JFreeChart071','JHotDraw741']
parent_path = '/kaggle/input/sourcesniffer/SourceSniffer'

#concatenating all other datasets
for i in range(len(files)):
    curr_path = parent_path+'/'+folders[i]+'/'+files[i]
    save_path = '/kaggle/working/'+folders[i]+'.csv'
    temp_df = pd.read_csv(curr_path, encoding='latin-1')
    temp_df.drop(['Unnamed: 0','Index'], axis=1, inplace=True)
    
    #Dataset Preprocessing (Cleaning)
    temp_df['Code and Comment'] = temp_df['Code and Comment'].str.lower()
    temp_df['Code and Comment'] = temp_df['Code and Comment'].apply(clean_code)
    #g_codes2.head(1)

    #Dataset Preprocessing (Assigning non-null entries to new variable)
    temp_df = temp_df[["Code and Comment","Label"]][~temp_df['Label'].isnull()]
    temp_comm2 = temp_df['Code and Comment']

    #Dataset Preprocessing (Code-Comment Vector-Encoding)
    temp_Dataset = modified_sent_vect(temp_comm2)
    temp_Dataset["Label"] = temp_df["Label"]
    
    temp_Dataset.to_csv(save_path)
    print('{} dataset has been saved!'.format(files[i]))

In [None]:
# Dataset size check: print the shape of every raw dataset and of their running concatenation.
files = ['Benchmark_Raw_Data_2.csv','CoffeeMaker_Raw_Data_2.csv','JFreeChart060_Raw_Data_2.csv',
           'JFreeChart071_Raw_Data_2.csv','JHotDraw741_Raw_Data_2.csv']
folders = ['Benchmark','CoffeeMaker','JFreeChart060','JFreeChart071','JHotDraw741']
parent_path = '/kaggle/input/sourcesniffer/SourceSniffer'

# Full path of every raw file: <parent>/<folder>/<file>.
dataset_paths = ['/'.join((parent_path, folder, file_name)) for folder, file_name in zip(folders, files)]

# The first dataset seeds the combined frame.
first_path = dataset_paths[0]
temp_codes = pd.read_csv(first_path, encoding='latin-1')
print(temp_codes.shape)

# Append every remaining dataset and report its own shape next to the combined shape.
for curr_path in dataset_paths[1:]:
    temp_df = pd.read_csv(curr_path, encoding='latin-1')
    temp_codes = pd.concat([temp_codes, temp_df], axis=0)
    print(temp_df.shape, temp_codes.shape)

---

## Gensim + LSTM Bidirectional

#### Load Word2Vec Embeddings

In [None]:
%%time
#Corpus, Dictionary etc declaration

wv = KeyedVectors.load('/kaggle/input/gensimcorpus-googlenews300/word2vec-google-news-300/word2vec-google-news-300')

In [None]:
%%time
stop_words = set(stopwords.words('english'))
ps = nltk.PorterStemmer()

### Benchmark

#### Loading Dataset and Data Preprocessing

In [None]:
%%time
#Encoded Dataset Importation
gcoh_data1 = pd.read_csv('/kaggle/input/encoded-datasets/Benchmark.csv')
gcoh_data1.drop(['Unnamed: 0'], axis=1, inplace=True)
gcoh_data1.shape

# Creating a dataframe with 80% values of original dataframe "Train Set"
g_training1 = gcoh_data1.sample(frac = 0.80, random_state=2)
g_trainlabel1 = g_training1['Label']

# Creating dataframe with rest of the 20% values "Test Set"
g_testing1 = gcoh_data1.drop(g_training1.index)
g_testlabel1 = g_testing1['Label']

#Dataset Preprocessing (Label Encoding)
g_training1 = convert_data(g_training1.iloc[:,:45])
g_testing1 = convert_data(g_testing1.iloc[:,:45])

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(g_trainlabel1)
gnum_classes1 = 2

g_label1 = np.array(gcoh_stat1)
g_label1 = to_categorical(g_label1, gnum_classes1)
print(g_training1.shape, g_label1.shape)
print(g_testing1.shape, g_testlabel1.shape)

#Identify which label numerical code correspond to which categories
print(collections.Counter(gcoh_stat1), collections.Counter(g_trainlabel1))

#### Model Training and Evaluation

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model1 = Sequential([
                SimpleRNN(node, input_shape = (g_training1.shape[1], g_training1.shape[2]), return_sequences = False),
                Dense(gnum_classes1, activation='softmax'),
            ])

            #Model Training
            g_epochs1 = epoch
            g_model1.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist1 = g_model1.fit(g_training1, g_label1, epochs = g_epochs1, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist1.history["loss"][g_epochs1 - 1], g_hist1.history["accuracy"][g_epochs1 - 1]))
            print('\n')

            #Model Evaluation
            pred1 = g_model1.predict(g_testing1, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred1, g_testlabel1)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model1_5 = Sequential([
                LSTM(node, input_shape = (g_training1.shape[1], g_training1.shape[2]), return_sequences = False),
                Dense(gnum_classes1, activation='softmax'),
            ])

            #Model Training
            g_epochs1_5 = epoch
            g_model1_5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist1_5 = g_model1_5.fit(g_training1, g_label1, epochs = g_epochs1_5, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist1_5.history["loss"][g_epochs1_5 - 1], g_hist1_5.history["accuracy"][g_epochs1_5 - 1]))
            print('\n')

            #Model Evaluation
            pred1_5 = g_model1_5.predict(g_testing1, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred1_5, g_testlabel1)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

### CoffeeMaker

#### Loading Dataset and Data Preprocessing

In [None]:
%%time
#Encoded Dataset Importation
gcoh_data2 = pd.read_csv('/kaggle/input/encoded-datasets-75maxlength/Max_Length_75/CoffeeMaker.csv')
gcoh_data2.drop(['Unnamed: 0'], axis=1, inplace=True)
gcoh_data2.shape

# Creating a dataframe with 80% values of original dataframe "Train Set"
g_training2 = gcoh_data2.sample(frac = 0.80)
g_trainlabel2 = g_training2['Label']

# Creating dataframe with rest of the 20% values "Test Set"
g_testing2 = gcoh_data2.drop(g_training2.index)
g_testlabel2 = g_testing2['Label']

#Dataset Preprocessing (Label Encoding)
g_training2 = convert_data(g_training2.iloc[:,:45])
g_testing2 = convert_data(g_testing2.iloc[:,:45])

le = preprocessing.LabelEncoder()
gcoh_stat2 = le.fit_transform(g_trainlabel2)
gnum_classes2 = 2

g_label2 = np.array(gcoh_stat2)
g_label2 = to_categorical(g_label2, gnum_classes2)
print(g_training2.shape, g_label2.shape)
print(g_testing2.shape, g_testlabel2.shape)

#Identify which label numerical code correspond to which categories
print(collections.Counter(gcoh_stat2), collections.Counter(g_trainlabel2))

#### Model Training and Evaluation

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model2 = Sequential([
                SimpleRNN(node, input_shape = (g_training2.shape[1], g_training2.shape[2]), return_sequences = False),
                Dense(gnum_classes2, activation='softmax'),
            ])

            #Model Training and Evaluation
            g_epochs2 = epoch
            g_model2.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist2 = g_model2.fit(g_training2, g_label2, epochs = g_epochs2, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist2.history["loss"][g_epochs2 - 1], g_hist2.history["accuracy"][g_epochs2 - 1]))
            print('\n')

            #Model Evaluation
            pred2 = g_model2.predict(g_testing2, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred2, g_testlabel2)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('Validation Specificity: {:.4f}'.format(rnn_specificity))
            print('F1 Score: {:.4f}\n'.format(f1_score))

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model2_5 = Sequential([
                LSTM(node, input_shape = (g_training2.shape[1], g_training2.shape[2]), return_sequences = False),
                Dense(gnum_classes2, activation='softmax'),
            ])
            #Model Training and Evaluation
            g_epochs2_5 = epoch
            g_model2_5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist2_5 = g_model2_5.fit(g_training2, g_label2, epochs = g_epochs2_5, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist2_5.history["loss"][g_epochs2_5 - 1], g_hist2_5.history["accuracy"][g_epochs2_5 - 1]))
            print('\n')

            #Model Evaluation
            pred2_5 = g_model2_5.predict(g_testing2, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred2_5, g_testlabel2)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('Validation Specificity: {:.4f}'.format(rnn_specificity))
            print('F1 Score: {:.4f}\n'.format(f1_score))

### JFreeChart060

#### Loading Dataset and Data Preprocessing

In [None]:
%%time
#Encoded Dataset Importation
gcoh_data3 = pd.read_csv('/kaggle/input/encoded-datasets-75maxlength/Max_Length_75/JFreeChart060.csv')
gcoh_data3.drop(['Unnamed: 0'], axis=1, inplace=True)
gcoh_data3.shape

# Creating a dataframe with 80% values of original dataframe "Train Set"
g_training3 = gcoh_data3.sample(frac = 0.80, random_state=2)
g_trainlabel3 = g_training3['Label']

# Creating dataframe with rest of the 20% values "Test Set"
g_testing3 = gcoh_data3.drop(g_training3.index)
g_testlabel3 = g_testing3['Label']

#Dataset Preprocessing (Label Encoding)
g_training3 = convert_data(g_training3.iloc[:,:45])
g_testing3 = convert_data(g_testing3.iloc[:,:45])

le = preprocessing.LabelEncoder()
gcoh_stat3 = le.fit_transform(g_trainlabel3)
gnum_classes3 = 2

g_label3 = np.array(gcoh_stat3)
g_label3 = to_categorical(g_label3, gnum_classes3)
print(g_training3.shape, g_label3.shape)
print(g_testing3.shape, g_testlabel3.shape)

#Identify which label numerical code correspond to which categories
print(collections.Counter(gcoh_stat3), collections.Counter(g_trainlabel3))

#### Model Training and Evaluation

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model3 = Sequential([
                SimpleRNN(node, input_shape = (g_training3.shape[1], g_training3.shape[2]), return_sequences = False),
                Dense(gnum_classes3, activation='softmax'),
            ])

            #Model Training and Evaluation
            g_epochs3 = epoch
            g_model3.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist3 = g_model3.fit(g_training3, g_label3, epochs = g_epochs3, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist3.history["loss"][g_epochs3 - 1], g_hist3.history["accuracy"][g_epochs3 - 1]))
            print('\n')

            #Model Evaluation
            pred3 = g_model3.predict(g_testing3, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred3, g_testlabel3)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model3_5 = Sequential([
                LSTM(node, input_shape = (g_training3.shape[1], g_training3.shape[2]), return_sequences = False),
                Dense(gnum_classes3, activation='softmax'),
            ])

            #Model Training and Evaluation
            g_epochs3_5 = epoch
            g_model3_5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist3_5 = g_model3_5.fit(g_training3, g_label3, epochs = g_epochs3_5, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist3_5.history["loss"][g_epochs3_5 - 1], g_hist3_5.history["accuracy"][g_epochs3_5 - 1]))
            print('\n')

            #Model Evaluation
            pred3_5 = g_model3_5.predict(g_testing3, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred3_5, g_testlabel3)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

### JFreeChart071

#### Loading Dataset and Data Preprocessing

In [None]:
%%time
#Encoded Dataset Importation
gcoh_data4 = pd.read_csv('/kaggle/input/encoded-datasets-75maxlength/Max_Length_75/JFreeChart071.csv')
gcoh_data4.drop(['Unnamed: 0'], axis=1, inplace=True)
gcoh_data4.shape

# Creating a dataframe with 80% values of original dataframe "Train Set"
g_training4 = gcoh_data4.sample(frac = 0.80, random_state=10)
g_trainlabel4 = g_training4['Label']

# Creating dataframe with rest of the 20% values "Test Set"
g_testing4 = gcoh_data4.drop(g_training4.index)
g_testlabel4 = g_testing4['Label']

#Dataset Preprocessing (Label Encoding)
g_training4 = convert_data(g_training4.iloc[:,:45])
g_testing4 = convert_data(g_testing4.iloc[:,:45])

le = preprocessing.LabelEncoder()
gcoh_stat4 = le.fit_transform(g_trainlabel4)
gnum_classes4 = 2

g_label4 = np.array(gcoh_stat4)
g_label4 = to_categorical(g_label4, gnum_classes4)
print(g_training4.shape, g_label4.shape)
print(g_testing4.shape, g_testlabel4.shape)

#Identify which label numerical code correspond to which categories
print(collections.Counter(gcoh_stat4), collections.Counter(g_trainlabel4))

#### Model Training and Evaluation

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model4 = Sequential([
                SimpleRNN(node, input_shape = (g_training4.shape[1], g_training4.shape[2]), return_sequences = False),
                Dense(gnum_classes4, activation='softmax'),
            ])

            #Model Training and Evaluation
            g_epochs4 = epoch
            g_model4.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist4 = g_model4.fit(g_training4, g_label4, epochs = g_epochs4, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist4.history["loss"][g_epochs4 - 1], g_hist4.history["accuracy"][g_epochs4 - 1]))
            print('\n')

            #Model Evaluation
            pred4 = g_model4.predict(g_testing4, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred4, g_testlabel4)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

In [None]:
%%time
#Nodes: 100, Lr: 0.0005, Epochs: 40-50
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [10, 20, 30]

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation
            g_model4_5 = Sequential([
                LSTM(node, input_shape = (g_training4.shape[1], g_training4.shape[2]), return_sequences = False),
                Dense(gnum_classes4, activation='softmax'),
            ])

            #Model Training and Evaluation
            g_epochs4_5 = epoch
            g_model4_5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist4_5 = g_model4_5.fit(g_training4, g_label4, epochs = g_epochs4_5, batch_size = 50, validation_split=False, verbose=False)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            print('47/47 [==============================] - loss: {} - accuracy: {}'.format(g_hist4_5.history["loss"][g_epochs4_5 - 1], g_hist4_5.history["accuracy"][g_epochs4_5 - 1]))
            print('\n')

            #Model Evaluation
            pred4_5 = g_model4_5.predict(g_testing4, verbose=False)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred4_5, g_testlabel4)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

### JHotDraw741

#### Loading Dataset and Data Preprocessing

In [None]:
%%time
#Encoded Dataset Importation
#Load the encoded JHotDraw741 table and discard the 'Unnamed: 0' column.
gcoh_data5 = pd.read_csv('/kaggle/input/encoded-datasets/JHotDraw741.csv').drop(columns=['Unnamed: 0'])

#Train Set: a seeded 80% sample of the rows (random_state = 2)
g_training5 = gcoh_data5.sample(frac = 0.80, random_state = 2)

#Test Set: the remaining 20%, i.e. every row whose index label was not sampled
g_testing5 = gcoh_data5.drop(index = g_training5.index)

#Keep the 'Label' column of both splits before the frames are converted
g_trainlabel5, g_testlabel5 = g_training5['Label'], g_testing5['Label']

#Dataset Preprocessing: pass the first 45 columns of each split through convert_data
g_training5, g_testing5 = (convert_data(part.iloc[:, :45]) for part in (g_training5, g_testing5))

#Label Encoding: integer-code the training labels, then one-hot encode them
gnum_classes5 = 2
le = preprocessing.LabelEncoder()
gcoh_stat5 = le.fit_transform(g_trainlabel5)
g_label5 = to_categorical(np.array(gcoh_stat5), gnum_classes5)

print(g_training5.shape, g_label5.shape)
print(g_testing5.shape, g_testlabel5.shape)

#Show which numeric code corresponds to which label category
print(collections.Counter(gcoh_stat5), collections.Counter(g_trainlabel5))

#### Model Training and Evaluation

In [None]:
%%time
#SimpleRNN hyperparameter grid search on the JHotDraw741 split: every combination of epochs,
#hidden nodes and learning rate trains a fresh model and is scored on the held-out set.
#NOTE(review): the original note read "Nodes: 100, Lr: 0.0005, Epochs: 40-50" -- presumably the
#best setting found in an earlier run; confirm.
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]
g_batch5 = 50

#Mini-batches per epoch (ceiling division). All training rows are used because validation_split
#is 0. The progress line below previously printed a hard-coded "47/47".
g_steps5 = (g_training5.shape[0] + g_batch5 - 1) // g_batch5

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation: one recurrent layer followed by a softmax classifier
            g_model5 = Sequential([
                SimpleRNN(node, input_shape = (g_training5.shape[1], g_training5.shape[2]), return_sequences = False),
                Dense(gnum_classes5, activation='softmax'),
            ])

            #Model Training (silent, no validation split)
            g_epochs5 = epoch
            g_model5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist5 = g_model5.fit(g_training5, g_label5, epochs = g_epochs5, batch_size = g_batch5, validation_split = 0.0, verbose = 0)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            #Report the loss/accuracy of the final epoch
            print('{0}/{0} [==============================] - loss: {1} - accuracy: {2}'.format(g_steps5, g_hist5.history["loss"][-1], g_hist5.history["accuracy"][-1]))
            print('\n')

            #Model Evaluation on the held-out set (rnn_specificity is returned but not printed)
            pred5 = g_model5.predict(g_testing5, verbose = 0)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred5, g_testlabel5)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

In [None]:
%%time
#LSTM hyperparameter grid search on the JHotDraw741 split: every combination of epochs,
#hidden nodes and learning rate trains a fresh model and is scored on the held-out set.
#NOTE(review): the original note read "Nodes: 100, Lr: 0.0005, Epochs: 40-50" -- presumably the
#best setting found in an earlier run; confirm.
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [20, 30, 40, 50]
g_batch5_5 = 50

#Mini-batches per epoch (ceiling division). All training rows are used because validation_split
#is 0. The progress line below previously printed a hard-coded "47/47".
g_steps5_5 = (g_training5.shape[0] + g_batch5_5 - 1) // g_batch5_5

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation: one recurrent layer followed by a softmax classifier
            g_model5_5 = Sequential([
                LSTM(node, input_shape = (g_training5.shape[1], g_training5.shape[2]), return_sequences = False),
                Dense(gnum_classes5, activation='softmax'),
            ])

            #Model Training (silent, no validation split)
            g_epochs5_5 = epoch
            g_model5_5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist5_5 = g_model5_5.fit(g_training5, g_label5, epochs = g_epochs5_5, batch_size = g_batch5_5, validation_split = 0.0, verbose = 0)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            #Report the loss/accuracy of the final epoch
            print('{0}/{0} [==============================] - loss: {1} - accuracy: {2}'.format(g_steps5_5, g_hist5_5.history["loss"][-1], g_hist5_5.history["accuracy"][-1]))
            print('\n')

            #Model Evaluation on the held-out set (rnn_specificity is returned but not printed)
            pred5_5 = g_model5_5.predict(g_testing5, verbose = 0)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred5_5, g_testlabel5)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

### All Datasets Combined

#### Loading Dataset and Data Preprocessing

In [None]:
%%time
#Encoded Dataset Importation
files = ['Benchmark.csv','CoffeeMaker.csv','JFreeChart060.csv','JFreeChart071.csv','JHotDraw741.csv']
parent_path = '/kaggle/input/encoded-datasets'

#Read every encoded dataset and stack them with a single concat call.
#ignore_index=True gives the combined frame a unique 0..n-1 index. Each CSV is read with its own
#0-based index, so without it the row labels repeat across files and the label-based drop() below
#would also remove held-out rows that merely share a label with a sampled training row, leaving a
#test set much smaller than the intended 20%.
g_codes6 = pd.concat(
    [pd.read_csv(parent_path+'/'+file_name) for file_name in files],
    axis=0,
    ignore_index=True)
g_codes6.drop(['Unnamed: 0'], axis=1, inplace=True)

# Creating a dataframe with 80% values of original dataframe "Train Set"
g_training6 = g_codes6.sample(frac = 0.80, random_state = 2)
g_trainlabel6 = g_training6['Label']

# Creating dataframe with rest of the 20% values "Test Set"
# (safe now that every row has a distinct index label)
g_testing6 = g_codes6.drop(g_training6.index)
g_testlabel6 = g_testing6['Label']

#Dataset Preprocessing: pass the first 45 columns of each split through convert_data
g_training6 = convert_data(g_training6.iloc[:,:45])
g_testing6 = convert_data(g_testing6.iloc[:,:45])

#Label Encoding: integer-code the training labels, then one-hot encode them
le = preprocessing.LabelEncoder()
gcoh_stat6 = le.fit_transform(g_trainlabel6)
gnum_classes6 = 2

g_label6 = np.array(gcoh_stat6)
g_label6 = to_categorical(g_label6, gnum_classes6)
print(g_training6.shape, g_label6.shape)
print(g_testing6.shape, g_testlabel6.shape)

#Identify which label numerical code correspond to which categories
print(collections.Counter(gcoh_stat6), collections.Counter(g_trainlabel6))

#### Model Training and Evaluation

In [None]:
%%time
#SimpleRNN hyperparameter grid search on the combined datasets: every combination of epochs,
#hidden nodes and learning rate trains a fresh model and is scored on the held-out set.
#NOTE(review): the original note read "Nodes: 100, Lr: 0.0005, Epochs: 40-50" -- presumably the
#best setting found in an earlier run; confirm.
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]
g_batch6 = 50

#Mini-batches per epoch (ceiling division). All training rows are used because validation_split
#is 0. The progress line below previously printed a hard-coded "47/47".
g_steps6 = (g_training6.shape[0] + g_batch6 - 1) // g_batch6

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation: one recurrent layer followed by a softmax classifier
            g_model6 = Sequential([
                SimpleRNN(node, input_shape = (g_training6.shape[1], g_training6.shape[2]), return_sequences = False),
                Dense(gnum_classes6, activation='softmax'),
            ])

            #Model Training (silent, no validation split)
            g_epochs6 = epoch
            g_model6.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist6 = g_model6.fit(g_training6, g_label6, epochs = g_epochs6, batch_size = g_batch6, validation_split = 0.0, verbose = 0)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            #Report the loss/accuracy of the final epoch
            print('{0}/{0} [==============================] - loss: {1} - accuracy: {2}'.format(g_steps6, g_hist6.history["loss"][-1], g_hist6.history["accuracy"][-1]))
            print('\n')

            #Model Evaluation on the held-out set (rnn_specificity is returned but not printed)
            pred6 = g_model6.predict(g_testing6, verbose = 0)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred6, g_testlabel6)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

In [None]:
%%time
#LSTM hyperparameter grid search on the combined datasets: every combination of epochs,
#hidden nodes and learning rate trains a fresh model and is scored on the held-out set.
#NOTE(review): the original note read "Nodes: 100, Lr: 0.0005, Epochs: 40-50" -- presumably the
#best setting found in an earlier run; confirm.
lr_list = [0.0005, 0.00075, 0.001]
nodes = [100, 200, 300]
epochs = [30, 40, 50]
g_batch6_5 = 50

#Mini-batches per epoch (ceiling division). All training rows are used because validation_split
#is 0. The progress line below previously printed a hard-coded "47/47".
g_steps6_5 = (g_training6.shape[0] + g_batch6_5 - 1) // g_batch6_5

for epoch in epochs:
    for node in nodes:
        for lr in lr_list:
            #Model Object Initiation: one recurrent layer followed by a softmax classifier
            g_model6_5 = Sequential([
                LSTM(node, input_shape = (g_training6.shape[1], g_training6.shape[2]), return_sequences = False),
                Dense(gnum_classes6, activation='softmax'),
            ])

            #Model Training (silent, no validation split)
            g_epochs6_5 = epoch
            g_model6_5.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate=lr), metrics = ['accuracy'])
            g_hist6_5 = g_model6_5.fit(g_training6, g_label6, epochs = g_epochs6_5, batch_size = g_batch6_5, validation_split = 0.0, verbose = 0)
            print('Epochs Complete: Epochs:{}, Hidden Nodes:{}, Learning Rate:{}.'.format(epoch, node, lr))
            #Report the loss/accuracy of the final epoch
            print('{0}/{0} [==============================] - loss: {1} - accuracy: {2}'.format(g_steps6_5, g_hist6_5.history["loss"][-1], g_hist6_5.history["accuracy"][-1]))
            print('\n')

            #Model Evaluation on the held-out set (rnn_specificity is returned but not printed)
            pred6_5 = g_model6_5.predict(g_testing6, verbose = 0)
            rnn_accuracy, rnn_precision, rnn_recall, rnn_specificity, f1_score = rnn_metrics(pred6_5, g_testlabel6)
            print('Validation Accuracy: {:.4f}'.format(rnn_accuracy))
            print('Validation Precision: {:.4f}'.format(rnn_precision))
            print('Validation Recall: {:.4f}'.format(rnn_recall))
            print('F1 Score: {:.4f}\n'.format(f1_score))

---

## BERT and CodeBERT Sentence Prediction

#### Load Tokenizer

In [None]:
%%time
#Load the pretrained tokenizers used by the BERT and CodeBERT sections.

#BERT: 'bert-base-uncased' vocabulary with do_lower_case enabled
b_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True,
)

#CodeBERT: tokenizer class resolved by AutoTokenizer from the "microsoft/codebert-base" checkpoint
cb_tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/codebert-base",
)

### Benchmark

#### Load Dataset, Data Preprocessing and Model Training and Evaluation

In [None]:
%%time
#Dataset Importation
#Read the raw Benchmark table (latin-1 encoded) and drop the 'Unnamed: 0' and 'Index' columns.
codes1 = pd.read_csv(
    "/kaggle/input/sourcesniffer/SourceSniffer/Benchmark/Benchmark_Raw_Data_2.csv",
    encoding='latin-1',
)
codes1 = codes1.drop(columns=['Unnamed: 0', 'Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then run it through clean_code
codes1['Code and Comment'] = codes1['Code and Comment'].str.lower().apply(clean_code)

#Dataset Preprocessing: keep only the rows that carry a label, and only the two columns used below
coh_data1 = codes1.loc[codes1['Label'].notnull(), ["Code and Comment", "Label"]]
code_comm1 = coh_data1['Code and Comment']

#Dataset Preprocessing (Label Encoding)
le = preprocessing.LabelEncoder()
coh_stat1 = le.fit_transform(coh_data1['Label'])

#Show which numeric code corresponds to which label category
print(collections.Counter(coh_stat1), collections.Counter(coh_data1['Label']))

#### BERT

In [None]:
%%time
#BERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
#Encode every sample once; each encoding is indexed by 'input_ids' and 'attention_mask'.
b_encodings1 = [bert_preprocessing(sample, b_tokenizer) for sample in code_comm1]

#Stack the per-sample tensors along dim 0 so that row i belongs to sample i
b_token_id1 = torch.cat([enc['input_ids'] for enc in b_encodings1], dim = 0)
b_attention_masks1 = torch.cat([enc['attention_mask'] for enc in b_encodings1], dim = 0)
b_labels1 = torch.tensor(coh_stat1)

#BERT
#Splitting Dataset into training and validation set, and loading into batches.
b_val_ratio1 = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
b_batch_size1 = 16

# Indices of the train and validation splits stratified by labels.
# random_state pins the shuffle so the split (and the metrics reported on it) does not depend on
# global RNG state or on how often the cell is re-run; 2 matches the seed of the .sample() splits
# used for the RNN models.
b_train_idx1, b_val_idx1 = train_test_split(
    np.arange(len(b_labels1)),
    test_size = b_val_ratio1,
    shuffle = True,
    stratify = b_labels1,
    random_state = 2)

# Train and validation sets: (token ids, attention mask, label) triples
b_train_set1 = TensorDataset(b_token_id1[b_train_idx1],
                          b_attention_masks1[b_train_idx1],
                          b_labels1[b_train_idx1])

b_val_set1 = TensorDataset(b_token_id1[b_val_idx1],
                        b_attention_masks1[b_val_idx1],
                        b_labels1[b_val_idx1])

# Prepare DataLoader: random order for training, fixed order for validation
b_train_dataloader1 = DataLoader(
            b_train_set1,
            sampler = RandomSampler(b_train_set1),
            batch_size = b_batch_size1)

b_validation_dataloader1 = DataLoader(
            b_val_set1,
            sampler = SequentialSampler(b_val_set1),
            batch_size = b_batch_size1)

In [None]:
%%time
#BERT
#Model Object Initiation
#Load BertForSequenceClassification with a two-label classification head
b_model1 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
b_optimizer1 = torch.optim.AdamW(b_model1.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Put the model on the same device the batches are moved to in the loops below.
#(A hard-coded .cuda() raises when no GPU is present and can disagree with `device`.)
b_model1.to(device)

#BERT
#Model Training and Evaluation
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
b_epochs1 = 3

for _ in trange(b_epochs1, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    b_model1.train()

    # Per-epoch running totals: summed batch loss, examples seen, optimizer steps taken
    b_tr_loss1 = 0
    nb_tr_examples1, nb_tr_steps1 = 0, 0

    for batch in b_train_dataloader1:
        batch = tuple(t.to(device) for t in batch)
        # NOTE: b_labels1 is rebound to the current batch's labels from here on
        b_input_ids1, b_input_mask1, b_labels1 = batch
        b_optimizer1.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        b_train_output1 = b_model1(b_input_ids1,
                             token_type_ids = None,
                             attention_mask = b_input_mask1,
                             labels = b_labels1)
        # Backward pass
        b_train_output1.loss.backward()
        b_optimizer1.step()
        # Update tracking variables
        b_tr_loss1 += b_train_output1.loss.item()
        nb_tr_examples1 += b_input_ids1.size(0)
        nb_tr_steps1 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    b_model1.eval()

    # One list per metric, holding the per-batch values that get averaged after the loop
    b_val_accuracy1, b_val_precision1, b_val_recall1, b_val_specificity1, f1_val_score1 = [], [], [], [], []

    for batch in b_validation_dataloader1:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids1, b_input_mask1, b_labels1 = batch
        with torch.no_grad():
            # Forward pass without labels: only the logits are needed
            b_eval_output1 = b_model1(b_input_ids1,
                                token_type_ids = None,
                                attention_mask = b_input_mask1)
        logits1 = b_eval_output1.logits.detach().cpu().numpy()
        label_ids1 = b_labels1.to('cpu').numpy()
        # Calculate validation metrics for this batch
        b_accuracy1, b_precision1, b_recall1, b_specificity1, f1_score1 = b_metrics(logits1, label_ids1)
        b_val_accuracy1.append(b_accuracy1)
        # NOTE(review): the checks below assume b_metrics returns the string 'nan' for a metric
        # it cannot compute -- confirm against b_metrics; a float NaN would pass them.
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision1 != 'nan': b_val_precision1.append(b_precision1)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall1 != 'nan': b_val_recall1.append(b_recall1)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity1 != 'nan': b_val_specificity1.append(b_specificity1)
        # Update f1_score only when b_metrics could compute it; ignore nan
        if f1_score1 != 'nan': f1_val_score1.append(f1_score1)

    # Epoch summary: mean training loss per step and the mean of each per-batch metric
    print('\n\t - Train loss: {:.4f}'.format(b_tr_loss1 / nb_tr_steps1))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(b_val_accuracy1)/len(b_val_accuracy1)))
    print('\t - Validation Precision: {:.4f}'.format(sum(b_val_precision1)/len(b_val_precision1)) if len(b_val_precision1)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(b_val_recall1)/len(b_val_recall1)) if len(b_val_recall1)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(b_val_specificity1)/len(b_val_specificity1)) if len(b_val_specificity1)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(f1_val_score1)/len(f1_val_score1)) if len(f1_val_score1)>0 else '\t - F1 Score: NaN')

#### CodeBERT

In [None]:
%%time
#CodeBERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
#Encode every sample once; each encoding is indexed by 'input_ids' and 'attention_mask'.
cb_encodings1 = [codebert_preprocessing(sample, cb_tokenizer) for sample in code_comm1]

#Stack the per-sample tensors along dim 0 so that row i belongs to sample i
cb_token_id1 = torch.cat([enc['input_ids'] for enc in cb_encodings1], dim = 0)
cb_attention_masks1 = torch.cat([enc['attention_mask'] for enc in cb_encodings1], dim = 0)
cb_labels1 = torch.tensor(coh_stat1)

#CodeBERT
#Splitting Dataset into training and validation set, and loading into batches.
cb_val_ratio1 = 0.2
#Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
cb_batch_size1 = 16

#Indices of the train and validation splits stratified by labels.
#random_state pins the shuffle so the split (and the metrics reported on it) does not depend on
#global RNG state or on how often the cell is re-run; 2 matches the seed of the .sample() splits
#used for the RNN models.
cb_train_idx1, cb_val_idx1 = train_test_split(
    np.arange(len(cb_labels1)),
    test_size = cb_val_ratio1,
    shuffle = True,
    stratify = cb_labels1,
    random_state = 2)

#Train and validation sets: (token ids, attention mask, label) triples
cb_train_set1 = TensorDataset(cb_token_id1[cb_train_idx1],
                          cb_attention_masks1[cb_train_idx1],
                          cb_labels1[cb_train_idx1])

cb_val_set1 = TensorDataset(cb_token_id1[cb_val_idx1],
                        cb_attention_masks1[cb_val_idx1],
                        cb_labels1[cb_val_idx1])

#Prepare DataLoader: random order for training, fixed order for validation
cb_train_dataloader1 = DataLoader(
            cb_train_set1,
            sampler = RandomSampler(cb_train_set1),
            batch_size = cb_batch_size1)

cb_validation_dataloader1 = DataLoader(
            cb_val_set1,
            sampler = SequentialSampler(cb_val_set1),
            batch_size = cb_batch_size1)

In [None]:
%%time
#CodeBERT
#Model Object Initiation
#Load the sequence-classification model for the "microsoft/codebert-base" checkpoint.
#num_labels is spelled out to match the BERT model; 2 is also the library default.
cb_model1 = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels = 2)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
cb_optimizer1 = torch.optim.AdamW(cb_model1.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Put the model on the same device the batches are moved to in the loops below.
#(A hard-coded .cuda() raises when no GPU is present and can disagree with `device`.)
cb_model1.to(device)

#CodeBERT
#Model Training and Evaluation
#Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
cb_epochs1 = 3

for _ in trange(cb_epochs1, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    cb_model1.train()

    # Per-epoch running totals: summed batch loss, examples seen, optimizer steps taken
    cb_tr_loss1 = 0
    nb_tr_examples1, nb_tr_steps1 = 0, 0

    for batch in cb_train_dataloader1:
        batch = tuple(t.to(device) for t in batch)
        # NOTE: cb_labels1 is rebound to the current batch's labels from here on
        cb_input_ids1, cb_input_mask1, cb_labels1 = batch
        cb_optimizer1.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        cb_train_output1 = cb_model1(cb_input_ids1,
                             token_type_ids = None,
                             attention_mask = cb_input_mask1,
                             labels = cb_labels1)
        # Backward pass
        cb_train_output1.loss.backward()
        cb_optimizer1.step()
        # Update tracking variables
        cb_tr_loss1 += cb_train_output1.loss.item()
        nb_tr_examples1 += cb_input_ids1.size(0)
        nb_tr_steps1 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    cb_model1.eval()

    # One list per metric, holding the per-batch values that get averaged after the loop
    cb_val_accuracy1, cb_val_precision1, cb_val_recall1, cb_val_specificity1, cf1_val_score1 = [], [], [], [], []

    for batch in cb_validation_dataloader1:
        batch = tuple(t.to(device) for t in batch)
        cb_input_ids1, cb_input_mask1, cb_labels1 = batch
        with torch.no_grad():
            # Forward pass without labels: only the logits are needed
            cb_eval_output1 = cb_model1(cb_input_ids1,
                                token_type_ids = None,
                                attention_mask = cb_input_mask1)
        logits1 = cb_eval_output1.logits.detach().cpu().numpy()
        label_ids1 = cb_labels1.to('cpu').numpy()
        # Calculate validation metrics for this batch
        cb_accuracy1, cb_precision1, cb_recall1, cb_specificity1, cf1_score1 = b_metrics(logits1, label_ids1)
        cb_val_accuracy1.append(cb_accuracy1)
        # NOTE(review): the checks below assume b_metrics returns the string 'nan' for a metric
        # it cannot compute -- confirm against b_metrics; a float NaN would pass them.
        # Update precision only when (tp + fp) !=0; ignore nan
        if cb_precision1 != 'nan': cb_val_precision1.append(cb_precision1)
        # Update recall only when (tp + fn) !=0; ignore nan
        if cb_recall1 != 'nan': cb_val_recall1.append(cb_recall1)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if cb_specificity1 != 'nan': cb_val_specificity1.append(cb_specificity1)
        # Update f1_score only when b_metrics could compute it; ignore nan
        if cf1_score1 != 'nan': cf1_val_score1.append(cf1_score1)

    # Epoch summary: mean training loss per step and the mean of each per-batch metric
    print('\n\t - Train loss: {:.4f}'.format(cb_tr_loss1 / nb_tr_steps1))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(cb_val_accuracy1)/len(cb_val_accuracy1)))
    print('\t - Validation Precision: {:.4f}'.format(sum(cb_val_precision1)/len(cb_val_precision1)) if len(cb_val_precision1)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(cb_val_recall1)/len(cb_val_recall1)) if len(cb_val_recall1)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(cb_val_specificity1)/len(cb_val_specificity1)) if len(cb_val_specificity1)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(cf1_val_score1)/len(cf1_val_score1)) if len(cf1_val_score1)>0 else '\t - F1 Score: NaN')

### CoffeeMaker

#### Load Dataset, Data Preprocessing and Model Training and Evaluation

In [None]:
%%time
#Dataset Importation
#Read the raw CoffeeMaker table (latin-1 encoded) and drop the 'Unnamed: 0' and 'Index' columns.
codes2 = pd.read_csv(
    "/kaggle/input/sourcesniffer/SourceSniffer/CoffeeMaker/CoffeeMaker_Raw_Data_2.csv",
    encoding='latin-1',
)
codes2 = codes2.drop(columns=['Unnamed: 0', 'Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then run it through clean_code
codes2['Code and Comment'] = codes2['Code and Comment'].str.lower().apply(clean_code)

#Dataset Preprocessing: keep only the rows that carry a label, and only the two columns used below
coh_data2 = codes2.loc[codes2['Label'].notnull(), ["Code and Comment", "Label"]]
code_comm2 = coh_data2['Code and Comment']

#Dataset Preprocessing (Label Encoding)
le = preprocessing.LabelEncoder()
coh_stat2 = le.fit_transform(coh_data2['Label'])

#Show which numeric code corresponds to which label category
print(collections.Counter(coh_stat2), collections.Counter(coh_data2['Label']))

#### BERT

In [None]:
%%time
#BERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
#Encode every sample once; each encoding is indexed by 'input_ids' and 'attention_mask'.
b_encodings2 = [bert_preprocessing(sample, b_tokenizer) for sample in code_comm2]

#Stack the per-sample tensors along dim 0 so that row i belongs to sample i
b_token_id2 = torch.cat([enc['input_ids'] for enc in b_encodings2], dim = 0)
b_attention_masks2 = torch.cat([enc['attention_mask'] for enc in b_encodings2], dim = 0)
b_labels2 = torch.tensor(coh_stat2)

#BERT
#Splitting Dataset into training and validation set, and loading into batches.
b_val_ratio2 = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
# NOTE(review): 4 is below that range -- presumably chosen because this dataset is small; confirm.
b_batch_size2 = 4

# Indices of the train and validation splits stratified by labels.
# random_state pins the shuffle so the split (and the metrics reported on it) does not depend on
# global RNG state or on how often the cell is re-run; 2 matches the seed of the .sample() splits
# used for the RNN models.
b_train_idx2, b_val_idx2 = train_test_split(
    np.arange(len(b_labels2)),
    test_size = b_val_ratio2,
    shuffle = True,
    stratify = b_labels2,
    random_state = 2)

# Train and validation sets: (token ids, attention mask, label) triples
b_train_set2 = TensorDataset(b_token_id2[b_train_idx2],
                          b_attention_masks2[b_train_idx2],
                          b_labels2[b_train_idx2])

b_val_set2 = TensorDataset(b_token_id2[b_val_idx2],
                        b_attention_masks2[b_val_idx2],
                        b_labels2[b_val_idx2])

# Prepare DataLoader: random order for training, fixed order for validation
b_train_dataloader2 = DataLoader(
            b_train_set2,
            sampler = RandomSampler(b_train_set2),
            batch_size = b_batch_size2)

b_validation_dataloader2 = DataLoader(
            b_val_set2,
            sampler = SequentialSampler(b_val_set2),
            batch_size = b_batch_size2)

In [None]:
#BERT
#Model Object Initiation
#Load BertForSequenceClassification with a two-label classification head
b_model2 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
b_optimizer2 = torch.optim.AdamW(b_model2.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Put the model on the same device the batches are moved to in the loops below.
#(A hard-coded .cuda() raises when no GPU is present and can disagree with `device`.)
b_model2.to(device)

#Model Training and Evaluation
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
b_epochs2 = 3

for _ in trange(b_epochs2, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    b_model2.train()

    # Per-epoch running totals: summed batch loss, examples seen, optimizer steps taken
    b_tr_loss2 = 0
    nb_tr_examples2, nb_tr_steps2 = 0, 0

    for batch in b_train_dataloader2:
        batch = tuple(t.to(device) for t in batch)
        # NOTE: b_labels2 is rebound to the current batch's labels from here on
        b_input_ids2, b_input_mask2, b_labels2 = batch
        b_optimizer2.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        b_train_output2 = b_model2(b_input_ids2,
                             token_type_ids = None,
                             attention_mask = b_input_mask2,
                             labels = b_labels2)
        # Backward pass
        b_train_output2.loss.backward()
        b_optimizer2.step()
        # Update tracking variables
        b_tr_loss2 += b_train_output2.loss.item()
        nb_tr_examples2 += b_input_ids2.size(0)
        nb_tr_steps2 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    b_model2.eval()

    # One list per metric, holding the per-batch values that get averaged after the loop
    b_val_accuracy2, b_val_precision2, b_val_recall2, b_val_specificity2, f1_val_score2 = [], [], [], [], []

    for batch in b_validation_dataloader2:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids2, b_input_mask2, b_labels2 = batch
        with torch.no_grad():
            # Forward pass without labels: only the logits are needed
            b_eval_output2 = b_model2(b_input_ids2,
                                token_type_ids = None,
                                attention_mask = b_input_mask2)
        logits2 = b_eval_output2.logits.detach().cpu().numpy()
        label_ids2 = b_labels2.to('cpu').numpy()
        # Calculate validation metrics for this batch
        b_accuracy2, b_precision2, b_recall2, b_specificity2, f1_score2 = b_metrics(logits2, label_ids2)
        b_val_accuracy2.append(b_accuracy2)
        # NOTE(review): the checks below assume b_metrics returns the string 'nan' for a metric
        # it cannot compute -- confirm against b_metrics; a float NaN would pass them.
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision2 != 'nan': b_val_precision2.append(b_precision2)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall2 != 'nan': b_val_recall2.append(b_recall2)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity2 != 'nan': b_val_specificity2.append(b_specificity2)
        # Update f1_score only when b_metrics could compute it; ignore nan
        if f1_score2 != 'nan': f1_val_score2.append(f1_score2)

    # Epoch summary: mean training loss per step and the mean of each per-batch metric
    print('\n\t - Train loss: {:.4f}'.format(b_tr_loss2 / nb_tr_steps2))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(b_val_accuracy2)/len(b_val_accuracy2)))
    print('\t - Validation Precision: {:.4f}'.format(sum(b_val_precision2)/len(b_val_precision2)) if len(b_val_precision2)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(b_val_recall2)/len(b_val_recall2)) if len(b_val_recall2)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(b_val_specificity2)/len(b_val_specificity2)) if len(b_val_specificity2)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(f1_val_score2)/len(f1_val_score2)) if len(f1_val_score2)>0 else '\t - F1 Score: NaN')

#### CodeBERT

In [None]:
%%time
#CodeBERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
#Encode every sample once; each encoding is indexed by 'input_ids' and 'attention_mask'.
cb_encodings2 = [codebert_preprocessing(sample, cb_tokenizer) for sample in code_comm2]

#Stack the per-sample tensors along dim 0 so that row i belongs to sample i
cb_token_id2 = torch.cat([enc['input_ids'] for enc in cb_encodings2], dim = 0)
cb_attention_masks2 = torch.cat([enc['attention_mask'] for enc in cb_encodings2], dim = 0)
cb_labels2 = torch.tensor(coh_stat2)

#CodeBERT
#Splitting Dataset into training and validation set, and loading into batches.
cb_val_ratio2 = 0.2
#Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
#NOTE(review): 4 is below that range -- presumably chosen because this dataset is small; confirm.
cb_batch_size2 = 4

#Indices of the train and validation splits stratified by labels.
#random_state pins the shuffle so the split (and the metrics reported on it) does not depend on
#global RNG state or on how often the cell is re-run; 2 matches the seed of the .sample() splits
#used for the RNN models.
cb_train_idx2, cb_val_idx2 = train_test_split(
    np.arange(len(cb_labels2)),
    test_size = cb_val_ratio2,
    shuffle = True,
    stratify = cb_labels2,
    random_state = 2)

#Train and validation sets: (token ids, attention mask, label) triples
cb_train_set2 = TensorDataset(cb_token_id2[cb_train_idx2],
                          cb_attention_masks2[cb_train_idx2],
                          cb_labels2[cb_train_idx2])

cb_val_set2 = TensorDataset(cb_token_id2[cb_val_idx2],
                        cb_attention_masks2[cb_val_idx2],
                        cb_labels2[cb_val_idx2])

#Prepare DataLoader: random order for training, fixed order for validation
cb_train_dataloader2 = DataLoader(
            cb_train_set2,
            sampler = RandomSampler(cb_train_set2),
            batch_size = cb_batch_size2)

cb_validation_dataloader2 = DataLoader(
            cb_val_set2,
            sampler = SequentialSampler(cb_val_set2),
            batch_size = cb_batch_size2)

In [None]:
#CodeBERT
#Model Object Initiation
#Load the sequence-classification model for the "microsoft/codebert-base" checkpoint.
#num_labels is spelled out to match the BERT model; 2 is also the library default.
cb_model2 = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels = 2)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
cb_optimizer2 = torch.optim.AdamW(cb_model2.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Put the model on the same device the batches are moved to in the loops below.
#(A hard-coded .cuda() raises when no GPU is present and can disagree with `device`.)
cb_model2.to(device)

#Model Training and Evaluation
#Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
cb_epochs2 = 3

for _ in trange(cb_epochs2, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    cb_model2.train()

    # Per-epoch running totals: summed batch loss, examples seen, optimizer steps taken
    cb_tr_loss2 = 0
    nb_tr_examples2, nb_tr_steps2 = 0, 0

    for batch in cb_train_dataloader2:
        batch = tuple(t.to(device) for t in batch)
        # NOTE: cb_labels2 is rebound to the current batch's labels from here on
        cb_input_ids2, cb_input_mask2, cb_labels2 = batch
        cb_optimizer2.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        cb_train_output2 = cb_model2(cb_input_ids2,
                             token_type_ids = None,
                             attention_mask = cb_input_mask2,
                             labels = cb_labels2)
        # Backward pass
        cb_train_output2.loss.backward()
        cb_optimizer2.step()
        # Update tracking variables
        cb_tr_loss2 += cb_train_output2.loss.item()
        nb_tr_examples2 += cb_input_ids2.size(0)
        nb_tr_steps2 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    cb_model2.eval()

    # One list per metric, holding the per-batch values that get averaged after the loop
    cb_val_accuracy2, cb_val_precision2, cb_val_recall2, cb_val_specificity2, cf1_val_score2 = [], [], [], [], []

    for batch in cb_validation_dataloader2:
        batch = tuple(t.to(device) for t in batch)
        cb_input_ids2, cb_input_mask2, cb_labels2 = batch
        with torch.no_grad():
            # Forward pass without labels: only the logits are needed
            cb_eval_output2 = cb_model2(cb_input_ids2,
                                token_type_ids = None,
                                attention_mask = cb_input_mask2)
        logits2 = cb_eval_output2.logits.detach().cpu().numpy()
        label_ids2 = cb_labels2.to('cpu').numpy()
        # Calculate validation metrics for this batch
        cb_accuracy2, cb_precision2, cb_recall2, cb_specificity2, cf1_score2 = b_metrics(logits2, label_ids2)
        cb_val_accuracy2.append(cb_accuracy2)
        # NOTE(review): the checks below assume b_metrics returns the string 'nan' for a metric
        # it cannot compute -- confirm against b_metrics; a float NaN would pass them.
        # Update precision only when (tp + fp) !=0; ignore nan
        if cb_precision2 != 'nan': cb_val_precision2.append(cb_precision2)
        # Update recall only when (tp + fn) !=0; ignore nan
        if cb_recall2 != 'nan': cb_val_recall2.append(cb_recall2)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if cb_specificity2 != 'nan': cb_val_specificity2.append(cb_specificity2)
        # Update f1_score only when b_metrics could compute it; ignore nan
        if cf1_score2 != 'nan': cf1_val_score2.append(cf1_score2)

    # Epoch summary: mean training loss per step and the mean of each per-batch metric
    print('\n\t - Train loss: {:.4f}'.format(cb_tr_loss2 / nb_tr_steps2))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(cb_val_accuracy2)/len(cb_val_accuracy2)))
    print('\t - Validation Precision: {:.4f}'.format(sum(cb_val_precision2)/len(cb_val_precision2)) if len(cb_val_precision2)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(cb_val_recall2)/len(cb_val_recall2)) if len(cb_val_recall2)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(cb_val_specificity2)/len(cb_val_specificity2)) if len(cb_val_specificity2)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(cf1_val_score2)/len(cf1_val_score2)) if len(cf1_val_score2)>0 else '\t - F1 Score: NaN')

### JFreeChart060

#### Load Dataset, Data Preprocessing and Model Training and Evaluation

In [None]:
%%time
# Dataset import: read the raw JFreeChart060 CSV and discard the two index-like columns.
codes3 = pd.read_csv(
    "/kaggle/input/sourcesniffer/SourceSniffer/JFreeChart060/JFreeChart060_Raw_Data_2.csv",
    encoding='latin-1',
).drop(columns=['Unnamed: 0', 'Index'])

# Cleaning: lower-case each code/comment entry, then run it through clean_code.
codes3['Code and Comment'] = codes3['Code and Comment'].str.lower().apply(clean_code)

# Keep only the text and label columns of the rows whose Label is present.
coh_data3 = codes3.loc[codes3['Label'].notnull(), ["Code and Comment", "Label"]]

# Label encoding: integer codes for the labels, plus the matching text series.
le = preprocessing.LabelEncoder()
code_comm3 = coh_data3['Code and Comment']
coh_stat3 = le.fit_transform(coh_data3['Label'])

# Print the class counts of the encoded and the original labels side by side,
# so each numeric code can be matched to its category.
print(collections.Counter(coh_stat3), collections.Counter(coh_data3['Label']))


#### BERT

In [None]:
%%time
#BERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
# Every sample is encoded with bert_preprocessing (defined earlier in the notebook);
# the per-sample 'input_ids' and 'attention_mask' tensors are collected in lists and
# then concatenated along dim 0 into one tensor per field.

b_token_id3 = []
b_attention_masks3 = []

for sample in code_comm3:
    b_encoding_dict3 = bert_preprocessing(sample, b_tokenizer)
    b_token_id3.append(b_encoding_dict3['input_ids'])
    b_attention_masks3.append(b_encoding_dict3['attention_mask'])

b_token_id3 = torch.cat(b_token_id3, dim = 0)
b_attention_masks3 = torch.cat(b_attention_masks3, dim = 0)
b_labels3 = torch.tensor(coh_stat3)

#BERT
#Splitting Dataset into training and validation set, and loading into batches.
b_val_ratio3 = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
b_batch_size3 = 16

# Indices of the train and validation splits stratified by labels.
# random_state is fixed so the split (and therefore the reported validation
# metrics) is reproducible across kernel restarts.
b_train_idx3, b_val_idx3 = train_test_split(
    np.arange(len(b_labels3)),
    test_size = b_val_ratio3,
    shuffle = True,
    stratify = b_labels3,
    random_state = 42)

# Train and validation sets: (token ids, attention mask, label) triples
b_train_set3 = TensorDataset(b_token_id3[b_train_idx3],
                          b_attention_masks3[b_train_idx3],
                          b_labels3[b_train_idx3])

b_val_set3 = TensorDataset(b_token_id3[b_val_idx3],
                        b_attention_masks3[b_val_idx3],
                        b_labels3[b_val_idx3])

# Prepare DataLoader: random order for training, fixed order for validation
b_train_dataloader3 = DataLoader(
            b_train_set3,
            sampler = RandomSampler(b_train_set3),
            batch_size = b_batch_size3)

b_validation_dataloader3 = DataLoader(
            b_val_set3,
            sampler = SequentialSampler(b_val_set3),
            batch_size = b_batch_size3)

In [None]:
#BERT
#Model Object Initiation
#Load BertForSequenceClassification from the 'bert-base-uncased' checkpoint with a 2-label head
b_model3 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
b_optimizer3 = torch.optim.AdamW(b_model3.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Place the model on the same `device` the batches are moved to below, instead of
#hard-coding .cuda(), so the model and its inputs always share a device.
b_model3.to(device)

#Model Training and Evaluation
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
b_epochs3 = 3

for _ in trange(b_epochs3, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    b_model3.train()

    # Tracking variables (reset at the start of every epoch)
    b_tr_loss3 = 0
    nb_tr_examples3, nb_tr_steps3 = 0, 0

    for step, batch in enumerate(b_train_dataloader3):
        batch = tuple(t.to(device) for t in batch)
        # NOTE: this rebinds b_labels3 (the full label tensor built in the
        # preprocessing cell) to the labels of the current batch.
        b_input_ids3, b_input_mask3, b_labels3 = batch
        b_optimizer3.zero_grad()
        # Forward pass (passing labels makes the model output carry a loss)
        b_train_output3 = b_model3(b_input_ids3,
                             token_type_ids = None,
                             attention_mask = b_input_mask3,
                             labels = b_labels3)
        # Backward pass
        b_train_output3.loss.backward()
        b_optimizer3.step()
        # Update tracking variables
        b_tr_loss3 += b_train_output3.loss.item()
        nb_tr_examples3 += b_input_ids3.size(0)
        nb_tr_steps3 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    b_model3.eval()

    # Tracking variables: one entry per validation batch
    b_val_accuracy3, b_val_precision3, b_val_recall3, b_val_specificity3, f1_val_score3 = [], [], [], [], []

    for batch in b_validation_dataloader3:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids3, b_input_mask3, b_labels3 = batch
        with torch.no_grad():
            # Forward pass (no labels: only the logits are needed)
            b_eval_output3 = b_model3(b_input_ids3,
                                token_type_ids = None,
                                attention_mask = b_input_mask3)
        logits3 = b_eval_output3.logits.detach().cpu().numpy()
        label_ids3 = b_labels3.to('cpu').numpy()
        # Calculate validation metrics for this batch.
        # b_metrics is defined elsewhere in the notebook; it appears to signal an
        # undefined metric with the string 'nan' -- TODO confirm against its definition.
        b_accuracy3, b_precision3, b_recall3, b_specificity3, f1_score3 = b_metrics(logits3, label_ids3)
        b_val_accuracy3.append(b_accuracy3)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision3 != 'nan': b_val_precision3.append(b_precision3)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall3 != 'nan': b_val_recall3.append(b_recall3)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity3 != 'nan': b_val_specificity3.append(b_specificity3)
        # Update f1_score only when it is not 'nan'
        if f1_score3 != 'nan': f1_val_score3.append(f1_score3)

    # Per-epoch report: each metric is the unweighted mean of its per-batch values
    print('\n\t - Train loss: {:.4f}'.format(b_tr_loss3 / nb_tr_steps3))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(b_val_accuracy3)/len(b_val_accuracy3)))
    print('\t - Validation Precision: {:.4f}'.format(sum(b_val_precision3)/len(b_val_precision3)) if len(b_val_precision3)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(b_val_recall3)/len(b_val_recall3)) if len(b_val_recall3)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(b_val_specificity3)/len(b_val_specificity3)) if len(b_val_specificity3)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(f1_val_score3)/len(f1_val_score3)) if len(f1_val_score3)>0 else '\t - F1 Score: NaN')

#### CodeBERT

In [None]:
%%time
#CodeBERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
# Every sample is encoded with codebert_preprocessing (defined earlier in the notebook);
# the per-sample 'input_ids' and 'attention_mask' tensors are collected in lists and
# then concatenated along dim 0 into one tensor per field.

cb_token_id3 = []
cb_attention_masks3 = []

for sample in code_comm3:
    cb_encoding_dict3 = codebert_preprocessing(sample, cb_tokenizer)
    cb_token_id3.append(cb_encoding_dict3['input_ids'])
    cb_attention_masks3.append(cb_encoding_dict3['attention_mask'])

cb_token_id3 = torch.cat(cb_token_id3, dim = 0)
cb_attention_masks3 = torch.cat(cb_attention_masks3, dim = 0)
cb_labels3 = torch.tensor(coh_stat3)

#CodeBERT
#Splitting Dataset into training and validation set, and loading into batches.
cb_val_ratio3 = 0.2
#Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
cb_batch_size3 = 16

#Indices of the train and validation splits stratified by labels.
#random_state is fixed so the split (and therefore the reported validation
#metrics) is reproducible across kernel restarts.
cb_train_idx3, cb_val_idx3 = train_test_split(
    np.arange(len(cb_labels3)),
    test_size = cb_val_ratio3,
    shuffle = True,
    stratify = cb_labels3,
    random_state = 42)

#Train and validation sets: (token ids, attention mask, label) triples
cb_train_set3 = TensorDataset(cb_token_id3[cb_train_idx3],
                          cb_attention_masks3[cb_train_idx3],
                          cb_labels3[cb_train_idx3])

cb_val_set3 = TensorDataset(cb_token_id3[cb_val_idx3],
                        cb_attention_masks3[cb_val_idx3],
                        cb_labels3[cb_val_idx3])

#Prepare DataLoader: random order for training, fixed order for validation
cb_train_dataloader3 = DataLoader(
            cb_train_set3,
            sampler = RandomSampler(cb_train_set3),
            batch_size = cb_batch_size3)

cb_validation_dataloader3 = DataLoader(
            cb_val_set3,
            sampler = SequentialSampler(cb_val_set3),
            batch_size = cb_batch_size3)

In [None]:
#CodeBERT
#Model Object Initiation
#Load a sequence-classification model from the "microsoft/codebert-base" checkpoint.
#num_labels is stated explicitly so the head size matches the 2-label BERT setup
#rather than relying on the checkpoint's default configuration.
cb_model3 = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels = 2)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
cb_optimizer3 = torch.optim.AdamW(cb_model3.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Place the model on the same `device` the batches are moved to below, instead of
#hard-coding .cuda(), so the model and its inputs always share a device.
cb_model3.to(device)

#Model Training and Evaluation
#Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
cb_epochs3 = 3

for _ in trange(cb_epochs3, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    cb_model3.train()

    # Tracking variables (reset at the start of every epoch)
    cb_tr_loss3 = 0
    nb_tr_examples3, nb_tr_steps3 = 0, 0

    for step, batch in enumerate(cb_train_dataloader3):
        batch = tuple(t.to(device) for t in batch)
        # NOTE: this rebinds cb_labels3 (the full label tensor built in the
        # preprocessing cell) to the labels of the current batch.
        cb_input_ids3, cb_input_mask3, cb_labels3 = batch
        cb_optimizer3.zero_grad()
        # Forward pass (passing labels makes the model output carry a loss)
        cb_train_output3 = cb_model3(cb_input_ids3,
                             token_type_ids = None,
                             attention_mask = cb_input_mask3,
                             labels = cb_labels3)
        # Backward pass
        cb_train_output3.loss.backward()
        cb_optimizer3.step()
        # Update tracking variables
        cb_tr_loss3 += cb_train_output3.loss.item()
        nb_tr_examples3 += cb_input_ids3.size(0)
        nb_tr_steps3 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    cb_model3.eval()

    # Tracking variables: one entry per validation batch
    cb_val_accuracy3, cb_val_precision3, cb_val_recall3, cb_val_specificity3, cf1_val_score3 = [], [], [], [], []

    for batch in cb_validation_dataloader3:
        batch = tuple(t.to(device) for t in batch)
        cb_input_ids3, cb_input_mask3, cb_labels3 = batch
        with torch.no_grad():
            # Forward pass (no labels: only the logits are needed)
            cb_eval_output3 = cb_model3(cb_input_ids3,
                                token_type_ids = None,
                                attention_mask = cb_input_mask3)
        logits3 = cb_eval_output3.logits.detach().cpu().numpy()
        label_ids3 = cb_labels3.to('cpu').numpy()
        # Calculate validation metrics for this batch.
        # b_metrics is defined elsewhere in the notebook; it appears to signal an
        # undefined metric with the string 'nan' -- TODO confirm against its definition.
        cb_accuracy3, cb_precision3, cb_recall3, cb_specificity3, cf1_score3 = b_metrics(logits3, label_ids3)
        cb_val_accuracy3.append(cb_accuracy3)
        # Update precision only when (tp + fp) !=0; ignore nan
        if cb_precision3 != 'nan': cb_val_precision3.append(cb_precision3)
        # Update recall only when (tp + fn) !=0; ignore nan
        if cb_recall3 != 'nan': cb_val_recall3.append(cb_recall3)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if cb_specificity3 != 'nan': cb_val_specificity3.append(cb_specificity3)
        # Update f1_score only when it is not 'nan'
        if cf1_score3 != 'nan': cf1_val_score3.append(cf1_score3)

    # Per-epoch report: each metric is the unweighted mean of its per-batch values
    print('\n\t - Train loss: {:.4f}'.format(cb_tr_loss3 / nb_tr_steps3))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(cb_val_accuracy3)/len(cb_val_accuracy3)))
    print('\t - Validation Precision: {:.4f}'.format(sum(cb_val_precision3)/len(cb_val_precision3)) if len(cb_val_precision3)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(cb_val_recall3)/len(cb_val_recall3)) if len(cb_val_recall3)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(cb_val_specificity3)/len(cb_val_specificity3)) if len(cb_val_specificity3)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(cf1_val_score3)/len(cf1_val_score3)) if len(cf1_val_score3)>0 else '\t - F1 Score: NaN')

### JFreeChart071

#### Load Dataset, Data Preprocessing and Model Training and Evaluation

In [None]:
%%time
# Dataset import: read the raw JFreeChart071 CSV and discard the two index-like columns.
codes4 = pd.read_csv(
    "/kaggle/input/sourcesniffer/SourceSniffer/JFreeChart071/JFreeChart071_Raw_Data_2.csv",
    encoding='latin-1',
).drop(columns=['Unnamed: 0', 'Index'])

# Cleaning: lower-case each code/comment entry, then run it through clean_code.
codes4['Code and Comment'] = codes4['Code and Comment'].str.lower().apply(clean_code)

# Keep only the text and label columns of the rows whose Label is present.
coh_data4 = codes4.loc[codes4['Label'].notnull(), ["Code and Comment", "Label"]]

# Label encoding: integer codes for the labels, plus the matching text series.
le = preprocessing.LabelEncoder()
code_comm4 = coh_data4['Code and Comment']
coh_stat4 = le.fit_transform(coh_data4['Label'])

# Print the class counts of the encoded and the original labels side by side,
# so each numeric code can be matched to its category.
print(collections.Counter(coh_stat4), collections.Counter(coh_data4['Label']))

#### BERT

In [None]:
%%time
#BERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
# Every sample is encoded with bert_preprocessing (defined earlier in the notebook);
# the per-sample 'input_ids' and 'attention_mask' tensors are collected in lists and
# then concatenated along dim 0 into one tensor per field.

b_token_id4 = []
b_attention_masks4 = []

for sample in code_comm4:
    b_encoding_dict4 = bert_preprocessing(sample, b_tokenizer)
    b_token_id4.append(b_encoding_dict4['input_ids'])
    b_attention_masks4.append(b_encoding_dict4['attention_mask'])

b_token_id4 = torch.cat(b_token_id4, dim = 0)
b_attention_masks4 = torch.cat(b_attention_masks4, dim = 0)
b_labels4 = torch.tensor(coh_stat4)

#BERT
#Splitting Dataset into training and validation set, and loading into batches.
b_val_ratio4 = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
b_batch_size4 = 16

# Indices of the train and validation splits stratified by labels.
# random_state is fixed so the split (and therefore the reported validation
# metrics) is reproducible across kernel restarts.
b_train_idx4, b_val_idx4 = train_test_split(
    np.arange(len(b_labels4)),
    test_size = b_val_ratio4,
    shuffle = True,
    stratify = b_labels4,
    random_state = 42)

# Train and validation sets: (token ids, attention mask, label) triples
b_train_set4 = TensorDataset(b_token_id4[b_train_idx4],
                          b_attention_masks4[b_train_idx4],
                          b_labels4[b_train_idx4])

b_val_set4 = TensorDataset(b_token_id4[b_val_idx4],
                        b_attention_masks4[b_val_idx4],
                        b_labels4[b_val_idx4])

# Prepare DataLoader: random order for training, fixed order for validation
b_train_dataloader4 = DataLoader(
            b_train_set4,
            sampler = RandomSampler(b_train_set4),
            batch_size = b_batch_size4)

b_validation_dataloader4 = DataLoader(
            b_val_set4,
            sampler = SequentialSampler(b_val_set4),
            batch_size = b_batch_size4)

In [None]:
#BERT
#Model Object Initiation
#Load BertForSequenceClassification from the 'bert-base-uncased' checkpoint with a 2-label head
b_model4 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
b_optimizer4 = torch.optim.AdamW(b_model4.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Place the model on the same `device` the batches are moved to below, instead of
#hard-coding .cuda(), so the model and its inputs always share a device.
b_model4.to(device)

#Model Training and Evaluation
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
b_epochs4 = 3

for _ in trange(b_epochs4, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    b_model4.train()

    # Tracking variables (reset at the start of every epoch)
    b_tr_loss4 = 0
    nb_tr_examples4, nb_tr_steps4 = 0, 0

    for step, batch in enumerate(b_train_dataloader4):
        batch = tuple(t.to(device) for t in batch)
        # NOTE: this rebinds b_labels4 (the full label tensor built in the
        # preprocessing cell) to the labels of the current batch.
        b_input_ids4, b_input_mask4, b_labels4 = batch
        b_optimizer4.zero_grad()
        # Forward pass (passing labels makes the model output carry a loss)
        b_train_output4 = b_model4(b_input_ids4,
                             token_type_ids = None,
                             attention_mask = b_input_mask4,
                             labels = b_labels4)
        # Backward pass
        b_train_output4.loss.backward()
        b_optimizer4.step()
        # Update tracking variables
        b_tr_loss4 += b_train_output4.loss.item()
        nb_tr_examples4 += b_input_ids4.size(0)
        nb_tr_steps4 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    b_model4.eval()

    # Tracking variables: one entry per validation batch
    b_val_accuracy4, b_val_precision4, b_val_recall4, b_val_specificity4, f1_val_score4 = [], [], [], [], []

    for batch in b_validation_dataloader4:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids4, b_input_mask4, b_labels4 = batch
        with torch.no_grad():
            # Forward pass (no labels: only the logits are needed)
            b_eval_output4 = b_model4(b_input_ids4,
                                token_type_ids = None,
                                attention_mask = b_input_mask4)
        logits4 = b_eval_output4.logits.detach().cpu().numpy()
        label_ids4 = b_labels4.to('cpu').numpy()
        # Calculate validation metrics for this batch.
        # b_metrics is defined elsewhere in the notebook; it appears to signal an
        # undefined metric with the string 'nan' -- TODO confirm against its definition.
        b_accuracy4, b_precision4, b_recall4, b_specificity4, f1_score4 = b_metrics(logits4, label_ids4)
        b_val_accuracy4.append(b_accuracy4)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision4 != 'nan': b_val_precision4.append(b_precision4)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall4 != 'nan': b_val_recall4.append(b_recall4)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity4 != 'nan': b_val_specificity4.append(b_specificity4)
        # Update f1_score only when it is not 'nan'
        if f1_score4 != 'nan': f1_val_score4.append(f1_score4)

    # Per-epoch report: each metric is the unweighted mean of its per-batch values
    print('\n\t - Train loss: {:.4f}'.format(b_tr_loss4 / nb_tr_steps4))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(b_val_accuracy4)/len(b_val_accuracy4)))
    print('\t - Validation Precision: {:.4f}'.format(sum(b_val_precision4)/len(b_val_precision4)) if len(b_val_precision4)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(b_val_recall4)/len(b_val_recall4)) if len(b_val_recall4)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(b_val_specificity4)/len(b_val_specificity4)) if len(b_val_specificity4)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(f1_val_score4)/len(f1_val_score4)) if len(f1_val_score4)>0 else '\t - F1 Score: NaN')

#### CodeBERT

In [None]:
%%time
#CodeBERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
# Every sample is encoded with codebert_preprocessing (defined earlier in the notebook);
# the per-sample 'input_ids' and 'attention_mask' tensors are collected in lists and
# then concatenated along dim 0 into one tensor per field.

cb_token_id4 = []
cb_attention_masks4 = []

for sample in code_comm4:
    cb_encoding_dict4 = codebert_preprocessing(sample, cb_tokenizer)
    cb_token_id4.append(cb_encoding_dict4['input_ids'])
    cb_attention_masks4.append(cb_encoding_dict4['attention_mask'])

cb_token_id4 = torch.cat(cb_token_id4, dim = 0)
cb_attention_masks4 = torch.cat(cb_attention_masks4, dim = 0)
cb_labels4 = torch.tensor(coh_stat4)

#CodeBERT
#Splitting Dataset into training and validation set, and loading into batches.
cb_val_ratio4 = 0.2
#Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
cb_batch_size4 = 16

#Indices of the train and validation splits stratified by labels.
#random_state is fixed so the split (and therefore the reported validation
#metrics) is reproducible across kernel restarts.
cb_train_idx4, cb_val_idx4 = train_test_split(
    np.arange(len(cb_labels4)),
    test_size = cb_val_ratio4,
    shuffle = True,
    stratify = cb_labels4,
    random_state = 42)

#Train and validation sets: (token ids, attention mask, label) triples
cb_train_set4 = TensorDataset(cb_token_id4[cb_train_idx4],
                          cb_attention_masks4[cb_train_idx4],
                          cb_labels4[cb_train_idx4])

cb_val_set4 = TensorDataset(cb_token_id4[cb_val_idx4],
                        cb_attention_masks4[cb_val_idx4],
                        cb_labels4[cb_val_idx4])

#Prepare DataLoader: random order for training, fixed order for validation
cb_train_dataloader4 = DataLoader(
            cb_train_set4,
            sampler = RandomSampler(cb_train_set4),
            batch_size = cb_batch_size4)

cb_validation_dataloader4 = DataLoader(
            cb_val_set4,
            sampler = SequentialSampler(cb_val_set4),
            batch_size = cb_batch_size4)

In [None]:
#CodeBERT
#Model Object Initiation
#Load a sequence-classification model from the "microsoft/codebert-base" checkpoint.
#num_labels is stated explicitly so the head size matches the 2-label BERT setup
#rather than relying on the checkpoint's default configuration.
cb_model4 = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels = 2)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
cb_optimizer4 = torch.optim.AdamW(cb_model4.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Place the model on the same `device` the batches are moved to below, instead of
#hard-coding .cuda(), so the model and its inputs always share a device.
cb_model4.to(device)

#Model Training and Evaluation
#Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
cb_epochs4 = 3

for _ in trange(cb_epochs4, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    cb_model4.train()

    # Tracking variables (reset at the start of every epoch)
    cb_tr_loss4 = 0
    nb_tr_examples4, nb_tr_steps4 = 0, 0

    for step, batch in enumerate(cb_train_dataloader4):
        batch = tuple(t.to(device) for t in batch)
        # NOTE: this rebinds cb_labels4 (the full label tensor built in the
        # preprocessing cell) to the labels of the current batch.
        cb_input_ids4, cb_input_mask4, cb_labels4 = batch
        cb_optimizer4.zero_grad()
        # Forward pass (passing labels makes the model output carry a loss)
        cb_train_output4 = cb_model4(cb_input_ids4,
                             token_type_ids = None,
                             attention_mask = cb_input_mask4,
                             labels = cb_labels4)
        # Backward pass
        cb_train_output4.loss.backward()
        cb_optimizer4.step()
        # Update tracking variables
        cb_tr_loss4 += cb_train_output4.loss.item()
        nb_tr_examples4 += cb_input_ids4.size(0)
        nb_tr_steps4 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    cb_model4.eval()

    # Tracking variables: one entry per validation batch
    cb_val_accuracy4, cb_val_precision4, cb_val_recall4, cb_val_specificity4, cf1_val_score4 = [], [], [], [], []

    for batch in cb_validation_dataloader4:
        batch = tuple(t.to(device) for t in batch)
        cb_input_ids4, cb_input_mask4, cb_labels4 = batch
        with torch.no_grad():
            # Forward pass (no labels: only the logits are needed)
            cb_eval_output4 = cb_model4(cb_input_ids4,
                                token_type_ids = None,
                                attention_mask = cb_input_mask4)
        logits4 = cb_eval_output4.logits.detach().cpu().numpy()
        label_ids4 = cb_labels4.to('cpu').numpy()
        # Calculate validation metrics for this batch.
        # b_metrics is defined elsewhere in the notebook; it appears to signal an
        # undefined metric with the string 'nan' -- TODO confirm against its definition.
        cb_accuracy4, cb_precision4, cb_recall4, cb_specificity4, cf1_score4 = b_metrics(logits4, label_ids4)
        cb_val_accuracy4.append(cb_accuracy4)
        # Update precision only when (tp + fp) !=0; ignore nan
        if cb_precision4 != 'nan': cb_val_precision4.append(cb_precision4)
        # Update recall only when (tp + fn) !=0; ignore nan
        if cb_recall4 != 'nan': cb_val_recall4.append(cb_recall4)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if cb_specificity4 != 'nan': cb_val_specificity4.append(cb_specificity4)
        # Update f1_score only when it is not 'nan'
        if cf1_score4 != 'nan': cf1_val_score4.append(cf1_score4)

    # Per-epoch report: each metric is the unweighted mean of its per-batch values
    print('\n\t - Train loss: {:.4f}'.format(cb_tr_loss4 / nb_tr_steps4))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(cb_val_accuracy4)/len(cb_val_accuracy4)))
    print('\t - Validation Precision: {:.4f}'.format(sum(cb_val_precision4)/len(cb_val_precision4)) if len(cb_val_precision4)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(cb_val_recall4)/len(cb_val_recall4)) if len(cb_val_recall4)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(cb_val_specificity4)/len(cb_val_specificity4)) if len(cb_val_specificity4)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(cf1_val_score4)/len(cf1_val_score4)) if len(cf1_val_score4)>0 else '\t - F1 Score: NaN')

### JHotDraw741

#### Load Dataset, Data Preprocessing and Model Training and Evaluation

In [None]:
%%time
# Dataset import: read the raw JHotDraw741 CSV and discard the two index-like columns.
codes5 = pd.read_csv(
    "/kaggle/input/sourcesniffer/SourceSniffer/JHotDraw741/JHotDraw741_Raw_Data_2.csv",
    encoding='latin-1',
).drop(columns=['Unnamed: 0', 'Index'])

# Cleaning: lower-case each code/comment entry, then run it through clean_code.
codes5['Code and Comment'] = codes5['Code and Comment'].str.lower().apply(clean_code)

# Keep only the text and label columns of the rows whose Label is present.
coh_data5 = codes5.loc[codes5['Label'].notnull(), ["Code and Comment", "Label"]]

# Label encoding: integer codes for the labels, plus the matching text series.
le = preprocessing.LabelEncoder()
code_comm5 = coh_data5['Code and Comment']
coh_stat5 = le.fit_transform(coh_data5['Label'])

# Print the class counts of the encoded and the original labels side by side,
# so each numeric code can be matched to its category.
print(collections.Counter(coh_stat5), collections.Counter(coh_data5['Label']))


#### BERT

In [None]:
%%time
#BERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
# Every sample is encoded with bert_preprocessing (defined earlier in the notebook);
# the per-sample 'input_ids' and 'attention_mask' tensors are collected in lists and
# then concatenated along dim 0 into one tensor per field.

b_token_id5 = []
b_attention_masks5 = []

for sample in code_comm5:
    b_encoding_dict5 = bert_preprocessing(sample, b_tokenizer)
    b_token_id5.append(b_encoding_dict5['input_ids'])
    b_attention_masks5.append(b_encoding_dict5['attention_mask'])

b_token_id5 = torch.cat(b_token_id5, dim = 0)
b_attention_masks5 = torch.cat(b_attention_masks5, dim = 0)
b_labels5 = torch.tensor(coh_stat5)

#BERT
#Splitting Dataset into training and validation set, and loading into batches.
b_val_ratio5 = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
b_batch_size5 = 16

# Indices of the train and validation splits stratified by labels.
# random_state is fixed so the split (and therefore the reported validation
# metrics) is reproducible across kernel restarts.
b_train_idx5, b_val_idx5 = train_test_split(
    np.arange(len(b_labels5)),
    test_size = b_val_ratio5,
    shuffle = True,
    stratify = b_labels5,
    random_state = 42)

# Train and validation sets: (token ids, attention mask, label) triples
b_train_set5 = TensorDataset(b_token_id5[b_train_idx5],
                          b_attention_masks5[b_train_idx5],
                          b_labels5[b_train_idx5])

b_val_set5 = TensorDataset(b_token_id5[b_val_idx5],
                        b_attention_masks5[b_val_idx5],
                        b_labels5[b_val_idx5])

# Prepare DataLoader: random order for training, fixed order for validation
b_train_dataloader5 = DataLoader(
            b_train_set5,
            sampler = RandomSampler(b_train_set5),
            batch_size = b_batch_size5)

b_validation_dataloader5 = DataLoader(
            b_val_set5,
            sampler = SequentialSampler(b_val_set5),
            batch_size = b_batch_size5)

In [None]:
#BERT
#Model Object Initiation
#Load BertForSequenceClassification from the 'bert-base-uncased' checkpoint with a 2-label head
b_model5 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
b_optimizer5 = torch.optim.AdamW(b_model5.parameters(),
                              lr = 5e-5,
                              eps = 1e-08)

#Place the model on the same `device` the batches are moved to below, instead of
#hard-coding .cuda(), so the model and its inputs always share a device.
b_model5.to(device)

#Model Training and Evaluation
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
b_epochs5 = 3

for _ in trange(b_epochs5, desc = 'Epoch'):

    # ========== Training ==========
    # Set model to training mode
    b_model5.train()

    # Tracking variables (reset at the start of every epoch)
    b_tr_loss5 = 0
    nb_tr_examples5, nb_tr_steps5 = 0, 0

    for step, batch in enumerate(b_train_dataloader5):
        batch = tuple(t.to(device) for t in batch)
        # NOTE: this rebinds b_labels5 (the full label tensor built in the
        # preprocessing cell) to the labels of the current batch.
        b_input_ids5, b_input_mask5, b_labels5 = batch
        b_optimizer5.zero_grad()
        # Forward pass (passing labels makes the model output carry a loss)
        b_train_output5 = b_model5(b_input_ids5,
                             token_type_ids = None,
                             attention_mask = b_input_mask5,
                             labels = b_labels5)
        # Backward pass
        b_train_output5.loss.backward()
        b_optimizer5.step()
        # Update tracking variables
        b_tr_loss5 += b_train_output5.loss.item()
        nb_tr_examples5 += b_input_ids5.size(0)
        nb_tr_steps5 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    b_model5.eval()

    # Tracking variables: one entry per validation batch
    b_val_accuracy5, b_val_precision5, b_val_recall5, b_val_specificity5, f1_val_score5 = [], [], [], [], []

    for batch in b_validation_dataloader5:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids5, b_input_mask5, b_labels5 = batch
        with torch.no_grad():
            # Forward pass (no labels: only the logits are needed)
            b_eval_output5 = b_model5(b_input_ids5,
                                token_type_ids = None,
                                attention_mask = b_input_mask5)
        logits5 = b_eval_output5.logits.detach().cpu().numpy()
        label_ids5 = b_labels5.to('cpu').numpy()
        # Calculate validation metrics for this batch.
        # b_metrics is defined elsewhere in the notebook; it appears to signal an
        # undefined metric with the string 'nan' -- TODO confirm against its definition.
        b_accuracy5, b_precision5, b_recall5, b_specificity5, f1_score5 = b_metrics(logits5, label_ids5)
        b_val_accuracy5.append(b_accuracy5)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision5 != 'nan': b_val_precision5.append(b_precision5)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall5 != 'nan': b_val_recall5.append(b_recall5)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity5 != 'nan': b_val_specificity5.append(b_specificity5)
        # Update f1_score only when it is not 'nan'
        if f1_score5 != 'nan': f1_val_score5.append(f1_score5)

    # Per-epoch report: each metric is the unweighted mean of its per-batch values
    print('\n\t - Train loss: {:.4f}'.format(b_tr_loss5 / nb_tr_steps5))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(b_val_accuracy5)/len(b_val_accuracy5)))
    print('\t - Validation Precision: {:.4f}'.format(sum(b_val_precision5)/len(b_val_precision5)) if len(b_val_precision5)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(b_val_recall5)/len(b_val_recall5)) if len(b_val_recall5)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(b_val_specificity5)/len(b_val_specificity5)) if len(b_val_specificity5)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(f1_val_score5)/len(f1_val_score5)) if len(f1_val_score5)>0 else '\t - F1 Score: NaN')

#### CodeBERT

In [None]:
%%time
#CodeBERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)
# Every sample is encoded with codebert_preprocessing (defined earlier in the notebook);
# the per-sample 'input_ids' and 'attention_mask' tensors are collected in lists and
# then concatenated along dim 0 into one tensor per field.

cb_token_id5 = []
cb_attention_masks5 = []

for sample in code_comm5:
    cb_encoding_dict5 = codebert_preprocessing(sample, cb_tokenizer)
    cb_token_id5.append(cb_encoding_dict5['input_ids'])
    cb_attention_masks5.append(cb_encoding_dict5['attention_mask'])

cb_token_id5 = torch.cat(cb_token_id5, dim = 0)
cb_attention_masks5 = torch.cat(cb_attention_masks5, dim = 0)
cb_labels5 = torch.tensor(coh_stat5)

#CodeBERT
#Splitting Dataset into training and validation set, and loading into batches.
cb_val_ratio5 = 0.2
#Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
cb_batch_size5 = 16

#Indices of the train and validation splits stratified by labels.
#random_state is fixed so the split (and therefore the reported validation
#metrics) is reproducible across kernel restarts.
cb_train_idx5, cb_val_idx5 = train_test_split(
    np.arange(len(cb_labels5)),
    test_size = cb_val_ratio5,
    shuffle = True,
    stratify = cb_labels5,
    random_state = 42)

#Train and validation sets: (token ids, attention mask, label) triples
cb_train_set5 = TensorDataset(cb_token_id5[cb_train_idx5],
                          cb_attention_masks5[cb_train_idx5],
                          cb_labels5[cb_train_idx5])

cb_val_set5 = TensorDataset(cb_token_id5[cb_val_idx5],
                        cb_attention_masks5[cb_val_idx5],
                        cb_labels5[cb_val_idx5])

#Prepare DataLoader: random order for training, fixed order for validation
cb_train_dataloader5 = DataLoader(
            cb_train_set5,
            sampler = RandomSampler(cb_train_set5),
            batch_size = cb_batch_size5)

cb_validation_dataloader5 = DataLoader(
            cb_val_set5,
            sampler = SequentialSampler(cb_val_set5),
            batch_size = cb_batch_size5)

In [None]:
#CodeBERT
#Model Object Initiation
#Load the RobertaForSequenceClassification model
#num_labels is spelled out (as in the BERT cells) so the size of the
#classification head is visible here instead of coming from the checkpoint config.
cb_model5 = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels = 2)

#Run the model on the same device the batches are sent to below (t.to(device)).
#Using .to(device) instead of .cuda() keeps model and data on one device even
#when no GPU is available. The move happens before the optimizer is created.
cb_model5.to(device)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
cb_optimizer5 = torch.optim.AdamW(cb_model5.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08)

#Model Training and Evaluation
#Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
cb_epochs5 = 3

for _ in trange(cb_epochs5, desc = 'Epoch'):
    
    # ========== Training ==========
    # Set model to training mode
    cb_model5.train()
    
    # Tracking variables (reset every epoch)
    cb_tr_loss5 = 0
    nb_tr_examples5, nb_tr_steps5 = 0, 0

    for batch in cb_train_dataloader5:
        batch = tuple(t.to(device) for t in batch)
        # The batch labels get their own name so the full label tensor
        # cb_labels5 built during preprocessing is not overwritten.
        cb_input_ids5, cb_input_mask5, cb_batch_labels5 = batch
        cb_optimizer5.zero_grad()
        # Forward pass (passing labels makes the model return a loss)
        cb_train_output5 = cb_model5(cb_input_ids5, 
                             token_type_ids = None, 
                             attention_mask = cb_input_mask5, 
                             labels = cb_batch_labels5)
        # Backward pass
        cb_train_output5.loss.backward()
        cb_optimizer5.step()
        # Update tracking variables
        cb_tr_loss5 += cb_train_output5.loss.item()
        nb_tr_examples5 += cb_input_ids5.size(0)
        nb_tr_steps5 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    cb_model5.eval()

    # Per-batch metric values collected over the validation set.
    # NOTE(review): precision/recall/specificity/F1 are computed per batch and then
    # averaged over batches, which is not the same as computing them once over the
    # whole validation set - confirm this is the intended way to report them.
    cb_val_accuracy5, cb_val_precision5, cb_val_recall5, cb_val_specificity5, cf1_val_score5 = [], [], [], [], []

    for batch in cb_validation_dataloader5:
        batch = tuple(t.to(device) for t in batch)
        cb_input_ids5, cb_input_mask5, cb_batch_labels5 = batch
        with torch.no_grad():
          # Forward pass (no labels: only the logits are needed)
          cb_eval_output5 = cb_model5(cb_input_ids5, 
                              token_type_ids = None, 
                              attention_mask = cb_input_mask5)
        logits5 = cb_eval_output5.logits.detach().cpu().numpy()
        label_ids5 = cb_batch_labels5.to('cpu').numpy()
        # Calculate validation metrics
        cb_accuracy5, cb_precision5, cb_recall5, cb_specificity5, cf1_score5 = b_metrics(logits5, label_ids5)
        cb_val_accuracy5.append(cb_accuracy5)
        # Update precision only when (tp + fp) !=0; ignore nan
        if cb_precision5 != 'nan': cb_val_precision5.append(cb_precision5)
        # Update recall only when (tp + fn) !=0; ignore nan
        if cb_recall5 != 'nan': cb_val_recall5.append(cb_recall5)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if cb_specificity5 != 'nan': cb_val_specificity5.append(cb_specificity5)
        # Update f1_score only when it is defined; ignore nan
        if cf1_score5 != 'nan': cf1_val_score5.append(cf1_score5)

    print('\n\t - Train loss: {:.4f}'.format(cb_tr_loss5 / nb_tr_steps5))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(cb_val_accuracy5)/len(cb_val_accuracy5)))
    print('\t - Validation Precision: {:.4f}'.format(sum(cb_val_precision5)/len(cb_val_precision5)) if len(cb_val_precision5)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(cb_val_recall5)/len(cb_val_recall5)) if len(cb_val_recall5)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(cb_val_specificity5)/len(cb_val_specificity5)) if len(cb_val_specificity5)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(cf1_val_score5)/len(cf1_val_score5)) if len(cf1_val_score5)>0 else '\t - F1 Score: NaN')

### All Datasets Combined

#### Load Dataset, Data Preprocessing and Model Training and Evaluation

In [None]:
%%time
#Dataset Importation 
files = ['Benchmark_Raw_Data_2.csv','CoffeeMaker_Raw_Data_2.csv','JFreeChart060_Raw_Data_2.csv',
           'JFreeChart071_Raw_Data_2.csv','JHotDraw741_Raw_Data_2.csv']
folders = ['Benchmark','CoffeeMaker','JFreeChart060','JFreeChart071','JHotDraw741']
parent_path = '/kaggle/input/sourcesniffer/SourceSniffer'

#Read every project's CSV and concatenate them in a single call (instead of
#growing the frame with one concat per file). ignore_index=True gives the
#combined frame a fresh 0..N-1 index rather than repeating each file's row labels.
codes6 = pd.concat(
    [pd.read_csv(parent_path+'/'+folder+'/'+file, encoding='latin-1')
     for folder, file in zip(folders, files)],
    axis=0,
    ignore_index=True)

#Drop the two index-like columns
codes6 = codes6.drop(columns=['Unnamed: 0','Index'])
#codes6.head(1)

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code
codes6['Code and Comment'] = codes6['Code and Comment'].str.lower()
codes6['Code and Comment'] = codes6['Code and Comment'].apply(clean_code)
#codes6.head(1)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept
coh_data6 = codes6[["Code and Comment","Label"]][~codes6['Label'].isnull()]
#coh_data6.shape

#Dataset Preprocessing (Label Encoding)
le = preprocessing.LabelEncoder()
coh_stat6 = le.fit_transform(coh_data6['Label'])
code_comm6 = coh_data6['Code and Comment']

#Identify which label numerical code correspond to which categories
#(compare the counts of the encoded values with the counts of the original labels)
print(collections.Counter(coh_stat6), collections.Counter(coh_data6['Label']))


#### BERT

In [None]:
%%time
#BERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)

#Encode every code-comment sample. Each encoding is indexed by 'input_ids' and
#'attention_mask'; the two entries are collected in parallel lists.
b_token_id6, b_attention_masks6 = [], []

for sample in code_comm6:
    b_encoding_dict6 = bert_preprocessing(sample, b_tokenizer)
    b_token_id6 += [b_encoding_dict6['input_ids']]
    b_attention_masks6 += [b_encoding_dict6['attention_mask']]

#Join the per-sample tensors along dim 0 and wrap the encoded labels in a tensor
#(presumably one row per sample after concatenation - TODO confirm against bert_preprocessing)
b_token_id6 = torch.cat(b_token_id6, dim=0)
b_attention_masks6 = torch.cat(b_attention_masks6, dim=0)
b_labels6 = torch.tensor(coh_stat6)

#BERT
#Hold-out fraction and batch size used for the split and the loaders below.
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
b_val_ratio6, b_batch_size6 = 0.2, 16

# Shuffled row indices for the train and validation splits, stratified by label
b_train_idx6, b_val_idx6 = train_test_split(
    np.arange(len(b_labels6)),
    stratify=b_labels6,
    shuffle=True,
    test_size=b_val_ratio6)

# Train and validation sets: (token ids, attention masks, labels) restricted to each split
b_train_set6, b_val_set6 = (
    TensorDataset(b_token_id6[split_idx],
                  b_attention_masks6[split_idx],
                  b_labels6[split_idx])
    for split_idx in (b_train_idx6, b_val_idx6))

# DataLoaders: random sampling for training, sequential sampling for validation
b_train_dataloader6 = DataLoader(
    b_train_set6,
    batch_size=b_batch_size6,
    sampler=RandomSampler(b_train_set6))

b_validation_dataloader6 = DataLoader(
    b_val_set6,
    batch_size=b_batch_size6,
    sampler=SequentialSampler(b_val_set6))

In [None]:
%%time
#BERT
#Model Object Initiation
#Load the BertForSequenceClassification model
b_model6 = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,)

#Run the model on the same device the batches are sent to below (t.to(device)).
#Using .to(device) instead of .cuda() keeps model and data on one device even
#when no GPU is available. The move happens before the optimizer is created.
b_model6.to(device)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
b_optimizer6 = torch.optim.AdamW(b_model6.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08)

#Model Training and Evaluation
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
b_epochs6 = 3

for _ in trange(b_epochs6, desc = 'Epoch'):
    
    # ========== Training ==========
    # Set model to training mode
    b_model6.train()
    
    # Tracking variables (reset every epoch)
    b_tr_loss6 = 0
    nb_tr_examples6, nb_tr_steps6 = 0, 0

    for batch in b_train_dataloader6:
        batch = tuple(t.to(device) for t in batch)
        # The batch labels get their own name so the full label tensor
        # b_labels6 built during preprocessing is not overwritten.
        b_input_ids6, b_input_mask6, b_batch_labels6 = batch
        b_optimizer6.zero_grad()
        # Forward pass (passing labels makes the model return a loss)
        b_train_output6 = b_model6(b_input_ids6, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask6, 
                             labels = b_batch_labels6)
        # Backward pass
        b_train_output6.loss.backward()
        b_optimizer6.step()
        # Update tracking variables
        b_tr_loss6 += b_train_output6.loss.item()
        nb_tr_examples6 += b_input_ids6.size(0)
        nb_tr_steps6 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    b_model6.eval()

    # Per-batch metric values collected over the validation set.
    # NOTE(review): precision/recall/specificity/F1 are computed per batch and then
    # averaged over batches, which is not the same as computing them once over the
    # whole validation set - confirm this is the intended way to report them.
    b_val_accuracy6, b_val_precision6, b_val_recall6, b_val_specificity6, f1_val_score6 = [], [], [], [], []

    for batch in b_validation_dataloader6:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids6, b_input_mask6, b_batch_labels6 = batch
        with torch.no_grad():
          # Forward pass (no labels: only the logits are needed)
          b_eval_output6 = b_model6(b_input_ids6, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask6)
        logits6 = b_eval_output6.logits.detach().cpu().numpy()
        label_ids6 = b_batch_labels6.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy6, b_precision6, b_recall6, b_specificity6, f1_score6 = b_metrics(logits6, label_ids6)
        b_val_accuracy6.append(b_accuracy6)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision6 != 'nan': b_val_precision6.append(b_precision6)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall6 != 'nan': b_val_recall6.append(b_recall6)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity6 != 'nan': b_val_specificity6.append(b_specificity6)
        # Update f1_score only when it is defined; ignore nan
        if f1_score6 != 'nan': f1_val_score6.append(f1_score6)

    print('\n\t - Train loss: {:.4f}'.format(b_tr_loss6 / nb_tr_steps6))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(b_val_accuracy6)/len(b_val_accuracy6)))
    print('\t - Validation Precision: {:.4f}'.format(sum(b_val_precision6)/len(b_val_precision6)) if len(b_val_precision6)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(b_val_recall6)/len(b_val_recall6)) if len(b_val_recall6)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(b_val_specificity6)/len(b_val_specificity6)) if len(b_val_specificity6)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(f1_val_score6)/len(f1_val_score6)) if len(f1_val_score6)>0 else '\t - F1 Score: NaN')

#### CodeBERT

In [None]:
%%time
#CodeBERT
#Dataset Preprocessing (Code-Comment Vector-Encoding)

#Encode every code-comment sample. Each encoding is indexed by 'input_ids' and
#'attention_mask'; the two entries are collected in parallel lists.
cb_token_id6, cb_attention_masks6 = [], []

for sample in code_comm6:
    cb_encoding_dict6 = codebert_preprocessing(sample, cb_tokenizer)
    cb_token_id6 += [cb_encoding_dict6['input_ids']]
    cb_attention_masks6 += [cb_encoding_dict6['attention_mask']]

#Join the per-sample tensors along dim 0 and wrap the encoded labels in a tensor
#(presumably one row per sample after concatenation - TODO confirm against codebert_preprocessing)
cb_token_id6 = torch.cat(cb_token_id6, dim=0)
cb_attention_masks6 = torch.cat(cb_attention_masks6, dim=0)
cb_labels6 = torch.tensor(coh_stat6)

#CodeBERT
#Hold-out fraction and batch size used for the split and the loaders below.
#Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
cb_val_ratio6, cb_batch_size6 = 0.2, 16

#Shuffled row indices for the train and validation splits, stratified by label
cb_train_idx6, cb_val_idx6 = train_test_split(
    np.arange(len(cb_labels6)),
    stratify=cb_labels6,
    shuffle=True,
    test_size=cb_val_ratio6)

#Train and validation sets: (token ids, attention masks, labels) restricted to each split
cb_train_set6, cb_val_set6 = (
    TensorDataset(cb_token_id6[split_idx],
                  cb_attention_masks6[split_idx],
                  cb_labels6[split_idx])
    for split_idx in (cb_train_idx6, cb_val_idx6))

#DataLoaders: random sampling for training, sequential sampling for validation
cb_train_dataloader6 = DataLoader(
    cb_train_set6,
    batch_size=cb_batch_size6,
    sampler=RandomSampler(cb_train_set6))

cb_validation_dataloader6 = DataLoader(
    cb_val_set6,
    batch_size=cb_batch_size6,
    sampler=SequentialSampler(cb_val_set6))

In [None]:
%%time
#CodeBERT
#Model Object Initiation
#Load the RobertaForSequenceClassification model
#num_labels is spelled out (as in the BERT cell above) so the size of the
#classification head is visible here instead of coming from the checkpoint config.
cb_model6 = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels = 2)

#Run the model on the same device the batches are sent to below (t.to(device)).
#Using .to(device) instead of .cuda() keeps model and data on one device even
#when no GPU is available. The move happens before the optimizer is created.
cb_model6.to(device)

#Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
cb_optimizer6 = torch.optim.AdamW(cb_model6.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08)

#Model Training and Evaluation
#Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
cb_epochs6 = 3

for _ in trange(cb_epochs6, desc = 'Epoch'):
    
    # ========== Training ==========
    # Set model to training mode
    cb_model6.train()
    
    # Tracking variables (reset every epoch)
    cb_tr_loss6 = 0
    nb_tr_examples6, nb_tr_steps6 = 0, 0

    for batch in cb_train_dataloader6:
        batch = tuple(t.to(device) for t in batch)
        # The batch labels get their own name so the full label tensor
        # cb_labels6 built during preprocessing is not overwritten.
        cb_input_ids6, cb_input_mask6, cb_batch_labels6 = batch
        cb_optimizer6.zero_grad()
        # Forward pass (passing labels makes the model return a loss)
        cb_train_output6 = cb_model6(cb_input_ids6, 
                             token_type_ids = None, 
                             attention_mask = cb_input_mask6, 
                             labels = cb_batch_labels6)
        # Backward pass
        cb_train_output6.loss.backward()
        cb_optimizer6.step()
        # Update tracking variables
        cb_tr_loss6 += cb_train_output6.loss.item()
        nb_tr_examples6 += cb_input_ids6.size(0)
        nb_tr_steps6 += 1

    # ========== Validation ==========
    # Set model to evaluation mode
    cb_model6.eval()

    # Per-batch metric values collected over the validation set.
    # NOTE(review): precision/recall/specificity/F1 are computed per batch and then
    # averaged over batches, which is not the same as computing them once over the
    # whole validation set - confirm this is the intended way to report them.
    cb_val_accuracy6, cb_val_precision6, cb_val_recall6, cb_val_specificity6, cf1_val_score6 = [], [], [], [], []

    for batch in cb_validation_dataloader6:
        batch = tuple(t.to(device) for t in batch)
        cb_input_ids6, cb_input_mask6, cb_batch_labels6 = batch
        with torch.no_grad():
          # Forward pass (no labels: only the logits are needed)
          cb_eval_output6 = cb_model6(cb_input_ids6, 
                              token_type_ids = None, 
                              attention_mask = cb_input_mask6)
        logits6 = cb_eval_output6.logits.detach().cpu().numpy()
        label_ids6 = cb_batch_labels6.to('cpu').numpy()
        # Calculate validation metrics
        cb_accuracy6, cb_precision6, cb_recall6, cb_specificity6, cf1_score6 = b_metrics(logits6, label_ids6)
        cb_val_accuracy6.append(cb_accuracy6)
        # Update precision only when (tp + fp) !=0; ignore nan
        if cb_precision6 != 'nan': cb_val_precision6.append(cb_precision6)
        # Update recall only when (tp + fn) !=0; ignore nan
        if cb_recall6 != 'nan': cb_val_recall6.append(cb_recall6)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if cb_specificity6 != 'nan': cb_val_specificity6.append(cb_specificity6)
        # Update f1_score only when it is defined; ignore nan
        if cf1_score6 != 'nan': cf1_val_score6.append(cf1_score6)

    print('\n\t - Train loss: {:.4f}'.format(cb_tr_loss6 / nb_tr_steps6))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(cb_val_accuracy6)/len(cb_val_accuracy6)))
    print('\t - Validation Precision: {:.4f}'.format(sum(cb_val_precision6)/len(cb_val_precision6)) if len(cb_val_precision6)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(cb_val_recall6)/len(cb_val_recall6)) if len(cb_val_recall6)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}'.format(sum(cb_val_specificity6)/len(cb_val_specificity6)) if len(cb_val_specificity6)>0 else '\t - Validation Specificity: NaN')
    print('\t - F1 Score: {:.4f}\n'.format(sum(cf1_val_score6)/len(cf1_val_score6)) if len(cf1_val_score6)>0 else '\t - F1 Score: NaN')

---

## Baseline (Corazza et al.)

### SVM + VSM (TF-IDF)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
#Defining Evaluation Metrics
def b_tp(preds, labels):
    '''Returns True Positives (TP): number of positions where the prediction and the
    label are both 0, i.e. the class encoded as 0 is treated as the positive class.'''
    return sum(pred == 0 and label == 0 for pred, label in zip(preds, labels))

def b_fp(preds, labels):
    '''Returns False Positives (FP): number of positions where the prediction is 0
    (the positive class) but the label is something other than 0.'''
    return sum(pred == 0 and label != 0 for pred, label in zip(preds, labels))

def b_tn(preds, labels):
    '''Returns True Negatives (TN): number of positions where the prediction and the
    label are both 1, i.e. the class encoded as 1 is treated as the negative class.'''
    return sum(pred == 1 and label == 1 for pred, label in zip(preds, labels))

def b_fn(preds, labels):
    '''Returns False Negatives (FN): number of positions where the prediction is 1
    (the negative class) but the label is something other than 1.'''
    return sum(pred == 1 and label != 1 for pred, label in zip(preds, labels))

def b_metrics(preds, labels):
    '''
    Computes binary-classification metrics from predicted and true class ids.

    The confusion counts come from b_tp / b_tn / b_fp / b_fn, which treat the
    class encoded as 0 as positive and the class encoded as 1 as negative.

    Parameters:
    - preds:  iterable of predicted class ids
    - labels: sized iterable of true class ids, aligned with preds

    Returns the tuple (accuracy, precision, recall, specificity, f1_score):
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
    - f1_score    = 2 * precision * recall / (precision + recall)

    A metric whose denominator is zero is returned as the string 'nan'.
    f1_score is 'nan' when precision or recall is 'nan', and 0.0 when both are
    defined but there are no true positives (precision = recall = 0).

    NOTE(review): the BERT/CodeBERT validation loops in this notebook call a
    function of this name with raw logits, while this definition compares the
    elements of preds with 0 and 1 directly. It presumably replaces an earlier
    logits-based definition - confirm that re-using the name is intended.
    '''
    tp = b_tp(preds, labels)
    tn = b_tn(preds, labels)
    fp = b_fp(preds, labels)
    fn = b_fn(preds, labels)
    b_accuracy = (tp + tn) / len(labels)
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    if (tp + fp) == 0 or (tp + fn) == 0:
        # Precision or recall is undefined, so F1 is undefined as well
        f1_score = 'nan'
    elif tp == 0:
        # Precision and recall are both 0: the harmonic-mean formula would
        # divide by zero, so report 0.0 directly
        f1_score = 0.0
    else:
        f1_score = 2*((b_precision*b_recall)/(b_precision+b_recall))
    return b_accuracy, b_precision, b_recall, b_specificity, f1_score

### Benchmark

In [None]:
%%time
#Dataset Importation

g_codes1  = pd.read_csv("/kaggle/input/sourcesniffer/SourceSniffer/Benchmark/Benchmark_Raw_Data_2.csv", encoding='latin-1')
#Drop the two index-like columns
g_codes1 = g_codes1.drop(columns=['Unnamed: 0','Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code

g_codes1['Code and Comment'] = g_codes1['Code and Comment'].str.lower()
g_codes1['Code and Comment'] = g_codes1['Code and Comment'].apply(clean_code)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept

gcoh_data1 = g_codes1[["Code and Comment","Label"]][~g_codes1['Label'].isnull()]

#Dataset Preprocessing (Label Encoding)

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(gcoh_data1['Label'])

gcode_comm1 = gcoh_data1['Code and Comment']
gcode_comm2 = gcode_comm1.to_list()

#TF-IDF features, one column per vocabulary term.
#NOTE(review): the vectorizer is fitted on all rows before the train/test split
#done in the evaluation cell, so vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(gcode_comm2)
feature_names = vectorizer.get_feature_names_out()
#Densify straight to a NumPy array; going through todense().tolist() would first
#build a nested Python list holding every matrix entry.
df_coh = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [None]:
%%time
#Three independent runs, each on a fresh random 75/25 split
for i in range(3):
    x_train, x_test, y_train, y_test = train_test_split(df_coh, gcoh_stat1, test_size=.25)
    clf = SVC(C = 10.0, gamma = 1.0, kernel = 'rbf')
    clf.fit(x_train, y_train)
    print('Accuracy on Test set: %.3f' %  (clf.score(x_test, y_test)))
    #Model Evaluation
    y_pred = clf.predict(x_test)
    accuracy, precision, recall, specificity, f1_score = b_metrics(y_pred, y_test)
    #b_metrics returns the string 'nan' for a metric whose denominator is zero;
    #formatting that string with {:.4f} would raise, so print NaN instead.
    print('Validation Accuracy: {:.4f}'.format(accuracy))
    print('Validation Precision: {:.4f}'.format(precision) if precision != 'nan' else 'Validation Precision: NaN')
    print('Validation Recall: {:.4f}'.format(recall) if recall != 'nan' else 'Validation Recall: NaN')
    print('F1 Score: {:.4f}\n'.format(f1_score) if f1_score != 'nan' else 'F1 Score: NaN\n')

### CoffeeMaker

In [None]:
%%time
#Dataset Importation

g_codes1  = pd.read_csv("/kaggle/input/sourcesniffer/SourceSniffer/CoffeeMaker/CoffeeMaker_Raw_Data_2.csv", encoding='latin-1')
#Drop the two index-like columns
g_codes1 = g_codes1.drop(columns=['Unnamed: 0','Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code

g_codes1['Code and Comment'] = g_codes1['Code and Comment'].str.lower()
g_codes1['Code and Comment'] = g_codes1['Code and Comment'].apply(clean_code)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept

gcoh_data1 = g_codes1[["Code and Comment","Label"]][~g_codes1['Label'].isnull()]

#Dataset Preprocessing (Label Encoding)

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(gcoh_data1['Label'])

gcode_comm1 = gcoh_data1['Code and Comment']
gcode_comm2 = gcode_comm1.to_list()

#TF-IDF features, one column per vocabulary term.
#NOTE(review): the vectorizer is fitted on all rows before the train/test split
#done in the evaluation cell, so vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(gcode_comm2)
feature_names = vectorizer.get_feature_names_out()
#Densify straight to a NumPy array; going through todense().tolist() would first
#build a nested Python list holding every matrix entry.
df_coh = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [None]:
#Three independent runs, each on a fresh random 75/25 split
for i in range(3):
    x_train, x_test, y_train, y_test = train_test_split(df_coh, gcoh_stat1, test_size=.25)
    clf = SVC(C = 10.0, gamma = 1.0, kernel = 'rbf')
    clf.fit(x_train, y_train)
    print('Accuracy on Test set: %.3f' %  (clf.score(x_test, y_test)))
    #Model Evaluation
    y_pred = clf.predict(x_test)
    accuracy, precision, recall, specificity, f1_score = b_metrics(y_pred, y_test)
    #b_metrics returns the string 'nan' for a metric whose denominator is zero;
    #formatting that string with {:.4f} would raise, so print NaN instead.
    print('Validation Accuracy: {:.4f}'.format(accuracy))
    print('Validation Precision: {:.4f}'.format(precision) if precision != 'nan' else 'Validation Precision: NaN')
    print('Validation Recall: {:.4f}'.format(recall) if recall != 'nan' else 'Validation Recall: NaN')
    print('F1 Score: {:.4f}\n'.format(f1_score) if f1_score != 'nan' else 'F1 Score: NaN\n')

### JFreeChart060

In [None]:
%%time
#Dataset Importation

g_codes1  = pd.read_csv("/kaggle/input/sourcesniffer/SourceSniffer/JFreeChart060/JFreeChart060_Raw_Data_2.csv", encoding='latin-1')
#Drop the two index-like columns
g_codes1 = g_codes1.drop(columns=['Unnamed: 0','Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code

g_codes1['Code and Comment'] = g_codes1['Code and Comment'].str.lower()
g_codes1['Code and Comment'] = g_codes1['Code and Comment'].apply(clean_code)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept

gcoh_data1 = g_codes1[["Code and Comment","Label"]][~g_codes1['Label'].isnull()]

#Dataset Preprocessing (Label Encoding)

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(gcoh_data1['Label'])

gcode_comm1 = gcoh_data1['Code and Comment']
gcode_comm2 = gcode_comm1.to_list()

#TF-IDF features, one column per vocabulary term.
#NOTE(review): the vectorizer is fitted on all rows before the train/test split
#done in the evaluation cell, so vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(gcode_comm2)
feature_names = vectorizer.get_feature_names_out()
#Densify straight to a NumPy array; going through todense().tolist() would first
#build a nested Python list holding every matrix entry.
df_coh = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [None]:
#Three independent runs, each on a fresh random 75/25 split
for i in range(3):
    x_train, x_test, y_train, y_test = train_test_split(df_coh, gcoh_stat1, test_size=.25)
    clf = SVC(C = 10.0, gamma = 1.0, kernel = 'rbf')
    clf.fit(x_train, y_train)
    print('Accuracy on Test set: %.3f' %  (clf.score(x_test, y_test)))
    #Model Evaluation
    y_pred = clf.predict(x_test)
    accuracy, precision, recall, specificity, f1_score = b_metrics(y_pred, y_test)
    #b_metrics returns the string 'nan' for a metric whose denominator is zero;
    #formatting that string with {:.4f} would raise, so print NaN instead.
    print('Validation Accuracy: {:.4f}'.format(accuracy))
    print('Validation Precision: {:.4f}'.format(precision) if precision != 'nan' else 'Validation Precision: NaN')
    print('Validation Recall: {:.4f}'.format(recall) if recall != 'nan' else 'Validation Recall: NaN')
    print('F1 Score: {:.4f}\n'.format(f1_score) if f1_score != 'nan' else 'F1 Score: NaN\n')

### JFreeChart071

In [None]:
%%time
#Dataset Importation

g_codes1  = pd.read_csv("/kaggle/input/sourcesniffer/SourceSniffer/JFreeChart071/JFreeChart071_Raw_Data_2.csv", encoding='latin-1')
#Drop the two index-like columns
g_codes1 = g_codes1.drop(columns=['Unnamed: 0','Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code

g_codes1['Code and Comment'] = g_codes1['Code and Comment'].str.lower()
g_codes1['Code and Comment'] = g_codes1['Code and Comment'].apply(clean_code)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept

gcoh_data1 = g_codes1[["Code and Comment","Label"]][~g_codes1['Label'].isnull()]

#Dataset Preprocessing (Label Encoding)

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(gcoh_data1['Label'])

gcode_comm1 = gcoh_data1['Code and Comment']
gcode_comm2 = gcode_comm1.to_list()

#TF-IDF features, one column per vocabulary term.
#NOTE(review): the vectorizer is fitted on all rows before the train/test split
#done in the evaluation cell, so vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(gcode_comm2)
feature_names = vectorizer.get_feature_names_out()
#Densify straight to a NumPy array; going through todense().tolist() would first
#build a nested Python list holding every matrix entry.
df_coh = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [None]:
#Three independent runs, each on a fresh random 75/25 split
for i in range(3):
    x_train, x_test, y_train, y_test = train_test_split(df_coh, gcoh_stat1, test_size=.25)
    clf = SVC(C = 10.0, gamma = 1.0, kernel = 'rbf')
    clf.fit(x_train, y_train)
    print('Accuracy on Test set: %.3f' %  (clf.score(x_test, y_test)))
    #Model Evaluation
    y_pred = clf.predict(x_test)
    accuracy, precision, recall, specificity, f1_score = b_metrics(y_pred, y_test)
    #b_metrics returns the string 'nan' for a metric whose denominator is zero;
    #formatting that string with {:.4f} would raise, so print NaN instead.
    print('Validation Accuracy: {:.4f}'.format(accuracy))
    print('Validation Precision: {:.4f}'.format(precision) if precision != 'nan' else 'Validation Precision: NaN')
    print('Validation Recall: {:.4f}'.format(recall) if recall != 'nan' else 'Validation Recall: NaN')
    print('F1 Score: {:.4f}\n'.format(f1_score) if f1_score != 'nan' else 'F1 Score: NaN\n')

### JHotDraw741

In [None]:
%%time
#Dataset Importation

g_codes1  = pd.read_csv("/kaggle/input/sourcesniffer/SourceSniffer/JHotDraw741/JHotDraw741_Raw_Data_2.csv", encoding='latin-1')
#Drop the two index-like columns
g_codes1 = g_codes1.drop(columns=['Unnamed: 0','Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code

g_codes1['Code and Comment'] = g_codes1['Code and Comment'].str.lower()
g_codes1['Code and Comment'] = g_codes1['Code and Comment'].apply(clean_code)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept

gcoh_data1 = g_codes1[["Code and Comment","Label"]][~g_codes1['Label'].isnull()]

#Dataset Preprocessing (Label Encoding)

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(gcoh_data1['Label'])

gcode_comm1 = gcoh_data1['Code and Comment']
gcode_comm2 = gcode_comm1.to_list()

#TF-IDF features, one column per vocabulary term.
#NOTE(review): the vectorizer is fitted on all rows before the train/test split
#done in the evaluation cell, so vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(gcode_comm2)
feature_names = vectorizer.get_feature_names_out()
#Densify straight to a NumPy array; going through todense().tolist() would first
#build a nested Python list holding every matrix entry.
df_coh = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [None]:
#Three independent runs, each on a fresh random 75/25 split
for i in range(3):
    x_train, x_test, y_train, y_test = train_test_split(df_coh, gcoh_stat1, test_size=.25)
    clf = SVC(C = 10.0, gamma = 1.0, kernel = 'rbf')
    clf.fit(x_train, y_train)
    print('Accuracy on Test set: %.3f' %  (clf.score(x_test, y_test)))
    #Model Evaluation
    y_pred = clf.predict(x_test)
    accuracy, precision, recall, specificity, f1_score = b_metrics(y_pred, y_test)
    #b_metrics returns the string 'nan' for a metric whose denominator is zero;
    #formatting that string with {:.4f} would raise, so print NaN instead.
    print('Validation Accuracy: {:.4f}'.format(accuracy))
    print('Validation Precision: {:.4f}'.format(precision) if precision != 'nan' else 'Validation Precision: NaN')
    print('Validation Recall: {:.4f}'.format(recall) if recall != 'nan' else 'Validation Recall: NaN')
    print('F1 Score: {:.4f}\n'.format(f1_score) if f1_score != 'nan' else 'F1 Score: NaN\n')

### All Datasets

In [None]:
#Dataset Importation
files = ['Benchmark_Raw_Data_2.csv','CoffeeMaker_Raw_Data_2.csv','JFreeChart060_Raw_Data_2.csv',
           'JFreeChart071_Raw_Data_2.csv','JHotDraw741_Raw_Data_2.csv']
folders = ['Benchmark','CoffeeMaker','JFreeChart060','JFreeChart071','JHotDraw741']
parent_path = '/kaggle/input/sourcesniffer/SourceSniffer'

#Read every project's CSV and concatenate them in a single call (instead of
#growing the frame with one concat per file). ignore_index=True gives the
#combined frame a fresh 0..N-1 index rather than repeating each file's row labels.
g_codes1 = pd.concat(
    [pd.read_csv(parent_path+'/'+folder+'/'+file, encoding='latin-1')
     for folder, file in zip(folders, files)],
    axis=0,
    ignore_index=True)

#Drop the two index-like columns
g_codes1 = g_codes1.drop(columns=['Unnamed: 0','Index'])

#Dataset Preprocessing (Cleaning): lower-case the text, then apply clean_code

g_codes1['Code and Comment'] = g_codes1['Code and Comment'].str.lower()
g_codes1['Code and Comment'] = g_codes1['Code and Comment'].apply(clean_code)

#Dataset Preprocessing (Assigning non-null entries to new variable)
#Only rows that carry a Label are kept

gcoh_data1 = g_codes1[["Code and Comment","Label"]][~g_codes1['Label'].isnull()]

#Dataset Preprocessing (Label Encoding)

le = preprocessing.LabelEncoder()
gcoh_stat1 = le.fit_transform(gcoh_data1['Label'])

gcode_comm1 = gcoh_data1['Code and Comment']
gcode_comm2 = gcode_comm1.to_list()

#TF-IDF features, one column per vocabulary term.
#NOTE(review): the vectorizer is fitted on all rows before the train/test split
#done in the evaluation cell, so vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(gcode_comm2)
feature_names = vectorizer.get_feature_names_out()
#Densify straight to a NumPy array; going through todense().tolist() would first
#build a nested Python list holding every matrix entry.
df_coh = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [None]:
#Three independent runs, each on a fresh random 75/25 split
for i in range(3):
    x_train, x_test, y_train, y_test = train_test_split(df_coh, gcoh_stat1, test_size=.25)
    clf = SVC(C = 10.0, gamma = 1.0, kernel = 'rbf')
    clf.fit(x_train, y_train)
    print('Accuracy on Test set: %.3f' %  (clf.score(x_test, y_test)))
    #Model Evaluation
    y_pred = clf.predict(x_test)
    accuracy, precision, recall, specificity, f1_score = b_metrics(y_pred, y_test)
    #b_metrics returns the string 'nan' for a metric whose denominator is zero;
    #formatting that string with {:.4f} would raise, so print NaN instead.
    print('Validation Accuracy: {:.4f}'.format(accuracy))
    print('Validation Precision: {:.4f}'.format(precision) if precision != 'nan' else 'Validation Precision: NaN')
    print('Validation Recall: {:.4f}'.format(recall) if recall != 'nan' else 'Validation Recall: NaN')
    print('F1 Score: {:.4f}\n'.format(f1_score) if f1_score != 'nan' else 'F1 Score: NaN\n')