In [2]:
!python3 --version

Python 3.8.0


In [3]:
!nvidia-smi

Wed Oct 26 15:42:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  Off  | 00000000:25:00.0 Off |                    0 |
| N/A   24C    P0    31W / 250W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [None]:
!python -m pip install ekphrasis
!python -m pip install transformers
!python -m pip install imblearn
!python -m pip install pandas
!python -m pip install sklearn
!python -m pip install numpy
!python -m pip install keras
!python -m pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
!python -m pip install textattack
!python -m pip install transformers
!python -m pip install ipywidgets==7.4.2
!python -m pip install mlflow
!jupyter nbextension enable --py widgetsnbextension

In [6]:
import os
import time
import numpy as np
import pandas as pd
import sklearn.metrics
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
from keras_preprocessing.sequence import pad_sequences
from _datetime import datetime as dt
import random
import torch
from tqdm import tqdm
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
import datetime
from sklearn.metrics import classification_report, f1_score
from imblearn.under_sampling import RandomUnderSampler
import json
import mlflow

In [None]:
# Terms that will be normalized.
# NOTE(review): 'url' is listed twice; kept exactly as in the original configuration.
_NORMALIZE_TERMS = ['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'url', 'date', 'number']

# Terms that will be annotated.
_ANNOTATE_TERMS = {"hashtag", "allcaps", "elongated", "repeated",
                   'emphasis', 'censored'}

text_processor = TextPreProcessor(
    normalize=_NORMALIZE_TERMS,
    annotate=_ANNOTATE_TERMS,
    # fix HTML tokens
    fix_html=True,
    # corpus whose word statistics are used for word segmentation
    segmenter="twitter",
    # corpus whose word statistics are used for spell correction
    corrector="twitter",
    # perform word segmentation on hashtags
    unpack_hashtags=True,
    # unpack contractions (can't -> can not)
    unpack_contractions=True,
    # spell correction for elongated words
    spell_correct_elong=False,
    # the tokenizer takes a string and returns a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # dictionaries used to replace tokens extracted from the text
    dicts=[emoticons],
)

# Markdown patterns used by clean_text. Raw strings avoid invalid-escape
# warnings, and the lazy quantifiers stop one match from swallowing several
# links/images that share a line.

# Image: ![alt](target) -- group 1 is the alt text.
img_1 = re.compile(r'!\[(.*?)\]\(.*?\)')
# Inline link: [label](target) -- group 1 is the label.
link_1 = re.compile(r'\[(.*?)\]\(.*?\)')
# Reference-style link definition: [label]: target -- group 1 is the label.
link_2 = re.compile(r'\[(.*?)\]: \S+')
# Inline code span (`...`) or fenced block (```...```). The group is
# non-capturing; the previous "(:?" spelling matched an optional colon and
# therefore removed a ':' standing directly in front of a code span.
code_1 = re.compile(r'(?:`[^`]+`|```[^`]*```)')

def preprocess(row):
  """Build the cleaned text of one issue from its title and body.

  Args:
    row: object exposing `issue_title` and `issue_body` attributes.

  Returns:
    The result of `clean_text` applied to "<title> <body>".

  Missing fields (None or a float NaN) contribute an empty string instead of
  the literal words "None"/"nan", matching how `preprocess_rows` fills
  missing titles and bodies with ''.
  """
  def _as_text(value):
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
      return ""
    return str(value)

  doc = _as_text(row.issue_title) + " " + _as_text(row.issue_body)
  return clean_text(doc)

def clean_text(text):
  """Replace markdown images, links and code with placeholder tags, then tokenize.

  The substitutions run in a fixed order (images, inline links, reference
  links, code); the result is passed through `text_processor` and the
  returned tokens are joined with single spaces.
  """
  substitutions = (
    (img_1, r'\1 <img>'),
    (link_1, r'\1 <url>'),
    (link_2, r'\1 <url>'),
    (code_1, '<code>'),
  )
  for pattern, replacement in substitutions:
    text = pattern.sub(replacement, text)
  tokens = text_processor.pre_process_doc(text)
  return " ".join(tokens)


In [8]:
def preprocess_rows(df, label_encoder):
    """Turn raw issue rows into a two-column frame of cleaned text and encoded label.

    Args:
        df: frame with `issue_title`, `issue_body` and `issue_label` columns.
        label_encoder: object whose `transform` maps the label column to codes.

    Returns:
        A new frame with columns `text` (title and body joined by a space and
        passed through `clean_text`) and `label`. Missing titles/bodies are
        treated as empty strings. The input frame is not modified.
    """
    filled = df.fillna({'issue_title': '', 'issue_body': ''})
    prepared = filled.assign(
        text=filled['issue_title'] + ' ' + filled['issue_body'],
        label=label_encoder.transform(filled['issue_label']),
    ).filter(['text', 'label'])
    # tqdm only reports progress; cleaning is applied row by row.
    return prepared.assign(text=[clean_text(raw) for raw in tqdm(prepared['text'])])

In [9]:
def classification_report_to_dataframe(str_representation_of_report):
    """Parse the text form of a classification report into a DataFrame.

    Args:
        str_representation_of_report: the report as a string, one table row
            per line, with the column headers on the first line.

    Returns:
        A DataFrame whose first column (named '') holds the row label and
        whose remaining columns are named after the header tokens. Cell
        values are kept as strings; rows that carry only three tokens (such
        as the accuracy line) get NaN in the second and third columns so that
        they line up with the five-column layout.

    Every non-blank line after the header is parsed, whether or not the
    report ends with a newline (the previous `[1:-1]` slice dropped the last
    row when the trailing newline had been stripped).
    """
    lines = str_representation_of_report.splitlines()
    header_tokens = lines[0].split() if lines else []
    column_names = [''] + header_tokens

    values = []
    for line in lines[1:]:
        cells = line.split()
        if not cells:
            continue  # blank separator line
        # "macro avg" / "weighted avg" are two tokens but a single row label.
        if len(cells) > 1 and cells[1] == 'avg':
            cells[0:2] = [' '.join(cells[0:2])]
        # Three-token rows have no values for the second and third columns.
        if len(cells) == 3:
            cells[1:1] = [np.nan, np.nan]
        values.append(cells)

    return pd.DataFrame(data=values, columns=column_names)

In [10]:
# download the training set if it does not exist
if not os.path.isfile("github-labels-top3-803k-train.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

trainset = pd.read_csv("github-labels-top3-803k-train.csv")

In [11]:
if not os.path.isfile("github-labels-top3-803k-test.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-test.tar.gz" | tar -xz

testset = pd.read_csv("github-labels-top3-803k-test.csv")

In [12]:
# Seed every random number generator used in this notebook with the same value.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
# Fit a label encoder on the `issue_label` column of the training set.
# NOTE(review): a second encoder, `label_encoder`, is fitted on the same column
# in the training cell further down — confirm whether `lenc` is still needed.
lenc = LabelEncoder()
lenc.fit(trainset["issue_label"])

In [14]:
# Keep a single row per issue URL in both splits.
trainset = trainset.drop_duplicates(subset=['issue_url'])
testset = testset.drop_duplicates(subset=['issue_url'])

In [15]:
# Load the JSON lookup stored next to the notebook's parent directory.
# The previous literal '..\dict.json' contained an invalid "\d" escape and only
# means "parent directory" on Windows; on POSIX the backslash is part of the
# file name. The portable path is tried first and the literal name is kept as
# a fallback so an existing setup keeps working.
_memo_candidates = (os.path.join(os.pardir, 'dict.json'), '..\\dict.json')
memo_path = next(
    (candidate for candidate in _memo_candidates if os.path.isfile(candidate)),
    _memo_candidates[0],  # let open() raise a clear FileNotFoundError
)
with open(memo_path, encoding='utf-8') as f:
    memo_dict = json.load(f)

In [None]:
# Re-seed every random number generator right before the experiment cell.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

print('Using seed: {}'.format(seed_val))

In [None]:
from datasets import Dataset
from imblearn.under_sampling import RandomUnderSampler
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import confusion_matrix

# Threshold applied to the `stars` value looked up for each issue's repository.
STAR_FILTER = 1500


def _filter_by_stars(issues, star_lookup, min_stars):
    """Return the rows of `issues` whose repository has at least `min_stars`.

    Args:
        issues: frame with a `repository_url` column.
        star_lookup: mapping from repository URL to a sequence whose first
            element is used as the `stars` value (presumably the repository's
            star count — confirm against how dict.json was built).
        min_stars: inclusive lower bound on `stars`.

    Returns:
        A filtered copy of `issues` with an added `stars` column; the input
        frame is left untouched.

    Raises:
        KeyError: if a repository URL is missing from `star_lookup`.
    """
    with_stars = issues.copy()
    # Iterate over the URL column directly instead of DataFrame.iterrows(),
    # which builds a full Series object for every row.
    with_stars['stars'] = [
        star_lookup[url][0] for url in tqdm(issues['repository_url'])
    ]
    return with_stars.loc[with_stars['stars'] >= min_stars]


filtered_df_train = _filter_by_stars(trainset, memo_dict, STAR_FILTER)
filtered_df_test = _filter_by_stars(testset, memo_dict, STAR_FILTER)

# Encoder fitted on the (unfiltered) training labels.
label_encoder = LabelEncoder().fit(trainset['issue_label'])

# Per-label row counts of the star-filtered splits.
f_train_size = filtered_df_train.groupby("issue_label").size()
f_test_size = filtered_df_test.groupby("issue_label").size()

ISSUE_LABELS = ("bug", "enhancement", "question")

# Every class is sampled down to the number of 'question' rows in the
# star-filtered training split.
question_count = f_train_size['question']
f_under_train_dict = {label: question_count for label in ISSUE_LABELS}

# NOTE(review): not referenced again in this cell — confirm whether it is still needed.
under_test_dict = {label: f_test_size[label] for label in ISSUE_LABELS}

# Test data: the star-filtered test split.
f_test_set = preprocess_rows(filtered_df_test, label_encoder)

# Training data: an under-sample of the full `trainset` (not of
# `filtered_df_train`), sized by the counts above.
# NOTE(review): presumably intentional — confirm.
under_sampler = RandomUnderSampler(sampling_strategy=f_under_train_dict, random_state=42)
n_train_set, _ = under_sampler.fit_resample(trainset, list(trainset["issue_label"]))
n_train_set = preprocess_rows(n_train_set, label_encoder)

# Convert both frames to `Dataset` objects.
f_test_set = Dataset.from_pandas(f_test_set)
n_train_set = Dataset.from_pandas(n_train_set)


# Checkpoint name passed to both the tokenizer and the model loader.
BERT_MODEL = 'roberta-base'
# Passed as `num_labels` when the classification model is loaded.
NUM_LABELS = 3


# Tokenizer for BERT_MODEL, created with model_max_length set to 128.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, model_max_length=128)

def tokenize_function(examples):
    """Tokenize the `text` field of a batch with the notebook-level tokenizer.

    The tokenizer is called with truncation enabled and padding set to
    'max_length'; its return value is passed back unchanged.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True, padding='max_length')
    return encoded

# Tokenize both splits in batches (training split first, then test split).
tokenized_dataset_train, tokenized_dataset_test = (
    split.map(tokenize_function, batched=True)
    for split in (n_train_set, f_test_set)
)

# Optimisation hyper-parameters handed to TrainingArguments below.
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-8
NUM_TRAIN_EPOCHS = 4
PER_DEVICE_TRAIN_BATCH_SIZE = 32

# Sequence-classification model loaded from the BERT_MODEL checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=NUM_LABELS,
    max_length=128,
)

training_args = TrainingArguments(
    output_dir="test_trainer",
    seed=seed_val,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# No evaluation dataset is passed; the model is evaluated after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
)
trainer.train()

# Predict on the tokenized test split and take the highest-scoring class per row.
output = trainer.predict(tokenized_dataset_test)
preds = np.argmax(output.predictions, axis=-1)
true_labels = output.label_ids

# Text report for display, dict form and confusion matrix kept in variables.
report_text = classification_report(true_labels, preds, digits=4)
print(report_text)

cf = classification_report(true_labels, preds, digits=4, output_dict=True)
cm = confusion_matrix(true_labels, preds)

import json

# Predictions are written to predictions/<STAR_FILTER> FILTER.json as a JSON list.
path_to_log = 'predictions'
path_to_file = f'{STAR_FILTER} FILTER.json'
os.makedirs(path_to_log, exist_ok=True)

# Convert to built-in ints so the values are JSON serialisable.
preds = list(map(int, preds))

predictions_path = os.path.join(path_to_log, path_to_file)
with open(predictions_path, 'w') as f:
    json.dump(preds, f)
