In [1]:
# Record the version of the `python3` executable found on the shell PATH.
# NOTE(review): the install cell below invokes `python`, not `python3` — confirm both resolve to the kernel's interpreter.
!python3 --version

Python 3.8.0


In [None]:
# Show the NVIDIA driver/GPU status as reported by the `nvidia-smi` command-line tool.
!nvidia-smi

In [None]:
!python -m pip install ekphrasis
!python -m pip install transformers
!python -m pip install imblearn
!python -m pip install pandas
!python -m pip install sklearn
!python -m pip install numpy
!python -m pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
!python -m pip install textattack
!python -m pip install transformers
!python -m pip install ipywidgets==7.4.2
!python -m pip install mlflow
!python -m pip install optuna datasets scipy
!python -m pip install evaluate
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
import time
import numpy as np
import pandas as pd
import sklearn.metrics
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
from _datetime import datetime as dt
import random
import torch
from tqdm import tqdm
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
import datetime
from sklearn.metrics import classification_report, f1_score
from imblearn.under_sampling import RandomUnderSampler
import json
import mlflow

In [None]:
# ekphrasis pipeline shared by clean_text(): normalizes/annotates social-media
# style text and tokenizes it with a lower-casing SocialTokenizer.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # (each term is listed once; 'url' used to appear twice)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

# Markdown patterns that clean_text() replaces with placeholder tokens.
# Raw strings keep the backslashes literal (no invalid-escape warnings), and the
# lazy quantifiers end each match at its own closing bracket/parenthesis so
# several images or links on one line are rewritten one by one instead of
# being swallowed by a single greedy match.
img_1 = re.compile(r'!\[(.*?)\]\(.*?\)')   # ![alt text](source)
link_1 = re.compile(r'\[(.*?)\]\(.*?\)')   # [text](target)
link_2 = re.compile(r'\[(.*?)\]: \S+')     # [label]: target
# Inline `code` spans and ``` fenced ``` blocks. (?:...) is a plain
# non-capturing group; the former "(:?" was a capturing group with an optional
# leading colon, which deleted a ':' placed right before the backtick.
code_1 = re.compile(r'(?:`[^`]+`|```[^`]*```)')

def preprocess(row):
  """Return the cleaned text of a single issue.

  The title and body (read from `row.issue_title` and `row.issue_body`) are
  converted to strings, joined with one space and passed through clean_text().
  """
  parts = (str(row.issue_title), str(row.issue_body))
  return clean_text(" ".join(parts))

def clean_text(text):
  """Replace markdown constructs with placeholders and run the ekphrasis pipeline.

  Images, links and code spans are rewritten (in that order) to `<img>`,
  `<url>` and `<code>` markers, then the result is processed by
  `text_processor` and its output joined back into one space-separated string.
  """
  # Order matters: images must be rewritten before links, since the link
  # pattern would also match the bracket/parenthesis part of an image.
  substitutions = (
    (img_1, r'\1 <img>'),
    (link_1, r'\1 <url>'),
    (link_2, r'\1 <url>'),
    (code_1, '<code>'),
  )
  for pattern, replacement in substitutions:
    text = pattern.sub(replacement, text)
  tokens = text_processor.pre_process_doc(text)
  return " ".join(tokens)


In [7]:
def preprocess_rows(df, label_encoder):
    """Build the model-ready frame: cleaned issue text plus encoded label.

    Args:
        df: DataFrame with `issue_title`, `issue_body` and `issue_label` columns.
        label_encoder: fitted encoder whose `transform` maps `issue_label`
            values to the values stored in the `label` column.

    Returns:
        A new DataFrame with exactly two columns, `text` and `label`.
    """
    # Missing titles/bodies become empty strings so concatenation never yields NaN.
    df = df.fillna({
        'issue_title': '',
        'issue_body': '',
    })
    # Separate title and body with a space (as preprocess() does); without it
    # the last word of the title is fused with the first word of the body.
    df['text'] = df['issue_title'].astype(str) + ' ' + df['issue_body'].astype(str)
    df['label'] = label_encoder.transform(df['issue_label'])
    df = df.filter(['text', 'label'])
    df['text'] = [clean_text(text) for text in tqdm(df['text'])]
    return df

In [8]:
def classification_report_to_dataframe(str_representation_of_report):
    """Parse the text of a classification report into a DataFrame.

    The first line is taken as the header (metric names); every following
    non-blank line becomes one row. Values are kept as the strings found in
    the report.

    Args:
        str_representation_of_report: the report text, e.g. the string
            returned by `sklearn.metrics.classification_report`.

    Returns:
        DataFrame whose first column (named '') holds the row label and whose
        remaining columns are named after the header tokens. Rows that carry
        fewer values than the header (such as the `accuracy` row) are padded
        with NaN right after the label so the values stay right-aligned.
    """
    lines = str_representation_of_report.splitlines()
    header_tokens = lines[0].split() if lines else []
    column_names = [''] + header_tokens
    n_value_cols = len(header_tokens)

    values = []
    for line in lines[1:]:
        tokens = line.split()
        if not tokens:
            continue  # blank separator line
        if len(tokens) > n_value_cols:
            # The trailing n_value_cols tokens are the metrics; everything before
            # them is the label, which may contain spaces ("macro avg",
            # "feature request", ...).
            label_len = len(tokens) - n_value_cols
            row = [' '.join(tokens[:label_len])] + tokens[label_len:]
        else:
            # Short row (e.g. "accuracy  0.60  5"): pad the missing leading metrics.
            padding = [np.nan] * (n_value_cols + 1 - len(tokens))
            row = tokens[:1] + padding + tokens[1:]
        values.append(row)

    return pd.DataFrame(data=values, columns=column_names)

In [9]:
# download the training set if it does not exist
if not os.path.isfile("github-labels-top3-803k-train.csv"):
  !curl "https://tickettagger.blob.core.windows.net/datasets/github-labels-top3-803k-train.tar.gz" | tar -xz

trainset = pd.read_csv("github-labels-top3-803k-train.csv")

In [None]:
# Display the loaded training frame as the cell output (quick visual sanity check).
trainset

In [11]:
# Seed every random number generator used here (Python, NumPy, PyTorch CPU
# and all CUDA devices) with the same value.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
# Fit the label encoder on the issue_label column of the training frame.
lenc = LabelEncoder().fit(trainset["issue_label"])
lenc

In [13]:
# Keep only the first row for each issue_url.
trainset = trainset.drop_duplicates(subset=['issue_url'])

In [14]:
# Number of rows per issue_label value (computed after de-duplication when cells run in order).
train_size = trainset.groupby("issue_label").size()

In [15]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split


# Split the dataset into training (90%) and evaluation (10%) sets while
# preserving the label distribution. The split reuses the notebook-wide
# `seed_val` instead of repeating the literal seed.
train_dataset, eval_dataset = train_test_split(
    trainset,
    test_size=0.1,
    stratify=trainset['issue_label'],
    random_state=seed_val,
)


In [None]:
# Clean the text and encode the labels of both splits with the same fitted encoder (`lenc`).
train_set = preprocess_rows(train_dataset, lenc)
eval_set = preprocess_rows(eval_dataset, lenc)

In [None]:
# Re-seed Python, NumPy and PyTorch (CPU + all CUDA devices) before the
# modelling cells, and report the seed in the cell output.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

print('Using seed: {}'.format(seed_val))

In [None]:
# Second encoder fitted on the same issue_label column as `lenc`.
# NOTE(review): the cells shown here only pass `lenc` to preprocess_rows — confirm whether this one is still needed.
label_encoder = LabelEncoder()
label_encoder.fit(trainset['issue_label'])

In [19]:
from datasets import Dataset
# Convert the preprocessed DataFrames into `datasets.Dataset` objects (`Dataset` here is the
# class imported from `datasets` just above, not the torch.utils.data one imported earlier).
# NOTE(review): the names are rebound from DataFrame to Dataset, so re-running this cell
# alone would hand a Dataset to from_pandas — re-run the preprocessing cell first.
train_set = Dataset.from_pandas(train_set)
eval_set = Dataset.from_pandas(eval_set)

In [20]:
# Checkpoint name used for both the tokenizer and the classifier (a RoBERTa model, despite the variable name).
BERT_MODEL = 'roberta-base'
# Size of the classification head.
# NOTE(review): presumably equals the number of distinct issue_label values — confirm against len(lenc.classes_).
NUM_LABELS = 3


In [21]:
# Tokenizer matching BERT_MODEL; model_max_length=128 is the length that tokenize_function's
# padding='max_length' / truncation=True settings pad and truncate to.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, model_max_length=128)

In [22]:
def tokenize_function(examples):
    """Tokenize the `text` field of a batch with the notebook-level `tokenizer`.

    Sequences are padded to the tokenizer's maximum length and truncated when longer.
    """
    texts = examples["text"]
    return tokenizer(texts, padding='max_length', truncation=True)

In [None]:
# Apply tokenize_function to both splits; batched=True passes batches of examples to the function at once.
tokenized_dataset_train = train_set.map(tokenize_function, batched=True)
tokenized_dataset_eval = eval_set.map(tokenize_function, batched=True)

In [24]:

# Fixed (non-searched) training hyperparameters, consumed by TrainingArguments below.
WEIGHT_DECAY= 1e-8
PER_DEVICE_TRAIN_BATCH_SIZE = 32

In [25]:
import evaluate
# Load the F1 metric once. The averaging mode is an argument of `compute()`,
# not of `evaluate.load()`, so it is specified only in compute_metrics below.
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute micro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: pair `(predictions, labels)`; `predictions` holds per-class
            scores along its last axis, `labels` the reference class ids.

    Returns:
        The result of `f1_metric.compute` for the arg-max predictions.
    """
    predictions, labels = eval_pred
    predicted_classes = predictions.argmax(axis=-1)
    return f1_metric.compute(predictions=predicted_classes, references=labels, average='micro')

In [26]:
def model_init():
    """Create a fresh sequence-classification model from the BERT_MODEL checkpoint.

    Passed to the Trainer as `model_init`, which calls it to obtain the model.
    """
    # NOTE(review): `max_length` is forwarded to from_pretrained as an extra keyword;
    # confirm it has the intended effect — input length is set by the tokenizer.
    model = AutoModelForSequenceClassification.from_pretrained(
        BERT_MODEL,
        num_labels=NUM_LABELS,
        max_length=128,
    )
    return model

In [27]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

# Training configuration shared by every hyperparameter-search trial: checkpoints/logs go to
# "test_trainer", 4 epochs per run, and the notebook-wide seed is reused. Settings not listed
# here are left at the TrainingArguments defaults.
training_args = TrainingArguments(output_dir="test_trainer",
                                weight_decay=WEIGHT_DECAY,
                                per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
                                  num_train_epochs=4,
                                seed = seed_val)

In [None]:
from transformers import Trainer
# The Trainer receives `model_init` rather than a model instance, so it builds the model itself
# by calling model_init(); compute_metrics supplies the F1 score reported by evaluation.
trainer = Trainer(
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

In [29]:
def optuna_hp_space(trial):
    """Define the search space for the hyperparameter search.

    Only the learning rate is searched; it is sampled as a categorical choice
    through `trial.suggest_categorical`.

    Returns:
        Dict with the single key "learning_rate" holding the sampled value.
    """
    candidate_rates = [5e-5, 4e-5, 3e-5, 2e-5]
    chosen_rate = trial.suggest_categorical("learning_rate", candidate_rates)
    return {"learning_rate": chosen_rate}

In [None]:
# No compute_objective is passed, so the default objective is used: the sum of
# all metrics when metrics are provided. compute_metrics returns F1, hence the
# search direction is "maximize".
# NOTE(review): optuna_hp_space offers four learning rates, so with n_trials=5
# at least one value is necessarily trained twice.

best_run = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    direction="maximize", 
    backend="optuna", 
    n_trials=5, # number of complete training runs to launch
)