<a href="https://colab.research.google.com/github/demircanserdar/CS412/blob/main/models/distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install accelerate -U
!pip install evaluate
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m302.6/302.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->acc

In [2]:
import torch
import re
import evaluate

import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
data_path = 'preprocessed_text.csv'
test_path = 'preprocessed_submission.csv'

In [None]:
df_train = pd.read_csv(data_path, on_bad_lines='skip')
df_train

Unnamed: 0.1,Unnamed: 0,bug id,summary,severity,filtered_text,classification
0,0,365569,Remove workaround from bug 297227,normal,remove workaround bug,3
1,1,365578,Print Preview crashes on any URL in gtk2 builds,critical,print preview crash url gtk build,6
2,2,365582,Lines are not showing in table,major,line show table,4
3,3,365584,Firefox render ¬â√õ√èsimplified Arabic¬â√õ¬ù font fa...,normal,firefox render simplify arabic font face incor...,3
4,4,365597,Crash [@ nsINodeInfo::NodeInfoManager],critical,crash ns node info node info manager,6
...,...,...,...,...,...,...
159993,159993,1143381,block elements with height after float left or...,normal,block elements height float leave right corrup...,3
159994,159994,1143392,typing in google translate will send reset inp...,normal,type google translate send reset input method ...,3
159995,159995,1143394,[gstreamer] Nightly instantly crashes on Youtu...,critical,gstreamer nightly instantly crash youtube linu...,6
159996,159996,1143395,Right click on Flash object with accessibility...,critical,right click flash object accessibility active ...,6


In [None]:
df_test = pd.read_csv(test_path, on_bad_lines='skip')
df_test

Unnamed: 0.1,Unnamed: 0,bug id,summary,filtered_text,pred
0,0,1143402,Firefox claims to be not the default browser w...,firefox claim default browser update alter...,3
1,1,1143405,Background of html and body element are not ap...,background html body element apply correctly i...,3
2,2,1143409,Mouse input breaks after using window.showModa...,mouse input break use window show modal dial...,3
3,3,1143411,Build failure with next freetype version/curre...,build failure next freetype version current ...,3
4,4,1143417,HTML element is not treated as root inside for...,html element treat root inside foreign object,3
...,...,...,...,...,...
86089,86089,1426166,Crash in bool IsAboutToBeFinalizedInternal<T>,crash bool finalize internal,3
86090,86090,1426171,Potential crash if GraphRate is greater than 4...,potential crash graph rate greater k hz webrt...,3
86091,86091,1426173,Crash in <name omitted> | decltype JS::Dispatc...,crash name omit decltype js dispatch t...,3
86092,86092,1426174,Crash in xul.dll@0x28145fa | xul.dll@0x3c748ff...,crash xul dll x fa xul dll x c ff ...,3


In [None]:
label_encoder = LabelEncoder()

y_numeric = label_encoder.fit_transform(df_train['severity'])
X_train, X_test, y_train, y_test = train_test_split(df_train, y_numeric, test_size=0.2, random_state=42, stratify=y_numeric)
y_numeric = label_encoder.fit_transform(X_train['severity'])
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_numeric)

In [None]:
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

{'blocker': 0,
 'critical': 1,
 'enhancement': 2,
 'major': 3,
 'minor': 4,
 'normal': 5,
 'trivial': 6}

In [None]:
category_list = [words for words in df_train['severity'].to_list()]
category_list = list(set(category_list))[::-1]
label2id = {item: index for index, item in enumerate(category_list)}
id2label = {index: item for index, item in enumerate(category_list)}

In [None]:
label2id

{'normal': 0,
 'blocker': 1,
 'minor': 2,
 'critical': 3,
 'enhancement': 4,
 'major': 5,
 'trivial': 6}

In [None]:
id2label

{0: 'normal',
 1: 'blocker',
 2: 'minor',
 3: 'critical',
 4: 'enhancement',
 5: 'major',
 6: 'trivial'}

In [None]:
X_train['labels'] = X_train['severity'].replace(label2id)
X_test['labels'] = X_test['severity'].replace(label2id)
#X_val['labels'] = X_val['severity'].replace(label2id)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(data_frame):
    return tokenizer(data_frame["filtered_text"], truncation=True)

In [None]:
train_dataset = Dataset.from_pandas(X_train[['filtered_text', 'labels']])
test_dataset = Dataset.from_pandas(X_test[['filtered_text', 'labels']])

tokenized_train_df = train_dataset.map(preprocess_function, batched=True)
tokenized_test_df = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/102390 [00:00<?, ? examples/s]

Map:   0%|          | 0/31998 [00:00<?, ? examples/s]

In [None]:
val_dataset = Dataset.from_pandas(x_val)

tokenized_val_df = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25600 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
precision = evaluate.load("precision") #for macro precision
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return precision.compute(predictions=predictions, references=labels, average = 'macro')

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(X_train['labels'].value_counts()), id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir= '/content/drive/MyDrive/sabanci_sunum/',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_test_df,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
trainer.train() #with this preprocessor

Epoch,Training Loss,Validation Loss,Precision
1,0.5394,0.528414,0.332103
2,0.4948,0.529623,0.560136
3,0.4155,0.547342,0.69508


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=19200, training_loss=0.4949422562122345, metrics={'train_runtime': 1547.8914, 'train_samples_per_second': 198.444, 'train_steps_per_second': 12.404, 'total_flos': 2219559280134564.0, 'train_loss': 0.4949422562122345, 'epoch': 3.0})

In [None]:
df_test = df_test.replace('N/A'," ")

In [None]:
submission_dataset = Dataset.from_pandas(df_test[['filtered_text']])
tokenized_submission_df = submission_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/86094 [00:00<?, ? examples/s]

In [None]:
result = trainer.predict(tokenized_submission_df)
result

PredictionOutput(predictions=array([[ 3.0221803 , -2.3315885 , -0.27449262, ..., -0.29899165,
         0.44630373, -1.1332802 ],
       [ 3.4492555 , -2.787294  ,  0.343203  , ..., -1.0462797 ,
         0.40871945, -0.8334625 ],
       [ 3.093691  , -1.9417846 , -0.4066331 , ..., -2.1828926 ,
         1.4493216 , -1.774365  ],
       ...,
       [ 1.9562358 , -0.9886433 , -3.1178112 , ..., -3.1043785 ,
        -1.1164244 , -3.939883  ],
       [ 1.5897529 , -0.95537674, -3.0223804 , ..., -3.1014924 ,
        -1.1782285 , -3.713039  ],
       [ 3.6944695 , -1.129734  , -0.84431046, ..., -1.3455794 ,
        -0.03155104, -1.6302204 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 81.6753, 'test_samples_per_second': 1054.1, 'test_steps_per_second': 65.883})

In [None]:
result.predictions.shape

(86094, 7)

In [None]:
df_test['severity'] = ""

In [None]:
i = 0
for logits in result.predictions:
  df_test.loc[i,'severity'] = model.config.id2label[logits.argmax().item()]
  i = i + 1

In [None]:
df_test

Unnamed: 0.1,Unnamed: 0,bug id,summary,filtered_text,pred,severity
0,0,1143402,Firefox claims to be not the default browser w...,firefox claim default browser update alter...,3,normal
1,1,1143405,Background of html and body element are not ap...,background html body element apply correctly i...,3,normal
2,2,1143409,Mouse input breaks after using window.showModa...,mouse input break use window show modal dial...,3,normal
3,3,1143411,Build failure with next freetype version/curre...,build failure next freetype version current ...,3,normal
4,4,1143417,HTML element is not treated as root inside for...,html element treat root inside foreign object,3,normal
...,...,...,...,...,...,...
86089,86089,1426166,Crash in bool IsAboutToBeFinalizedInternal<T>,crash bool finalize internal,3,critical
86090,86090,1426171,Potential crash if GraphRate is greater than 4...,potential crash graph rate greater k hz webrt...,3,normal
86091,86091,1426173,Crash in <name omitted> | decltype JS::Dispatc...,crash name omit decltype js dispatch t...,3,critical
86092,86092,1426174,Crash in xul.dll@0x28145fa | xul.dll@0x3c748ff...,crash xul dll x fa xul dll x c ff ...,3,critical


In [None]:
df_test = df_test.rename(columns={'bug id': 'bug_id'})
df_test.set_index("bug_id", inplace=True)

In [None]:
df_test = df_test[['severity']]

In [None]:
df_test.to_csv("bert.csv")