In [2]:
import logging 
import os

import numpy as np
import pandas as pd
import torch
import transformers as ppb

from sklearn.model_selection import train_test_split

from google.colab import drive 
# Mount Google Drive at /drive; the input CSV and the output directory used below both live under it.
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [1]:
!pip install transformers



In [3]:
# Read the bug-report CSV from the mounted drive and keep only its first 1000 rows.
reports_file_path = '/drive/My Drive/data/clean/mozilla_bug_report_data.csv'
reports_data = pd.read_csv(reports_file_path).head(1000)

In [4]:
# Preview the first rows to check which columns were loaded.
reports_data.head()

Unnamed: 0,long_description,severity_code
0,is broken many users can t enter bugs on it p...,4
1,adding support for custom headers and cookie n...,4
2,the patch in bug regressed the fix from bug th...,2
3,from bugzilla helper user agent mozilla x u li...,2
4,i found it odd that relogin cgi didn t clear o...,1


In [5]:
# Choose the DistilBERT model class, its matching tokenizer class, and the
# name of the pre-trained checkpoint that both will be loaded from.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [6]:
# Instantiate the tokenizer and the model from the checkpoint named by `pretrained_weights`.
# NOTE(review): from_pretrained presumably downloads and caches the files on first use — confirm.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [8]:
# Cap every description at its first `max_len` whitespace-separated words, then
# turn each capped description into a list of token ids.
# Note: this is a word cap, not a token cap — the padded width recorded further
# down is 188, so a 64-word description can yield more than 64 tokens.
max_len = 64


def _first_words(text):
    """Return `text` reduced to its first `max_len` words, joined by single spaces."""
    return ' '.join(text.split()[:max_len])


def _encode(text):
    """Convert `text` to a list of token ids, including the tokenizer's special tokens."""
    return tokenizer.encode(text, add_special_tokens=True)


descriptions = reports_data['long_description'].apply(_first_words)
tokenized = descriptions.apply(_encode)

In [9]:
# padding: right-fill every token-id list with zeros up to the longest list's
# length, so the result is one rectangular array.
token_lists = tokenized.values
max_len = max((len(ids) for ids in token_lists), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in token_lists])

In [10]:
# Sanity check: (number of reports, padded sequence length).
padded.shape

(1000, 188)

In [11]:
# masking: 1 wherever `padded` holds a non-zero id, 0 at the zero-filled positions
is_real_token = padded != 0
attention_mask = is_real_token.astype(int)
attention_mask.shape

(1000, 188)

In [12]:
# extracting features
# Convert the padded ids and the mask to tensors, then run one forward pass over all rows at once.
input_ids = torch.tensor(padded)
# Rebinds `attention_mask` from the NumPy array built in the masking step to a tensor.
attention_mask = torch.tensor(attention_mask)
# Gradient tracking is switched off for this forward pass.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [15]:
# Take the first element of the model output and keep, for every row, the
# vector at sequence position 0, converted to a NumPy array.
# Presumably position 0 is the [CLS] token added by add_special_tokens=True — TODO confirm.
hidden_states = last_hidden_states[0]
features = hidden_states[:, 0, :].numpy()

In [16]:
# Target column: one severity code per report, in the same row order as `reports_data`.
labels = reports_data['severity_code']

In [17]:
# Hold out 25% of the rows for testing, stratified on the labels.
# The split shuffles the rows; fixing the seed makes the partition (and the
# files written from it) identical on every run of the notebook.
RANDOM_STATE = 42
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=.25,
    stratify=labels,
    random_state=RANDOM_STATE,
)

In [19]:
# Persist each split as a single 2-D array: the feature columns followed by the
# label as the last column. np.column_stack returns one array, so features and
# labels are stored with one common dtype.
reports_output_path = os.path.join('/','drive', 'My Drive', 'data', 'processed')
# Create the output directory when it is missing; otherwise the saves below
# fail on a Drive that has no 'processed' folder yet.
os.makedirs(reports_output_path, exist_ok=True)
torch.save(np.column_stack((train_features, train_labels)), 
        os.path.join(reports_output_path, 'mozilla_bug_report_train_data.pt'))
torch.save(np.column_stack((test_features, test_labels)), 
        os.path.join(reports_output_path, 'mozilla_bug_report_test_data.pt'))