# Prepare
Install the required libraries and import them.

In [0]:
!pip install pytorch-transformers fastprogress

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [0]:
from fastai import *
from fastai.text import *
from fastai.callbacks import *
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.utils.multiclass import unique_labels

In [0]:
from pytorch_transformers import BertTokenizer, BertPreTrainedModel, BertModel, BertConfig
from pytorch_transformers import AdamW

In [0]:
from fastprogress import master_bar, progress_bar
from datetime import datetime

Check whether a GPU is available and, if so, which kind.

In [0]:
# Pick the compute device: the current CUDA device when one is available, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
if cuda_available:
    # Report which GPU model the runtime provides.
    curr_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(curr_device))
device

In [0]:
def get_memory_usage():
    """Return the GPU memory currently allocated by tensors on `device`, in MB (10^6 bytes).

    Returns 0.0 when CUDA is not available, so that the notebook also runs on a
    CPU-only runtime instead of failing inside `torch.cuda.memory_allocated`.
    """
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.memory_allocated(device)/1000000

def get_memory_usage_str():
    """Format the value of `get_memory_usage()` as a line with two decimals."""
    allocated_mb = get_memory_usage()
    return 'Memory usage: {:.2f} MB'.format(allocated_mb)

Create a config

In [0]:
class Config(dict):
    """Dictionary whose entries are also available as attributes (`cfg.key`).

    Item access and attribute access are kept in sync in both directions:
    `cfg.key = v`, `cfg['key'] = v` and `cfg.set('key', v)` all update the
    dict entry *and* the instance attribute. Previously a plain attribute
    assignment left the dict entry stale.
    """

    def __init__(self, **kwargs):
        super().__init__()
        for k, v in kwargs.items():
            self.set(k, v)

    def __setitem__(self, key, val):
        super().__setitem__(key, val)
        # Only string keys can be mirrored as attributes.
        if isinstance(key, str):
            object.__setattr__(self, key, val)

    def __setattr__(self, key, val):
        super().__setitem__(key, val)
        object.__setattr__(self, key, val)

    def set(self, key, val):
        """Store `val` under `key` (as dict entry and as attribute)."""
        self[key] = val

class Fold(Enum):
    """Selects the evaluation strategy used by the training cell."""
    No = 1        # single stratified train/test split
    TenFold = 2   # stratified 10-fold cross-validation
    ProjFold = 3  # folds defined by project ids (`config_data.project_fold`)

# Training hyper-parameters and evaluation settings.
config = Config(
    num_labels=4,                    # placeholder; overwritten after the data has been loaded
    model_name="bert-base-cased",
    max_lr=2e-5,                     # default: 2e-5
    moms=(0.8, 0.7),                 # default: (0.8, 0.7); alt.(0.95, 0.85)
    epochs=16,
    bs=16,                           # default: 2 or 4
    weight_decay=0.01,
    max_seq_len=128,
    train_size=0.75,
    loss_func=nn.CrossEntropyLoss(),
    seed=904727489,                  # default: 904727489, 42 (as in Dalpiaz) or None
    es=False,                        # enable early stopping
    min_delta=0.01,                  # min_delta passed to the early-stopping callback
    fold=Fold.TenFold,               # Fold.No, Fold.TenFold, Fold.ProjFold
)

# Name of the label column in the dataset
clazz = 'clazz'

# Resolve the fold name and the timestamp once, so that the prediction log and the
# result file of one run always carry the same suffix (two separate datetime.now()
# calls can straddle a minute boundary).
_fold_name = Fold(config.fold).name
_run_stamp = datetime.now().strftime('%Y%m%d-%H%M')

# Paths, file names and dataset description. Folder/file entries are concatenated
# onto `root_folder` as plain strings, hence the leading slashes.
config_data = Config(
    root_folder = '.',
    data_folder = '/',
    train_data = ['Raw-DataTrack-Huang_all.csv'],
    label_column = clazz,
    log_file = '/log/' + clazz + '_' + _fold_name + '_classifierPredictions_' + _run_stamp + '.txt',
    result_file = '/log/' + clazz + '_' + _fold_name +  '_classifierResults_' + _run_stamp + '.txt',
    model_path = '/model/',
    model_name = 'NoRBERT.pkl',
    # Each inner list holds the project ids that form the test set of one fold (Fold.ProjFold).
    #project_fold = [[3, 9, 11], [1, 5, 12], [6, 10, 13], [1, 8, 14], [3, 12, 15], [2, 5, 11], [6, 9, 14], [7, 8, 13], [2, 4, 15], [4, 7, 10] ],
    project_fold = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15] ],
    # Class names; their list position is the numeric label used for training.
    #classes = ['US', 'SE', 'O', 'PE'],
    classes= ['A', 'FT', 'L', 'LF', 'MN', 'O', 'PE', 'PO', 'SC', 'SE', 'US'],
)

load_from_gdrive = True   # mount Google Drive and read the dataset from there
save_model = False        # export the trained classifier at the end of the notebook


To import the dataset, we first have to connect to our Google Drive (if the data should be loaded from there). For this, we have to authenticate the access and mount the drive.

In [0]:
if load_from_gdrive:
    # Imported here because google.colab only exists inside a Colab runtime.
    from google.colab import drive
    # Connect to drive to load the corpus from there
    drive.mount('/content/drive', force_remount=True)
    # All data, log and model paths are built relative to this folder from now on.
    config_data.root_folder = '/content/drive/My Drive/x/Dataset/Promise'

In [0]:
def initLog():
    """Start a fresh prediction log: truncate the log file and write a timestamped header.

    The header consists of the current date/time followed by the run
    description returned by `get_info()`.
    """
    logfile = config_data.root_folder + config_data.log_file
    log_txt = datetime.now().strftime('%Y-%m-%d %H:%M') + ' ' + get_info()
    # Explicit utf-8 (like the CSV input) so the log does not depend on the platform default.
    with open(logfile, 'w', encoding='utf-8') as log:
        log.write(log_txt + '\n')

def logLine(line):
    """Append `line` plus a newline to the prediction log file."""
    logfile = config_data.root_folder + config_data.log_file
    # Explicit utf-8: logged requirement texts come from a utf8 CSV and may contain
    # characters the platform default encoding cannot represent.
    with open(logfile, 'a', encoding='utf-8') as log:
        log.write(line + '\n')

def logResult(result):
    """Append the run description (`get_info()`) followed by `result` to the result file."""
    logfile = config_data.root_folder + config_data.result_file
    # Explicit utf-8 so the file does not depend on the platform default encoding.
    with open(logfile, 'a', encoding='utf-8') as log:
        log.write(get_info() + '\n')
        log.write(result + '\n')

In [0]:
def get_info():
    """Return a one-line description of the current run (model, hyper-parameters, data, early stopping)."""
    template = 'model: {}, max_lr: {}, epochs: {}, bs: {}, train_size: {}, weight decay: {}, Seed: {}, Data: {}, Column: {}, EarlyStopping: {}:{}'
    return template.format(
        config.model_name,
        config.max_lr,
        config.epochs,
        config.bs,
        config.train_size,
        config.weight_decay,
        config.seed,
        config_data.train_data,
        config_data.label_column,
        config.es,
        config.min_delta,
    )

In [0]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    When `seed` is None a random seed in [0, 2**31] is drawn first.
    Returns the seed that was actually used.
    """
    chosen = random.randint(0, 2**31) if seed is None else seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(chosen)
    return chosen

# Seed all random number generators before any data shuffling or model creation.
set_seed(config.seed)

# Data


Create proper tokenizer for our data

In [0]:
class FastAiBertTokenizer(BaseTokenizer):
    """Adapter that exposes a pretrained `BertTokenizer` through fast.ai's tokenizer interface."""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=512, **kwargs):
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance returns the instance itself; presumably this lets an
        # already-configured object be used where fast.ai expects a tokenizer factory
        # (`tok_func`) — TODO confirm against the fast.ai Tokenizer implementation.
        return self

    def tokenizer(self, t:str):
        """Tokenize `t`, truncate to `max_seq_len - 2` tokens and wrap the result in [CLS] ... [SEP]."""
        word_pieces = self._pretrained_tokenizer.tokenize(t)
        kept = word_pieces[:self.max_seq_len - 2]
        return ["[CLS]"] + kept + ["[SEP]"]



Now, we can create our own databunch using the tokenizer above. Notice we're passing the include_bos=False and include_eos=False options. This is to prevent fastai from adding its own SOS/EOS tokens that will interfere with BERT's SOS/EOS tokens.

We can pass our own list of Preprocessors to the databunch.

In [0]:
class BertTokenizeProcessor(TokenizeProcessor):
    """Tokenize processor with fast.ai's own BOS/EOS markers switched off.

    The start/end markers ([CLS]/[SEP]) are added by the tokenizer itself.
    """
    def __init__(self, tokenizer):
        marker_options = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_options)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """Use a custom vocabulary to match the original BERT model.

    `vocab` may be passed explicitly. When it is omitted, the vocabulary is built
    from a module-level `bert_tok` if the notebook defines one; otherwise the
    tokenizer of `config.model_name` is loaded. Previously the class referenced
    `bert_tok` unconditionally and raised NameError, because no module-level
    `bert_tok` exists.
    """
    def __init__(self, *args, vocab=None, **kwargs):
        if vocab is None:
            try:
                tok = bert_tok
            except NameError:
                tok = BertTokenizer.from_pretrained(config.model_name)
            vocab = Vocab(list(tok.vocab.keys()))
        super().__init__(*args, vocab=vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """Return the processor list [tokenize step, numericalize step] for BERT input.

    The tokenize step is a `BertTokenizeProcessor`; the numericalize step is the
    stock `NumericalizeProcessor` with the given `vocab`.
    """
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

class BertDataBunch(TextDataBunch):
    """`TextDataBunch` whose `from_df` uses the processors from `get_bert_processor`."""

    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
              tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
              label_cols:IntsOrStrs=0, **kwargs) -> DataBunch:
        "Create a `TextDataBunch` from DataFrames, tokenizing and numericalizing with the BERT processors."
        # Keyword arguments understood by get_bert_processor go to the processor,
        # everything else is forwarded to `databunch`.
        processor_kwargs, databunch_kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **processor_kwargs)

        # Several label columns without explicit classes: the column names act as classes.
        if classes is None:
            if is_listy(label_cols) and len(label_cols) > 1:
                classes = label_cols

        train_list = TextList.from_df(train_df, path, cols=text_cols, processor=processor)
        valid_list = TextList.from_df(valid_df, path, cols=text_cols, processor=processor)
        src = ItemLists(path, train_list, valid_list)

        if cls==TextLMDataBunch:
            src = src.label_for_lm()
        else:
            src = src.label_from_df(cols=label_cols, classes=classes)

        if test_df is not None:
            src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**databunch_kwargs)

Create the BertTextClassifier-Class

In [0]:
class BertTextClassifier(BertPreTrainedModel):
    """Pretrained BERT encoder followed by dropout and one linear classification layer.

    `forward` returns raw logits; the loss function passed to the learner is
    expected to apply the softmax itself (e.g. `nn.CrossEntropyLoss`).
    """

    def __init__(self, model_name, num_labels):
        """Load configuration and weights of `model_name` and add a `num_labels`-way classifier head."""
        config = BertConfig.from_pretrained(model_name)
        super().__init__(config)
        self.num_labels = num_labels

        self.bert = BertModel.from_pretrained(model_name, config=config)

        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        # The explicit `init_weights` call is deliberately left out (it was disabled in
        # the original notebook as well).
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, tokens, labels=None, position_ids=None, token_type_ids=None, attention_mask=None, head_mask=None):
        """Return the classification logits for a batch of token ids.

        `labels` is accepted for interface compatibility but not used.
        """
        # NOTE(review): batches are padded with index 0 by `create_databunch`, and no
        # attention mask is supplied by default, so padding positions are presumably
        # attended to — confirm whether a mask should be derived from `tokens`.
        outputs = self.bert(tokens, position_ids=position_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, head_mask=head_mask)

        # outputs[1] is the pooled output. According to the pytorch-transformers
        # documentation, averaging the hidden states (outputs[0]) over the sequence
        # is often the better sentence representation:
        #   pooled_output = torch.mean(outputs[0], 1)
        pooled_output = outputs[1]

        # The unused softmax over the logits was removed: its result was discarded
        # on every forward pass.
        logits = self.classifier(self.dropout(pooled_output))
        return logits

Load the dataset

In [0]:
def create_label_indices(classes=None):
    """Build the mapping {numeric label: class name}, with 'Other' as the last entry.

    Args:
        classes: class names in label order; defaults to `config_data.classes`.

    Returns:
        dict mapping 0..len(classes) to the class names followed by 'Other'.

    The input list is copied, not modified. Previously 'Other' was appended to
    `config_data.classes` itself, so every re-run of the cell added another
    'Other' entry.
    """
    if classes is None:
        classes = config_data.classes
    labels = list(classes) + ['Other']
    return dict(enumerate(labels))

# Module-level lookup table (numeric label -> class name); read by load_data and by
# the prediction/report code further down.
label_indices = create_label_indices()
print(label_indices)

In [0]:
def load_data(filename):
    """Load one ';'-separated requirements file and return its NFR rows with numeric labels.

    Steps: read the CSV (the file's header row is replaced by fixed column
    names), drop rows containing NaN, keep rows with NFR == 1, map the class
    name in the label column to its index in `label_indices` (unknown names
    become the index of 'Other'), and finally drop all rows with label 7.

    Args:
        filename: file name relative to `config_data.root_folder + config_data.data_folder`.

    Returns:
        pandas.DataFrame with an integer label column.
    """
    fpath = config_data.root_folder + config_data.data_folder + filename
    print(fpath)
    df = pd.read_csv(fpath, delimiter=';', header=0, encoding='utf8', names=['number', 'ProjectID', 'RequirementText', 'clazz', 'NFR', 'F', 'A', 'FT', 'L', 'LF', 'MN', 'O', 'PE', 'PO', 'SC', 'SE', 'US'])
    df = df.dropna()
    # Explicit copy: the label column is reassigned below, which must not act on a
    # view of the unfiltered frame.
    df = df[df['NFR'] == 1].copy()

    label_col = config_data.label_column
    inv_map = {v: k for k, v in label_indices.items()}
    # One assignment instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place call does not update `df` when pandas copy-on-write is active.
    df[label_col] = df[label_col].map(inv_map).fillna(inv_map.get('Other')).astype(int)
    # Label 7 is excluded from the experiments; with the default class list this
    # index is 'PO' — NOTE(review): adjust if `config_data.classes` changes.
    df = df.loc[df[label_col] != 7]
    return df

def load_all_data(filenames):
    """Load every file in `filenames` with `load_data` and stack the rows in file order.

    The original row indices of the individual frames are kept. Uses `pd.concat`,
    because `DataFrame.append` was removed in pandas 2.0.
    """
    frames = [load_data(name) for name in filenames]
    return pd.concat(frames)



In [0]:
# Load and concatenate all training files listed in the data config.
df = load_all_data(config_data.train_data)

# Shuffle all rows (seeded, so the order is reproducible) and record the number of
# distinct labels actually present in the data. Note that `sample` keeps the original
# index labels, so the index is no longer in positional order afterwards.
df = df.sample(frac=1, axis=0, random_state = config.seed)
config.num_labels = df[config_data.label_column].nunique()

# Quick overview: dataset size, label distribution and rows per project.
print(df.shape)
print(df[config_data.label_column].value_counts())
print(df['ProjectID'].value_counts())



# Predictor


Create a predictor class. 

In [0]:
class Predictor:
    """Thin wrapper that turns a trained learner's prediction into the class it stands for."""

    def __init__(self, classifier):
        self.classifier = classifier
        # The class list of the learner's data; predictions index into it.
        self.classes = classifier.data.classes

    def predict(self, text):
        """Return the entry of `self.classes` selected by element 1 of `classifier.predict(text)`."""
        class_position = self.classifier.predict(text)[1]
        return self.classes[class_position]

# Create and train the learner/classifier


Create the needed functions to create and train a classifier



In [0]:
def split_dataframe(df, train_size = 0.8, random_state = None):
    """Split `df` into a training and a validation frame, stratified by the label column.

    Returns the tuple (training frame, validation frame).
    """
    strata = df[config_data.label_column]
    train_part, valid_part = train_test_split(df, stratify = strata, train_size = train_size, random_state = random_state)
    return train_part, valid_part
  
def create_databunch(config, df_trn, df_valid):
    """Build the `BertDataBunch` for the given training and validation frames.

    The tokenizer and vocabulary come from the pretrained model `config.model_name`;
    texts are read from 'RequirementText', labels from `config_data.label_column`.
    """
    bert_tok = BertTokenizer.from_pretrained(config.model_name,)
    # No fast.ai pre/post rules: the text is passed to the BERT tokenizer unchanged.
    tok_func = FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len)
    fastai_tokenizer = Tokenizer(tok_func=tok_func, pre_rules=[], post_rules=[])
    fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))
    # Pad at the end of each sequence with index 0.
    collate = partial(pad_collate, pad_first=False, pad_idx=0)
    databunch_args = dict(
        train_df=df_trn,
        valid_df=df_valid,
        tokenizer=fastai_tokenizer,
        vocab=fastai_bert_vocab,
        bs=config.bs,
        text_cols='RequirementText',
        label_cols=config_data.label_column,
        collate_fn=collate,
    )
    return BertDataBunch.from_df(".", **databunch_args)


def create_learner(config, databunch):
    """Create a fast.ai `Learner` around a fresh `BertTextClassifier`.

    Uses AdamW, the configured weight decay and loss function, and accuracy as
    metric. When `config.es` is set, an `EarlyStoppingCallback` monitoring
    accuracy (patience 4, `config.min_delta`) is attached.
    """
    model = BertTextClassifier(config.model_name, config.num_labels)
    optimizer = partial(AdamW)

    learner_kwargs = dict(
        wd = config.weight_decay,
        metrics=accuracy,
        loss_func=config.loss_func,
    )
    if config.es:
        early_stopping = partial(EarlyStoppingCallback, monitor='accuracy', min_delta=config.min_delta, patience=4)
        learner_kwargs['callback_fns'] = [early_stopping]

    return Learner(databunch, model, optimizer, **learner_kwargs)

Actually create the trained classifier

In [0]:
# Create the classifier
def create_classifier(config, df):
    """Split `df` into training/validation parts and return an untrained learner for them."""
    train_part, valid_part = split_dataframe(df, train_size = config.train_size, random_state = config.seed)
    databunch = create_databunch(config, train_part, valid_part)
    return create_learner(config, databunch)

Define train and test loop


In [0]:
def train_and_predict(df_train, df_eval, overall_flat_predictions, overall_flat_true_labels):
    """Train a new classifier on `df_train`, evaluate it on `df_eval` and report the result.

    Every evaluated requirement is written to the prediction log; the
    classification report of this run is printed and written to the result file.
    The predictions and true labels of this run are appended to the two
    `overall_*` lists, which are returned as (predictions, true labels).
    """
    learner = create_classifier(config, df_train)
    # Train the classifier
    print(learner.fit_one_cycle(config.epochs, max_lr=config.max_lr, moms=config.moms, wd=config.weight_decay))

    # Predict row by row; itertuples() puts the index at position 0, hence the +1.
    predictor = Predictor(learner)
    label_pos = df_eval.columns.get_loc(config_data.label_column) + 1
    run_predictions, run_true_labels = [], []
    for row in progress_bar(df_eval.itertuples(), total=len(df_eval)):
        text = row.RequirementText
        gold = row[label_pos]
        run_true_labels.append(gold)
        predicted = predictor.predict(text)
        run_predictions.append(predicted)
        logLine('{}, {} -> {}'.format(text, label_indices.get(gold), label_indices.get(predicted)))

    # Name exactly those labels that occur in the true labels or the predictions.
    present_labels = np.sort(unique_labels(run_true_labels, run_predictions))
    target_names = [label_indices.get(label) for label in present_labels]

    report = classification_report(run_true_labels, run_predictions, target_names=target_names, digits = 5)
    logResult(report)
    print(report)

    overall_flat_predictions.extend(run_predictions)
    overall_flat_true_labels.extend(run_true_labels)
    return overall_flat_predictions, overall_flat_true_labels

Decide how to fold and train

In [0]:
# Predictions and true labels accumulated over all folds / the single split.
overall_flat_predictions, overall_flat_true_labels = [], []
initLog()
if config.fold == Fold.TenFold:
  # Stratified 10-fold cross-validation over the shuffled dataset.
  skf = StratifiedKFold(n_splits=10)
  fold_number = 1
  for train, test in skf.split(df, df[config_data.label_column]):
    # skf.split yields positional indices, so iloc is the right accessor here.
    df_train = df.iloc[train]
    df_eval = df.iloc[test]
    log_text = '/////////////////////// Fold: {} of {} /////////////////////////////'.format(fold_number,10)
    logLine(log_text)
    overall_flat_predictions, overall_flat_true_labels = train_and_predict(df_train, df_eval, overall_flat_predictions, overall_flat_true_labels)
    fold_number = fold_number + 1
elif config.fold == Fold.ProjFold:
  # One fold per entry of project_fold: the listed projects form the test set.
  for k in config_data.project_fold:
    # Select by boolean mask. The previous code passed index *labels* to iloc, which
    # expects positions; after dropna/filtering/shuffling the two no longer coincide.
    in_test_projects = df['ProjectID'].isin(k)
    df_train = df.loc[~in_test_projects]
    df_eval = df.loc[in_test_projects]
    log_text = '/////////////////////// Test-Projects: {} /////////////////////////////'.format(k)
    logLine(log_text)
    overall_flat_predictions, overall_flat_true_labels = train_and_predict(df_train, df_eval, overall_flat_predictions, overall_flat_true_labels)
else:
  # Single stratified train/test split; `classifier` stays available for the save cell.
  df_train, df_eval = train_test_split(df,stratify=df[config_data.label_column], train_size=config.train_size, random_state= config.seed)
  classifier = create_classifier(config, df_train)
  # Train the classifier
  print(classifier.fit_one_cycle(config.epochs, max_lr=config.max_lr, moms=config.moms, wd=config.weight_decay))
  # Predict
  predictor = Predictor(classifier)
  column_index = df_eval.columns.get_loc(config_data.label_column)
  print(column_index)
  for row in progress_bar(df_eval.itertuples(), total=len(df_eval)):
      class_text = row.RequirementText
      # itertuples() puts the index at position 0, hence the +1.
      class_label = row[column_index+1]
      overall_flat_true_labels.append(class_label)
      prediction = predictor.predict(class_text)
      overall_flat_predictions.append(prediction)

      log_text = '{}, {} -> {}'.format(class_text, label_indices.get(class_label), label_indices.get(prediction))
      logLine(log_text)


# Overall report. The label names are derived from the labels that actually occur in
# the accumulated true labels and predictions (as train_and_predict does per fold),
# not from the last fold's evaluation set, which may lack some of them.
test_labels = np.sort(unique_labels(overall_flat_true_labels, overall_flat_predictions))
target_names = [label_indices.get(x) for x in test_labels]

result = classification_report(overall_flat_true_labels, overall_flat_predictions, target_names=target_names, digits = 5)
logResult(result)
print(result)
get_memory_usage_str()


# Save Model

Save the model along with its config

In [0]:
def create_model_name():
    """Return the file stem for the saved model: 'NoRBERT_e<epochs>_<first training file without extension>'.

    The training file is taken from `config_data.train_data`; the previously
    referenced name `data_filenames` is not defined in this notebook.
    """
    import os
    data_stem = os.path.splitext(config_data.train_data[0])[0]
    name = 'NoRBERT_e{epochs}_{data_filename}'.format(epochs=str(config.epochs),data_filename=data_stem)
    return name

def save_config(model_save_path, model_name):
    """Write all settings of `config` as 'name=value,' lines to `<model_save_path><model_name>.config`.

    Args:
        model_save_path: target directory, including the trailing separator.
        model_name: file stem of the model the settings belong to.
    """
    # `config` stores settings both as dict entries and as attributes. A plain
    # attribute assignment (e.g. `config.num_labels = ...`) only updates the attribute,
    # so attribute values take precedence over dict entries of the same name.
    # Previously the names came from __dict__ but the values from the dict.
    current = {**config, **vars(config)}
    settings = ''.join(
        '{item}={value},\n'.format(item=item, value=value)
        for item, value in current.items()
    )
    save_path = model_save_path + model_name + '.config'
    with open(save_path, 'w', encoding='utf-8') as out:
        out.write(settings)



In [0]:
if save_model:
    # Build the file stem and the target directory, then store the settings next to the model.
    model_name = create_model_name()
    model_save_path = config_data.root_folder + config_data.model_path
    save_config(model_save_path, model_name)
    model_save_file = model_save_path + model_name + '.pkl'
    # NOTE(review): `classifier` is bound at module level only by the Fold.No branch of the
    # training cell; with Fold.TenFold / Fold.ProjFold this line presumably raises
    # NameError — confirm and decide which fold's model should be exported.
    classifier.export(file = model_save_file)