# Using BERT Tokenizer and BERT Model from Huggingface

# Aggregated Preprocessing Steps

In [100]:
import pandas as pd

In [101]:
%pip install contractions

Note: you may need to restart the kernel to use updated packages.


In [102]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [103]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [104]:
def remove_html_encodings(data: pd.Series):
  return data.str.replace(r"&#\d+;", " ", regex=True)

In [105]:
def remove_html_tags(data: pd.Series):
  return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

In [106]:
def remove_url(data: pd.Series):
  return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

In [107]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [108]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)


In [109]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [110]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


In [111]:
# remove "-lrb-"
def remove_special_words(data: pd.Series):
  return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)

In [112]:
!pip install transformers



In [113]:
from transformers import BertTokenizer

### Load Data

In [206]:
import os
import numpy as np

In [237]:
files = os.listdir("./data/Sentences/")
prefix_sentence = "./data/Sentences/"
prefix_label = "./data/Labels/"

clauses = []
for file in files:
    sentence_file_path = prefix_sentence + file 
    label_file_path = prefix_label + file
    sentences_df = pd.read_csv(sentence_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentences"]
    label_df = pd.read_csv(label_file_path, sep=" ", header=None)
    label_df.columns = ["label"]
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)
    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], sentences_df], axis=1)
    clauses.append(df_concat)

  return func(*args, **kwargs)


In [238]:
colnames = ["sentences", "label_converted", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in clauses:
    clauses_df = clauses_df.append(df)

In [239]:
clauses_df.rename(columns={'label_converted': 'label', 'sentences': 'sentences', 'document' : 'document'}, inplace=True)


In [240]:
clauses_df.head()

Unnamed: 0,sentences,label,document
0,thanks for sending us good vibes by using the ...,0,Viber.txt
1,"you may be surprised , but we will refer to al...",0,Viber.txt
2,"the terms of use -lrb- or , the `` terms '' -r...",0,Viber.txt
3,the language of the terms will seem legal -lrb...,0,Viber.txt
4,"when you use our services , in addition to enj...",1,Viber.txt


In [242]:
clauses_df.sentences[0]

0    thanks for sending us good vibes by using the ...
0                  version : 06.2017 -lrb- en-eu -rrb-
0    notice to california subscribers : you may can...
0                           posted : december 8 , 2016
0    if you live in -lrb- or your principal place o...
0    when you -lrb- `` you '' -rrb- sign up with be...
0    we 've recently updated our : -lrb- 1 -rrb- te...
0      end user license agreement and terms of service
0    crowdtangle , inc. -lrb- `` crowdtangle , '' `...
0    welcome to the tripadvisor website or mobile p...
0            deliveroo terms and conditions of service
0                        effective date : may 5 , 2014
0                        • terms and conditions of use
0                     effective date : august 1 , 2017
0                     * accepting the terms of service
0    these terms and conditions , as may be amended...
0    these terms & conditions -lrb- these `` terms ...
0                    we recently revised these terms .
0        d

In [234]:
clauses_df.to_csv("./data/data.csv", index=False)

### Clean the Data

In [217]:
# A dictionary containing the columns and a list of functions to perform on it in order
def cleaning(df):
  data_cleaning_pipeline = {
      "sentences": [
          to_lower,
          remove_special_words,
          remove_accented_characters,
          remove_html_encodings,
          remove_html_tags,
          remove_url,
          fix_contractions,
          remove_non_alpha_characters,
          remove_extra_spaces,
      ]
  }

  cleaned_data = df.copy()

  # Process all the cleaning instructions
  for col, pipeline in data_cleaning_pipeline.items():
      # Get the column to perform cleaning on
      temp_data = cleaned_data[col].copy()

      # Perform all the cleaning functions sequencially
      for func in pipeline:
          print(f"Starting: {func.__name__}")
          temp_data = func(temp_data)
          print(f"Ended: {func.__name__}")

      # Replace the old column with cleaned one.
      cleaned_data[col] = temp_data.copy()

  return cleaned_data


In [218]:
cleaned_data = cleaning(clauses_df)


Starting: to_lower
Ended: to_lower
Starting: remove_special_words
Ended: remove_special_words
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [220]:
cleaned_data.to_csv("./data/cleaned.csv", index=False)

### Using BERT Tokenizer and BERT Model to get Embeddings

In [151]:
cls = "[CLS]"
sep = "[SEP]"
pad = "[PAD]"
bert_pad_len = 512

In [152]:
import logging
import torch
import numpy as np
import warnings
from transformers import BertTokenizer, BertModel

In [153]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Functions

In [154]:
def create_tensors_BERT(text):
  """
    Tokenize using BERT Tokenizer for the pd.Series
  """
  print("Tokenizing text...")
  logging.basicConfig(level = logging.INFO)

  # Load the `bert-base-uncased` model
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  # Tokenize every sentence in the pd.Series
  tokenized_text = [tokenizer.tokenize(x) for x in text]

  # Pad the tokens to be used for BERT Model; BERT takes fixed lengend sequence
  tokenized_text = [x + ([pad] * (bert_pad_len - len(x))) for x in tokenized_text]

  # Convert the tokens to their IDs
  indexed_text = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]

  # BERTModel has Q&A format, so setting the context to one for every sentence
  segment_ids = [[1] * len(x) for x in tokenized_text]

  # Convert to tensor
  torch_idx_text = torch.LongTensor(indexed_text)
  torch_seg_ids = torch.LongTensor(segment_ids)
  
  return tokenized_text, torch_idx_text, torch_seg_ids 

In [155]:
#takes in the index and segment tensors and returns the bert embeddings as a list
def get_embeddings(torch_idx_text, torch_seg_ids):
    """
      Create BERT embeddings from tokens
    """
    print("Getting Embeddings...")

    # Load pretrained `bert-base-uncased` model, and set to inference
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
    model.eval()

    torch_idx_text, torch_seg_ids = torch_idx_text.to("cpu"), torch_seg_ids.to("cpu")
    model.to(device)

    # Disable gradient and get BERT embeddings
    with torch.no_grad():
        bert_embeddings = []
        for i in range(len(torch_idx_text)):
            print(i, end = "\r")
            text_temp = torch.unsqueeze(torch_idx_text[i], dim = 0).to(device)
            sgmt_temp = torch.unsqueeze(torch_seg_ids[i], dim = 0).to(device)
            output = model(text_temp, sgmt_temp)
            bert_embeddings.append(output[0])
            del text_temp, sgmt_temp
    del model
  
    return bert_embeddings


Note: As an additional improvement to reduce the dimentionality, we can aggregate expansions of words like - `running` -> "run", "##ing". This is an additional step is not mandatory and can be used when trying out the embeddings.

In [156]:
def embeddings_to_words(tokenized_text, embeddings):
    """
      Clubbing same word tokens to reduce dimensionality
      Note: Need to run this locally and tweak virtual memory, as the colab
      runtime crashes
    """
    print("Untokenizing text and embeddings...")
    embeddings = [x.cpu().detach().numpy() for x in embeddings]
    embeddings = np.concatenate(embeddings, axis = 0)
    sentences = []
    final_emb = []

    # Iterate over every sentence
    for i in range(len(tokenized_text)):
        txt = tokenized_text[i]
        sub_len = 0
        sent = []
        sub = []
        emb = []
        sub_emb = None
        try:
            idx = txt.index(pad)
        except:
            idx = len(txt)
        for j in range(idx):
            # For the token that starts with ## process it; remove ## and
            # club that with previous token;
            # For the embedding, take the average of token's embeddings
            if txt[j].startswith("##"):
                if sub == []:
                    sub.append(sent.pop())
                    emb.pop()
                    sub_emb = embeddings[i][j - 1] + embeddings[i][j]
                    sub.append(txt[j][2:])
                    sub_len = 2
                else:
                    sub.append(txt[j][2:])
                    sub_emb += embeddings[i][j]
                    sub_len += 1
            else:
                if sub != []:
                    sent.append("".join(sub))
                    emb.append(sub_emb / sub_len)
                    sub = []
                    sub_emb = None
                    sub_len = 0
                sent.append(txt[j])
                emb.append(embeddings[i][j])
        sentences.append(sent)
        final_emb.append(emb)
    return sentences, final_emb

### Tokenize and Create Embeddings

In [157]:
tokenized_text, torch_idx_text, torch_seg_ids = create_tensors_BERT(cleaned_data.sentences)

Tokenizing text...


In [158]:
bert_embeddings = get_embeddings(torch_idx_text, torch_seg_ids)


Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


9413

Most computationally time consuming part is to get embeddings

Store the embeddings for using in other models.

In [159]:
import pickle as pkl
embeddings_file_path = "./embeddings/bert_embeddings.pkl"

def save_embeddings(embeddings_file_path, embeddings, tokenized_text):
  with open(embeddings_file_path, mode="wb") as file:
    pkl.dump({"embeddings": embeddings, "tokenized_txt": tokenized_text}, file, protocol=pkl.HIGHEST_PROTOCOL)

In [160]:
save_embeddings(embeddings_file_path, bert_embeddings, tokenized_text)

In [None]:
# text, bert = embeddings_to_words(tokenized_text[:10], bert_embeddings[:10])


Untokenizing text and embeddings...


In [161]:
len(bert_embeddings), bert_embeddings[0].shape

(9414, torch.Size([1, 512, 768]))

### Using Avg Embeddings

In [162]:
bert_embeddings = torch.cat(bert_embeddings)

In [163]:
avg_embeddings = torch.sum(bert_embeddings, dim=1) / 512

In [164]:
avg_embeddings.shape

torch.Size([9414, 768])

In [166]:
avg_embeddings_file_path = "./embeddings/avg_embeddings.pkl"

with open(avg_embeddings_file_path, mode="wb") as file:
    import pickle as pkl

    pkl.dump(avg_embeddings.numpy(), file, protocol=pkl.HIGHEST_PROTOCOL)

### Simple SVM

#### Split the Data

In [182]:
avg_embeddings = avg_embeddings.numpy()

AttributeError: 'numpy.ndarray' object has no attribute 'numpy'

In [221]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    avg_embeddings,
    cleaned_data['label'],
    train_size=0.8, 
    test_size=0.2, 
    random_state=0, 
    shuffle=True,
    stratify=cleaned_data['label']
)

In [222]:
RANDOM_STATE = 42

In [223]:
from sklearn.svm import SVC

clf = SVC(gamma="auto", class_weight="balanced", random_state=RANDOM_STATE)


In [224]:
clf.fit(X_train, y_train)

ValueError: Unknown label type: 'unknown'

In [87]:
from sklearn.metrics import classification_report

In [88]:
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           0       0.95      0.71      0.82      6705
           1       0.23      0.70      0.35       826

    accuracy                           0.71      7531
   macro avg       0.59      0.71      0.58      7531
weighted avg       0.87      0.71      0.76      7531



In [89]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.73      0.82      1677
           1       0.24      0.69      0.35       206

    accuracy                           0.72      1883
   macro avg       0.59      0.71      0.59      1883
weighted avg       0.87      0.72      0.77      1883



### SVM with LeaveOneGroupOut

In [93]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

In [94]:
logo = LeaveOneGroupOut()

X = avg_embeddings
y = cleaned_data['label']
group = cleaned_data['document']


logo.get_n_splits(X, y, group)


In [None]:
scores = []
for train_idx, test_idx in logo.split(X, y, group):
    for train, test in train_val_test:
        X_train, y_train = X[train_idx], y[train_idx].astype(np.int8)
        X_test, y_test = X[test_idx], y[test_idx].astype(np.int8)
        train_groups = cleaned_data.iloc[train_idx]["document"]

        svm = SVC(gamma="auto", max_iter = 5000, class_weights="balanced", random_state=RANDOM_STATE)
        clf = GridSearchCV(estimator=svm,n_jobs=-1, scoring = 'f1_macro', refit = True)
        clf.fit(X_train, y_train, groups = train_groups)
        score = clf.score(X_test, y_test)
        scores.append(score)
average_test_f1_score = sum(scores)/len(scores)

In [None]:
print(f"Average F1 score from Leave one out document : {average_test_f1_score}")