# Using RoBERTa Tokenizer and RoBERTa Model from Huggingface

# Aggregated Preprocessing Steps

In [25]:
import pandas as pd
import numpy as np

In [2]:
!pip install contractions



In [3]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [4]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [5]:
def remove_html_encodings(data: pd.Series):
  return data.str.replace(r"&#\d+;", " ", regex=True)

In [6]:
def remove_html_tags(data: pd.Series):
  return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

In [7]:
def remove_url(data: pd.Series):
  return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

In [8]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [9]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)


In [10]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [11]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


In [12]:
# remove "-lrb-"
def remove_special_words(data: pd.Series):
  return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)

In [13]:
!pip install transformers



In [14]:
from transformers import BertTokenizer

### Load Data

In [15]:
train_dataset_path = "./data/tos_clauses_train.csv"
test_dataset_path = "./data/tos_clauses_dev.csv"

In [16]:
train_df = pd.read_csv(train_dataset_path, header=0)
test_df = pd.read_csv(test_dataset_path, header=0)

In [17]:
train_df.head()

Unnamed: 0,label,sentences
0,0,content license and intellectual property rights
1,0,reactivated skype credit is not refundable .
2,1,spotify may change the price for the paid subs...
3,0,the term of your licenses under this eula shal...
4,0,the arbitrator may award declaratory or injunc...


### Clean the Data

In [18]:
# A dictionary containing the columns and a list of functions to perform on it in order
def cleaning(df):
  data_cleaning_pipeline = {
      "sentences": [
          to_lower,
          remove_special_words,
          remove_accented_characters,
          remove_html_encodings,
          remove_html_tags,
          remove_url,
          fix_contractions,
          remove_non_alpha_characters,
          remove_extra_spaces,
      ]
  }

  cleaned_data = df.copy()

  # Process all the cleaning instructions
  for col, pipeline in data_cleaning_pipeline.items():
      # Get the column to perform cleaning on
      temp_data = cleaned_data[col].copy()

      # Perform all the cleaning functions sequencially
      for func in pipeline:
          print(f"Starting: {func.__name__}")
          temp_data = func(temp_data)
          print(f"Ended: {func.__name__}")

      # Replace the old column with cleaned one.
      cleaned_data[col] = temp_data.copy()

  return cleaned_data


In [19]:
train_df = cleaning(train_df)
test_df = cleaning(test_df)

train_df.head(), test_df.head()

Starting: to_lower
Ended: to_lower
Starting: remove_special_words
Ended: remove_special_words
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces
Starting: to_lower
Ended: to_lower
Starting: remove_special_words
Ended: remove_special_words
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_encodings
Ended: remove_html_encodings
Starting: remove_html_tags
Ended: remove_html_tags
Starting: remove_url
Ended: remove_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


(   label                                          sentences
 0      0   content license and intellectual property rights
 1      0        reactivated skype credit is not refundable 
 2      1   spotify may change the price for the paid sub...
 3      0   the term of your licenses under this eula sha...
 4      0   the arbitrator may award declaratory or injun...,
    label                                          sentences
 0      0   uber reserves the right to withhold or deduct...
 1      0   niantic s failure to enforce any right or pro...
 2      0   14 3 if you feel that any member you interact...
 3      0   blizzard entertainment has the right to obtai...
 4      0   myfitnesspal does not lrb i rrb guarantee the...)

### Using BERT Tokenizer and BERT Model to get Embeddings

In [20]:
cls = "[CLS]"
sep = "[SEP]"
pad = "[PAD]"
bert_pad_len = 512

In [27]:
import logging
import torch
import numpy as np
import warnings
from transformers import RobertaTokenizer, RobertaModel

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Functions

In [29]:
def create_tensors_ROBERTA(text):
  """
    Tokenize using ROBERTA Tokenizer for the pd.Series
  """
  print("Tokenizing text...")
  logging.basicConfig(level = logging.INFO)

  # Load the `bert-base-uncased` model
  tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

  # Tokenize every sentence in the pd.Series
  tokenized_text = [tokenizer.tokenize(x) for x in text]

  # Pad the tokens to be used for BERT Model; BERT takes fixed lengend sequence
  tokenized_text = [x + ([pad] * (bert_pad_len - len(x))) for x in tokenized_text]

  # Convert the tokens to their IDs
  indexed_text = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]

  # BERTModel has Q&A format, so setting the context to one for every sentence
  segment_ids = [[1] * len(x) for x in tokenized_text]

  # Convert to tensor
  torch_idx_text = torch.LongTensor(indexed_text)
  torch_seg_ids = torch.LongTensor(segment_ids)
  
  return tokenized_text, torch_idx_text, torch_seg_ids     

In [30]:
#takes in the index and segment tensors and returns the bert embeddings as a list
def get_embeddings(torch_idx_text, torch_seg_ids):
    """
      Create RoBERTa embeddings from tokens
    """
    print("Getting Embeddings...")

    # Load pretrained `roberta-base` model, and set to inference
    model = RobertaModel.from_pretrained('roberta-base', output_hidden_states = True)
    model.eval()

    torch_idx_text, torch_seg_ids = torch_idx_text.to("cpu"), torch_seg_ids.to("cpu")
    model.to(device)

    # Disable gradient and get BERT embeddings
    with torch.no_grad():
        roberta_embeddings = []
        for i in range(len(torch_idx_text)):
            print(i, end = "\r")
            text_temp = torch.unsqueeze(torch_idx_text[i], dim = 0).to(device)
            sgmt_temp = torch.unsqueeze(torch_seg_ids[i], dim = 0).to(device)
            output = model(text_temp, sgmt_temp)
            roberta_embeddings.append(output[0])
            del text_temp, sgmt_temp
    del model
    return roberta_embeddings

Note: As an additional improvement to reduce the dimentionality, we can aggregate expansions of words like - `running` -> "run", "##ing". This is an additional step is not mandatory and can be used when trying out the embeddings.

In [31]:
def embeddings_to_words(tokenized_text, embeddings):
    """
      Clubbing same word tokens to reduce dimensionality
      Note: Need to run this locally and tweak virtual memory, as the colab
      runtime crashes
    """
    print("Untokenizing text and embeddings...")
    embeddings = [x.cpu().detach().numpy() for x in embeddings]
    embeddings = np.concatenate(embeddings, axis = 0)
    sentences = []
    final_emb = []

    # Iterate over every sentence
    for i in range(len(tokenized_text)):
        txt = tokenized_text[i]
        sub_len = 0
        sent = []
        sub = []
        emb = []
        sub_emb = None
        try:
            idx = txt.index(pad)
        except:
            idx = len(txt)
        for j in range(idx):
            # For the token that starts with ## process it; remove ## and
            # club that with previous token;
            # For the embedding, take the average of token's embeddings
            if txt[j].startswith("##"):
                if sub == []:
                    sub.append(sent.pop())
                    emb.pop()
                    sub_emb = embeddings[i][j - 1] + embeddings[i][j]
                    sub.append(txt[j][2:])
                    sub_len = 2
                else:
                    sub.append(txt[j][2:])
                    sub_emb += embeddings[i][j]
                    sub_len += 1
            else:
                if sub != []:
                    sent.append("".join(sub))
                    emb.append(sub_emb / sub_len)
                    sub = []
                    sub_emb = None
                    sub_len = 0
                sent.append(txt[j])
                emb.append(embeddings[i][j])
        sentences.append(sent)
        final_emb.append(emb)
    return sentences, final_emb

### Tokenize and Create Embeddings

In [32]:
train_tokenized_text, train_torch_idx_text, train_torch_seg_ids = create_tensors_ROBERTA(train_df.sentences)
test_tokenized_text, test_torch_idx_text, test_torch_seg_ids = create_tensors_ROBERTA(test_df.sentences)

Tokenizing text...


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizing text...


In [33]:
train_roberta_embeddings = get_embeddings(train_torch_idx_text, train_torch_seg_ids)
test_roberta_embeddings = get_embeddings(test_torch_idx_text, test_torch_seg_ids)


Getting Embeddings...


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting Embeddings...


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1882

Most computationally time consuming part is to get embeddings

Store the embeddings for using in other models.

In [36]:
import pickle as pkl
train_embeddings_file_path = "./data/train_roberta_embeddings.pkl"
test_embeddings_file_path = "./data/test_roberta_embeddings.pkl"

def save_embeddings(embeddings_file_path, embeddings, tokenized_text):
  with open(embeddings_file_path, mode="wb") as file:
    pkl.dump({"embeddings": embeddings, "tokenized_txt": tokenized_text}, file, protocol=pkl.HIGHEST_PROTOCOL)

In [38]:
save_embeddings(train_embeddings_file_path, train_roberta_embeddings, train_tokenized_text)
save_embeddings(test_embeddings_file_path, test_roberta_embeddings, test_tokenized_text)

In [None]:
# text, bert = embeddings_to_words(tokenized_text[:10], bert_embeddings[:10])


Untokenizing text and embeddings...


In [39]:
len(train_roberta_embeddings), train_roberta_embeddings[0].shape

(7531, torch.Size([1, 512, 768]))

In [44]:
train_roberta_embeddings = torch.cat(train_roberta_embeddings)
test_roberta_embeddings = torch.cat(test_roberta_embeddings)

In [46]:
train_avg_embeddings = torch.sum(train_roberta_embeddings, dim=1) / 512
test_avg_embeddings = torch.sum(test_roberta_embeddings, dim=1) / 512

In [47]:
train_avg_embeddings.shape, test_avg_embeddings.shape

(torch.Size([7531, 768]), torch.Size([1883, 768]))

In [48]:
type(train_avg_embeddings.numpy()[0])

numpy.ndarray

In [21]:
train_avg_embeddings_file_path = "./embeddings/train_roberta_avg_embeddings.pkl"
test_avg_embeddings_file_path = "./embeddings/test_roberta_avg_embeddings.pkl"


In [50]:
with open(train_avg_embeddings_file_path, mode="wb") as train_file, \
    open(test_avg_embeddings_file_path, mode="wb") as test_file:
    import pickle as pkl

    pkl.dump(train_avg_embeddings.numpy(), train_file, protocol=pkl.HIGHEST_PROTOCOL)
    pkl.dump(test_avg_embeddings.numpy(), test_file, protocol=pkl.HIGHEST_PROTOCOL)

In [22]:
with open(train_avg_embeddings_file_path, mode="rb") as train_file, \
    open(test_avg_embeddings_file_path, mode="rb") as test_file:

    import pickle as pkl

    X_train = pkl.load(train_file)
    y_train = train_df.label.to_list()

    X_test = pkl.load(test_file)
    y_test = test_df.label.to_list()

### Simple SVM

In [35]:
RANDOM_STATE = 42

In [36]:
from sklearn.svm import SVC

clf = SVC(gamma="auto", class_weight="balanced", random_state=RANDOM_STATE)


In [54]:
clf.fit(X_train, train_df.label)

In [55]:
from sklearn.metrics import classification_report

In [56]:
print(classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

           0       0.94      0.63      0.75      6705
           1       0.18      0.68      0.29       826

    accuracy                           0.63      7531
   macro avg       0.56      0.65      0.52      7531
weighted avg       0.86      0.63      0.70      7531



In [57]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.63      0.75      1677
           1       0.18      0.67      0.28       206

    accuracy                           0.63      1883
   macro avg       0.56      0.65      0.52      1883
weighted avg       0.86      0.63      0.70      1883



### SVM with LeaveOneGroupOut

In [26]:
files = os.listdir("./data/Sentences/")
prefix_sentence = "./data/Sentences/"
prefix_label = "./data/Labels/"

clauses = []
for file in files:
    sentence_file_path = prefix_sentence + file 
    label_file_path = prefix_label + file
    sentences_df = pd.read_csv(sentence_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentences"]
    label_df = pd.read_csv(label_file_path, sep=" ", header=None)
    label_df.columns = ["label"]
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)
    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], sentences_df], axis=1)
    clauses.append(df_concat)

  return func(*args, **kwargs)


In [27]:
colnames = ["sentences", "label_converted", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in clauses:
    clauses_df = clauses_df.append(df)

In [28]:
clauses_df.rename(columns={'label_converted': 'label', 'sentences': 'sentences', 'document' : 'document'}, inplace=True)


In [29]:
clauses_df.head()

Unnamed: 0,sentences,label,document
0,thanks for sending us good vibes by using the ...,0,Viber.txt
1,"you may be surprised , but we will refer to al...",0,Viber.txt
2,"the terms of use -lrb- or , the `` terms '' -r...",0,Viber.txt
3,the language of the terms will seem legal -lrb...,0,Viber.txt
4,"when you use our services , in addition to enj...",1,Viber.txt


In [30]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score


In [31]:
logo = LeaveOneGroupOut()

X = np.concatenate([X_train, X_test])
y = np.array(y_train + y_test)
group = clauses_df["document"]

logo.get_n_splits(X, y, group)


50

In [32]:
X[np.array([1, 2, 3])]

array([[ 0.20115653,  0.19506606, -0.17185788, ...,  0.13769725,
        -0.05337823,  0.03994849],
       [ 0.11321874,  0.1895015 , -0.15294266, ..., -0.00742536,
        -0.09807333, -0.01897632],
       [ 0.17579886,  0.15692976, -0.18851504, ...,  0.1078179 ,
        -0.07142051, -0.1317076 ]], dtype=float32)

In [33]:
y.shape

(9414,)

In [37]:
scores = []

for train_idx, test_idx in logo.split(X, y, group):
    X_train, y_train = X[train_idx], y[train_idx].astype(np.int8)
    X_test, y_test = X[test_idx], y[test_idx].astype(np.int8)
    train_groups = clauses_df.iloc[train_idx]["document"]

    svm = SVC(gamma="auto", class_weight="balanced", random_state=RANDOM_STATE)
    clf = GridSearchCV(estimator=svm, param_grid={}, n_jobs=-1, scoring = 'f1_macro', refit = True)
    clf.fit(X_train, y_train, groups = train_groups)
    score = clf.score(X_test, y_test)
    scores.append(score)
    print(score)

average_test_f1_score = sum(scores)/len(scores)

0.43131544275709033
0.47779162739792663
0.4716216216216217
0.5279865722834416
0.53125
0.4649621212121212
0.5139114934511353
0.5027154663518301
0.5369670804101457
0.44597069597069594
0.49139419927342026
0.507313029760154
0.5449329873592497
0.5575757575757576
0.5413292589763178
0.5272331154684096
0.5283283283283282
0.5091418385536033
0.5216359918200408
0.4911258729347641
0.4068517516046709
0.5325337837837838
0.5568181818181817
0.5057471264367817
0.5231907723308898
0.5327981801876682
0.4857571214392804
0.5609046126287505
0.5254901960784314
0.4930014718062698
0.522203947368421
0.5359942979330008
0.5345270440811842
0.5430555555555556
0.5818181818181818
0.4887329931972789
0.44376327043272823
0.5017942583732057
0.5803066989507667
0.546791569086651
0.5250930560903606
0.5377031468059245
0.5761489070052666
0.5103264490339774
0.49831862126944093
0.5537634408602151
0.506218487394958
0.5148185148185148
0.4899738448125545
0.5323877596604869


In [38]:
print(f"Average F1 score from Leave one out document : {average_test_f1_score}")

Average F1 score from Leave one out document : 0.5154267148833886
