### **SET-UP PORTION: Ignore Most of This**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from collections import defaultdict, Counter
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
import torch
from transformers import BertConfig, BertModel
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
from datasets import load_dataset
import math
from pathlib import Path
import pytorch_lightning as pl
from torch import nn
from torch.nn import functional as F
import ast
import itertools
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import BertForTokenClassification
from torch.optim import SGD

In [None]:
def process_list(s):
    lst = ast.literal_eval(s)
    return ' '.join(lst)

def process_text(s):
    lst = ast.literal_eval(s)
    return "".join([" "+i if not i.startswith(("'", ",", ".", ":", ";", "!", "?")) else i for i in lst]).strip()

In [None]:
osd_df = pd.read_csv('/content/drive/MyDrive/CS224U/OSD_identification.csv')
osd_df.head(10)

Unnamed: 0,word_id,word,definitions,usages,tags
0,['1'],['1'],"""goodbye""","[['i', ""'ll"", 'talk', 'to', 'you', 'later', '....","[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']..."
1,['10'],['10'],"A person who's really hot, even more than others","[['she', 'was', 'only', 'the', '10', 'in', 'th...","[['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O']]"
2,['101'],['101'],a beginner's course,"[['my', 'boyfriend', 'needs', 'to', 're-take',...","[['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']]"
3,"['101-scoop', ',', '-the']","['101', 'scoop', ',', 'the']","the real information or the proof, instruction...","[['i', 'got', 'the', '101', 'scoop', 'about', ...","[['O', 'O', 'B', 'I', 'I', 'O', 'B', 'O', 'O',..."
4,['10-4'],"['10', '4']","""message understood","[['i', 'was', 'walking', 'up', 'to', 'this', '...","[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',..."
5,['11'],['11'],an extremely attractive person - more attracti...,"[['dude', ',', 'you', 'have', 'no', 'chance', ...","[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',..."
6,['110-percent'],"['110', 'percent']","110%- indicating a statement is true, beyond a...","[['sports', 'team', 'coach', ':', 'i', 'want',...","[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',..."
7,['1337'],['1337'],"""elite""","[['man', ',', 'that', ""'s"", 'one', '1337', 'co...","[['O', 'O', 'O', 'O', 'O', 'B', 'O', 'O'], ['O..."
8,['133t'],['133t'],"""elite","[['that', 'a', '133t', 'bike', '.'], ['that', ...","[['O', 'O', 'B', 'O', 'O'], ['O', 'O', 'O', 'B..."
9,['13th-step'],"['13th', 'step']",To have sex with a fellow addict in a 12-step ...,"[['my', 'sponsor', 'said', 'that', 'a', 'prope...","[['O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O',..."


### **Old Implementation for OSD concatenate with USD for NER purposes**

In [None]:
def process_col(s):
    lst = ast.literal_eval(s)
    return [' '.join(sublist) for sublist in lst]

In [None]:
osd_df['usages'] = osd_df['usages'].apply(process_col)
osd_df['tags'] = osd_df['tags'].apply(process_col)

In [None]:
osd_df.head(10)

Unnamed: 0,word_id,word,definitions,usages,tags
0,['1'],['1'],"""goodbye""","[i 'll talk to you later . 1 ., see you later ...","[O O O O O O O B O, O O O O B O, O B O O O O, ..."
1,['10'],['10'],"A person who's really hot, even more than others",[she was only the 10 in the crowd .],[O O O O B O O O O]
2,['101'],['101'],a beginner's course,[my boyfriend needs to re-take sex 101 .],[O O O O O O B O]
3,"['101-scoop', ',', '-the']","['101', 'scoop', ',', 'the']","the real information or the proof, instruction...",[i got the 101 scoop about the crime ( or prod...,[O O B I I O B O O O O O O B O O O]
4,['10-4'],"['10', '4']","""message understood",[i was walking up to this girl to talk to her ...,"[O O O O O O O O O O O O O O O O O B O, O O O ..."
5,['11'],['11'],an extremely attractive person - more attracti...,"[dude , you have no chance with her . she 's l...",[O O O O O O O O O O O O O O B O]
6,['110-percent'],"['110', 'percent']","110%- indicating a statement is true, beyond a...",[sports team coach : i want each and every one...,"[O O O O O O O O O O O O O O O B O O O O, O O ..."
7,['1337'],['1337'],"""elite""","[man , that 's one 1337 computer !, i like to ...","[O O O O O B O O, O O O O O B O]"
8,['133t'],['133t'],"""elite","[that a 133t bike ., that kid got 133t skillz ...","[O O B O O, O O O B O O O O O]"
9,['13th-step'],"['13th', 'step']",To have sex with a fellow addict in a 12-step ...,[my sponsor said that a proper 13th step requi...,[O O O O O O B I O O O O O O O O O O O O O O O...


In [None]:
usages_flat = list(itertools.chain.from_iterable(osd_df['usages']))
tags_flat = list(itertools.chain.from_iterable(osd_df['tags']))
new_df = pd.DataFrame({'usage': usages_flat, 'tags': tags_flat})

In [None]:
new_df.head(10)

Unnamed: 0,usage,tags
0,i 'll talk to you later . 1 .,O O O O O O O B O
1,"see you later , 1 .",O O O O B O
2,this 1 is done for .,O B O O O O
3,we are going to get this 1 .,O O O O O O B O
4,watching this 1 .,O O B O
5,you are the 1 ( one ) .,O O O B O O O O
6,this 1 better watch their back .,O B O O O O O
7,she was only the 10 in the crowd .,O O O O B O O O O
8,my boyfriend needs to re-take sex 101 .,O O O O O O B O
9,i got the 101 scoop about the crime ( or produ...,O O B I I O B O O O O O O B O O O


In [None]:
len(new_df)

16569

### **New Implementation for OSD to concatenate with UD**

In [None]:
def process_col_v2(df):
    records = []
    for i, row in df.iterrows():
        usages = ast.literal_eval(row['usages'])
        tags = ast.literal_eval(row['tags'])
        for u, t in zip(usages, tags):
            usage_str = ' '.join(u) # join each list of words into a single string
            tag_str = ' '.join(t) # join each list of words into a single string
            records.append({
                'word': row['word'],
                'definitions': row['definitions'],
                'usage': usage_str,
                'tag': tag_str
            })
    return pd.DataFrame(records)

In [None]:
new_df = process_col_v2(osd_df)

ValueError: ignored

In [None]:
new_df = new_df.rename(columns={'definitions': 'definition', 'tag': 'tags'})

In [None]:
new_df.head(10)

In [None]:
len(new_df)

In [None]:
new_df.to_csv('/content/drive/MyDrive/CS224U/processed_OSD_dataset.csv')

### **New Implementation for UD Dataset to Concatenate with OSD**

In [None]:
ud_df = pd.read_csv('/content/drive/MyDrive/CS224U/UD_identification.csv')
ud_copy = ud_df.copy()
ud_copy.head(10)

In [None]:
ud_copy['tags'] = ud_copy['tags'].apply(process_list)
ud_copy['usage'] = ud_copy['usage'].apply(process_text)

In [None]:
ud_copy = ud_copy.drop(columns=['word_id', 'up_votes', 'down_votes'])

In [None]:
ud_copy.head(10)

In [None]:
ud_copy.to_csv('/content/drive/MyDrive/CS224U/processed_UD_dataset.csv')

In [None]:
new_final_df = pd.concat([new_df, ud_copy], ignore_index=True)

In [None]:
len(new_final_df)

In [None]:
file_path = '/content/drive/MyDrive/CS224U/new_combined_data.csv' 
new_final_df.to_csv(file_path, index=False)

### **Old Implementation for UD Dataset Preprocessing**

In [None]:
ud_df = pd.read_csv('/content/drive/MyDrive/CS224U/UD_identification.csv')
ud_df.head(10)

Unnamed: 0,word_id,word,up_votes,down_votes,definition,usage,tags
0,7,['janky'],296,255,Undesirable; less-than optimum.,"['this', 'janky', 'shirt', 'she', 'gave', 'me'...","['O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,17,['wtf'],183,99,what the fuck? ;; use it in place of expletive...,"['wtf', '?', 'whoth', '?', 'whentf', '?', 'wts...","['B', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
2,19,['hazy'],272,184,A guys state of mind after he sees the girl of...,"['fuckin', 'hazy', 'again', '!', '!', '!', '!'...","['O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,32,['ducket'],481,272,a one dollar bill. $1. ;; equivalent to one hu...,"['fucking', 'ducket', 'again', '..']","['O', 'B', 'O', 'O']"
4,47,['puke'],138,109,to vomit,"['this', 'crappy', 'movie', 'made', 'johnny', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']"
5,48,['folks'],79,59,"p. noun: People, not necessarily related, to w...","['so', 'are', 'your', 'folks', 'coming', 'over...","['O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', ..."
6,50,['dog'],1303,915,"n. friend of the same sex, usually male. Deriv...","['``', 'definition', 'of', 'a', 'dog', ':', 'n...","['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', ..."
7,54,['raunchy'],230,195,"distasteful, obscene, and or just plain gross","['that', 'was', 'a', 'very', 'raunchy', 'movie...","['O', 'O', 'O', 'O', 'B', 'O', 'O']"
8,55,['energy'],127,52,"can be converted from one form to another, but...","['i', 'so', 'munch', 'energy', 'i', 'can', 'go...","['O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O']"
9,61,['rental'],20,9,means of transportaion that damage is totally ...,"['my', 'mom', 'could', ""n't"", 'afford', 'to', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [None]:
ud_copy = ud_df.copy()
ud_copy.head(10)

Unnamed: 0,word_id,word,up_votes,down_votes,definition,usage,tags
0,7,['janky'],296,255,Undesirable; less-than optimum.,"['this', 'janky', 'shirt', 'she', 'gave', 'me'...","['O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,17,['wtf'],183,99,what the fuck? ;; use it in place of expletive...,"['wtf', '?', 'whoth', '?', 'whentf', '?', 'wts...","['B', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
2,19,['hazy'],272,184,A guys state of mind after he sees the girl of...,"['fuckin', 'hazy', 'again', '!', '!', '!', '!'...","['O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,32,['ducket'],481,272,a one dollar bill. $1. ;; equivalent to one hu...,"['fucking', 'ducket', 'again', '..']","['O', 'B', 'O', 'O']"
4,47,['puke'],138,109,to vomit,"['this', 'crappy', 'movie', 'made', 'johnny', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']"
5,48,['folks'],79,59,"p. noun: People, not necessarily related, to w...","['so', 'are', 'your', 'folks', 'coming', 'over...","['O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', ..."
6,50,['dog'],1303,915,"n. friend of the same sex, usually male. Deriv...","['``', 'definition', 'of', 'a', 'dog', ':', 'n...","['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', ..."
7,54,['raunchy'],230,195,"distasteful, obscene, and or just plain gross","['that', 'was', 'a', 'very', 'raunchy', 'movie...","['O', 'O', 'O', 'O', 'B', 'O', 'O']"
8,55,['energy'],127,52,"can be converted from one form to another, but...","['i', 'so', 'munch', 'energy', 'i', 'can', 'go...","['O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O']"
9,61,['rental'],20,9,means of transportaion that damage is totally ...,"['my', 'mom', 'could', ""n't"", 'afford', 'to', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [None]:
ud_copy['tags'] = ud_copy['tags'].apply(process_list)

In [None]:
ud_copy['usage'] = ud_copy['usage'].apply(process_text)

In [None]:
ud_copy = ud_copy.drop(columns=['word_id', 'word', 'up_votes', 'down_votes', 'definition'])

In [None]:
ud_copy.head(10)

Unnamed: 0,usage,tags
0,this janky shirt she gave me is falling apart.,O B O O O O O O O O
1,wtf? whoth? whentf? wts?,B O O O O O O O
2,fuckin hazy again!!!!!!!!!!!!!,O B O O O O O O O O O O O O O O
3,fucking ducket again..,O B O O
4,this crappy movie made johnny want to puke.,O O O O O O O B O
5,"so are your folks coming over for easter, or a...",O O O B O O O O O O O O O O O O O O
6,`` definition of a dog: not a cat'' = baldrick...,O O O O B O O O O O O O O O O O O O O
7,that was a very raunchy movie.,O O O O B O O
8,i so munch energy i can go all night,O O O B O O O O O
9,my mom could n't afford to buy an air conditio...,O O O O O O O O O O O O O O O O O O O O O O O ...


In [None]:
combined_df = pd.concat([ud_copy, new_df])
combined_df.reset_index(drop=True, inplace=True)

In [None]:
combined_df.rename(columns={'usage': 'text'}, inplace=True)
combined_df.rename(columns={'tags': 'labels'}, inplace=True)

In [None]:
combined_df.head(10)

Unnamed: 0,text,labels
0,this janky shirt she gave me is falling apart.,O B O O O O O O O O
1,wtf? whoth? whentf? wts?,B O O O O O O O
2,fuckin hazy again!!!!!!!!!!!!!,O B O O O O O O O O O O O O O O
3,fucking ducket again..,O B O O
4,this crappy movie made johnny want to puke.,O O O O O O O B O
5,"so are your folks coming over for easter, or a...",O O O B O O O O O O O O O O O O O O
6,`` definition of a dog: not a cat'' = baldrick...,O O O O B O O O O O O O O O O O O O O
7,that was a very raunchy movie.,O O O O B O O
8,i so munch energy i can go all night,O O O B O O O O O
9,my mom could n't afford to buy an air conditio...,O O O O O O O O O O O O O O O O O O O O O O O ...


In [None]:
combined_df.to_csv('/content/drive/MyDrive/CS224U/combined_ID_dataset.csv')

In [None]:
labels = [i.split() for i in combined_df['labels'].values.tolist()]

unique_labels = set()
for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]

print(unique_labels)
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v,k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'I', 'B', 'O'}
{'B': 0, 'I': 1, 'O': 2}


In [None]:
text = combined_df['text'].values.tolist()
example = text[34]
print(example)

the unit of value and account in japan. since japan's adoption of the gold standard, in 1897, the value of the yen has been about 50 cents. the yen is equal to 100 sen.


In [None]:
from transformers import BertTokenizerFast
tokenizer=BertTokenizerFast.from_pretrained('bert-base-uncased')
text_tokenized = tokenizer(example, padding='max_length', max_length=512, truncation=True, return_tensors="pt")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
print(text_tokenized)

{'input_ids': tensor([[  101,  1996,  3131,  1997,  3643,  1998,  4070,  1999,  2900,  1012,
          2144,  2900,  1005,  1055,  9886,  1997,  1996,  2751,  3115,  1010,
          1999,  6347,  1010,  1996,  3643,  1997,  1996, 18371,  2038,  2042,
          2055,  2753, 16653,  1012,  1996, 18371,  2003,  5020,  2000,  2531,
         12411,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [None]:
print(tokenizer.decode(text_tokenized.input_ids[0]))

[CLS] the unit of value and account in japan. since japan's adoption of the gold standard, in 1897, the value of the yen has been about 50 cents. the yen is equal to 100 sen. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [None]:
word_ids = text_tokenized.word_ids()
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))
print(word_ids)


['[CLS]', 'the', 'unit', 'of', 'value', 'and', 'account', 'in', 'japan', '.', 'since', 'japan', "'", 's', 'adoption', 'of', 'the', 'gold', 'standard', ',', 'in', '1897', ',', 'the', 'value', 'of', 'the', 'yen', 'has', 'been', 'about', '50', 'cents', '.', 'the', 'yen', 'is', 'equal', 'to', '100', 'sen', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [None]:
def align_label_example(tokenized_input, labels):

        word_ids = tokenized_input.word_ids()

        previous_word_idx = None
        label_ids = []
   
        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(-100)
                
            elif word_idx != previous_word_idx:
                try:
                  label_ids.append(labels_to_ids[labels[word_idx]])
                except:
                  label_ids.append(-100)
        
            else:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
      

        return label_ids

In [None]:
label = labels[36]
label_all_tokens = False

new_label = align_label_example(text_tokenized, label)
print(new_label)
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))

[-100, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 

### **Fetching Data in A Batch**

In [None]:
import torch

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [None]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [None]:
import numpy as np
df_train, df_val, df_test = np.split(combined_df.sample(frac=1, random_state=42),
                            [int(.8 * len(combined_df)), int(.9 * len(combined_df))])

In [None]:
print(len(df_train))
print(len(df_val))
print(len(df_test))

73906
9238
9239


### **MODEL BUILDING: The Actually Important Part**

In [None]:
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [None]:
def train_loop(model, df_train, df_val):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2

model = BertModel()
train_loop(model, df_train, df_val)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

Epochs: 1 | Loss:  0.206 | Accuracy:  0.924 | Val_Loss:  0.164 | Accuracy:  0.939


100%|██████████| 36953/36953 [32:18<00:00, 19.06it/s]


Epochs: 2 | Loss:  0.154 | Accuracy:  0.943 | Val_Loss:  0.149 | Accuracy:  0.947


100%|██████████| 36953/36953 [32:18<00:00, 19.06it/s]


Epochs: 3 | Loss:  0.129 | Accuracy:  0.953 | Val_Loss:  0.143 | Accuracy:  0.949


100%|██████████| 36953/36953 [32:16<00:00, 19.08it/s]


Epochs: 4 | Loss:  0.111 | Accuracy:  0.961 | Val_Loss:  0.152 | Accuracy:  0.952


100%|██████████| 36953/36953 [32:21<00:00, 19.04it/s]


Epochs: 5 | Loss:  0.096 | Accuracy:  0.966 | Val_Loss:  0.146 | Accuracy:  0.953


In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/CS224U/slang_ID_model.pth")

In [None]:
model = BertModel()
model.load_state_dict(torch.load('/content/drive/MyDrive/CS224U/slang_ID_model.pth'))
model.eval()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

BertModel(
  (bert): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [None]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')

In [None]:
evaluate(model, df_test)

Test Accuracy:  0.953


In [None]:
def evaluate_f1(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    all_predictions = []
    all_labels = []

    for test_data, test_label in test_dataloader:

        test_label = test_label.to(device)
        mask = test_data['attention_mask'].squeeze(1).to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)

        _, logits = model(input_id, mask, test_label)

        for i in range(logits.shape[0]):

          logits_clean = logits[i][test_label[i] != -100]
          label_clean = test_label[i][test_label[i] != -100]

          predictions = logits_clean.argmax(dim=1)
          all_predictions.extend(predictions.cpu().numpy())
          all_labels.extend(label_clean.cpu().numpy())

    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')

    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1-Score: {f1:.3f}')

In [None]:
evaluate_f1(model, df_test)

Precision: 0.826
Recall: 0.773
F1-Score: 0.798


### **EVALUATE F1 FOR BASELINE MODELS**

In [None]:
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import precision_recall_fscore_support
import torch

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
base_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels)).eval()

def evaluate_f1_baseline(model, df_test):
    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    all_predictions = []
    all_labels = []
    for test_data, test_label in test_dataloader:
        mask = test_data['attention_mask'].squeeze(1).to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)
        test_label = test_label.to(device)
        with torch.no_grad(): 
            outputs = model(input_ids=input_id, attention_mask=mask, labels=test_label)
        predictions = torch.argmax(outputs.logits, dim=-1)
        for i in range(predictions.shape[0]):
            label_clean = test_label[i][test_label[i] != -100]
            all_predictions.extend(predictions[i, :len(label_clean)].cpu().numpy())
            all_labels.extend(label_clean.cpu().numpy())
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1-Score: {f1:.3f}')


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [None]:
evaluate_f1_baseline(base_model, df_test)

Precision: 0.336
Recall: 0.353
F1-Score: 0.246


### **EVALUATE EM FOR BASELINE MODELS**

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

def evaluate_f1_and_em_baseline(model, df_test):
    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    all_predictions = []
    all_labels = []
    all_sequence_predictions = []
    all_sequence_labels = []
    for test_data, test_label in test_dataloader:
        mask = test_data['attention_mask'].squeeze(1).to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)
        test_label = test_label.to(device)
        with torch.no_grad(): 
            outputs = model(input_ids=input_id, attention_mask=mask, labels=test_label)
        predictions = torch.argmax(outputs.logits, dim=-1)
        for i in range(predictions.shape[0]):
            label_clean = test_label[i][test_label[i] != -100]
            all_predictions.extend(predictions[i, :len(label_clean)].cpu().numpy())
            all_labels.extend(label_clean.cpu().numpy())
            
            # For EM, we consider whole sequences rather than individual tokens
            all_sequence_predictions.append(predictions[i, :len(label_clean)].cpu().numpy())
            all_sequence_labels.append(label_clean.cpu().numpy())
    
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')
    print(f'F1-Score: {f1:.3f}')

    em_total = 0
    for pred, true in zip(all_sequence_predictions, all_sequence_labels):
        if np.array_equal(pred, true):
            em_total += 1
    em_score = em_total / len(all_sequence_predictions)
    print(f'EM-Score: {em_score:.3f}')

    accuracy = accuracy_score(all_labels, all_predictions)
    print(f'Accuracy: {accuracy:.3f}')


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
base_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels)).eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [None]:
evaluate_f1_and_em_baseline(base_model, df_test)

Precision: 0.335
Recall: 0.325
F1-Score: 0.173
EM-Score: 0.000
Accuracy: 0.270


### **Evaluate EM function for our model**

In [None]:
def evaluate_EM(model, df_test):

    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_em_test = 0.0

    for test_data, test_label in test_dataloader:
        test_label = test_label.to(device)
        mask = test_data['attention_mask'].squeeze(1).to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)

        loss, logits = model(input_id, mask, test_label)

        for i in range(logits.shape[0]):
            logits_clean = logits[i][test_label[i] != -100]
            label_clean = test_label[i][test_label[i] != -100]

            predictions = logits_clean.argmax(dim=1)

            # Compute Exact Match
            em = (predictions == label_clean).all().item()
            total_em_test += em

    em_score = total_em_test / len(df_test)
    print(f'Test EM: {em_score:.3f}')


In [None]:
evaluate_EM(model, df_test)

Test EM: 0.652


In [None]:
def evaluate_f1_entity(model, df_test):
    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    all_predictions = []
    all_labels = []

    for test_data, test_label in test_dataloader:
        test_label = test_label.to(device)
        mask = test_data['attention_mask'].squeeze(1).to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)

        _, logits = model(input_id, mask, test_label)

        for i in range(logits.shape[0]):
          logits_clean = logits[i][test_label[i] != -100]
          label_clean = test_label[i][test_label[i] != -100]
          predictions = logits_clean.argmax(dim=1)

          pred_labels = [ids_to_labels[id] for id in predictions.cpu().numpy()]
          true_labels = [ids_to_labels[id] for id in label_clean.cpu().numpy()]

          # Maintain a list for each sentence
          all_predictions.append(pred_labels)
          all_labels.append(true_labels)


    f1 = f1_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    accuracy = accuracy_score(all_labels, all_predictions)

    print(f'F1-Score: {f1}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'Accuracy: {accuracy}')

In [None]:
evaluate_f1_entity(model, df_test)

F1-Score: 0.6818449670541596
Precision: 0.7172320522934745
Recall: 0.6497855830100061
Accuracy: 0.964722863143954


### **Evaluation on a Single Line**

In [None]:
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
    print(tokenizer.convert_ids_to_tokens(text["input_ids"][0]))

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    print(logits)
    logits_clean = logits[0][label_ids != -100]
    print(logits_clean)

    predictions = logits_clean.argmax(dim=1).tolist()
    print(predictions)
    print(len(predictions))
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

In [None]:
evaluate_one_text(model, 'Bill Gates is a gaylord... as such I hate him')

['[CLS]', 'bill', 'gates', 'is', 'a', 'gay', '##lord', '.', '.', '.', 'as', 'such', 'i', 'hate', 'him', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD