# Imports

In [157]:
import torch
from transformers import DataCollatorWithPadding, DistilBertTokenizer, pipeline, \
                    DistilBertForSequenceClassification, TrainingArguments, Trainer, \
                    DistilBertTokenizerFast, DistilBertForTokenClassification, AutoModelForTokenClassification
from transformers import TrainerCallback
from torch.utils.data.dataloader import default_collate
from datasets import Dataset
import numpy as np
import pandas as pd
import math
from pynvml import *
from functools import partial
import evaluate
accuracy = evaluate.load("accuracy")
from sklearn.metrics import confusion_matrix
from accelerate import Accelerator
accelerator = Accelerator()
import gc
import matplotlib.pyplot as plt
import seaborn as sn
from IPython.display import clear_output

import contractions
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
stop_words = set(stopwords.words('english'))
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

[nltk_data] Downloading package stopwords to /home/connor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [170]:
# !pip install torch
# !pip install transformers
# !pip install datasets
# !pip install nltk
# !pip install contractions
# !pip install seaborn
# !pip install accelerate
# !pip install scikit-learn
# !pip install evaluate
# !pip install pynvml
# !pip install seqeval

# EDA Location Data 

## EDA Functions 

In [141]:
def preprocess_text(text):
    text = text.lower()
    text = contractions.fix(text)
    # text = text.replace('(','')
    # text = text.replace(')','')
    # text = text.replace('-','')
    # text = text.replace(',','')
    # text = text.replace('–','')
    # text = ' '.join([word for word in text.split() if word not in stop_words])
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

## Create Data Set For label Studio

In [163]:
df = pd.read_csv('all_data.csv')
df

Unnamed: 0,id,title,description,raw_location
0,greenhouse::xometry::4322444007,"Vice President, Software Engineering",Xometry (NASDAQ: XMTR) powers the industries o...,"North Bethesda, MD, Lexington, KY, Remote"
1,greenhouse::splitmetrics::4285324101,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote - United Kingdom
2,greenhouse::splitmetrics::4277403101,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote
3,ashbyhq::airwallex::6096edab-08a6-475a-aa0c-69...,"Vice President, Engineering","At Airwallex (airwallex.com), were building th...",US - San Francisco
4,lever::centml::8397564a-9cf7-4491-bfd2-b425510...,VP Engineering,About Us We believe AI will fundamentally tran...,"Remote, USA"
...,...,...,...,...
5318,lever::tokenmetrics::5af6e633-25cf-4b1e-826f-1...,Front End Web Developer Intern (Buenos Aires-R...,We are looking for programmers with a keen eye...,
5319,smartrecruiters::EndeavorITSolution::743999656...,Android Internship In Endevor IT Solutions,Company Description Bestowed with high profess...,"Indore, MP, in"
5320,lever::tokenmetrics::6f57bb88-07ab-4ac4-bc58-7...,Business Analyst Intern (Buenos Aires -Remote),Token Metrics is searching for a highly capabl...,
5321,workday::astrazeneca::wd3::Careers::astrazenec...,Graduate Automation Engineer,Graduate Engineer- AutomationWe have a great o...,Australia – New South Wales


In [164]:
new_df = df.dropna(subset=['raw_location'])

In [165]:
new_df['fmt_raw_location'] = new_df['raw_location'].apply(lambda x: preprocess_text(x))
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['fmt_raw_location'] = new_df['raw_location'].apply(lambda x: preprocess_text(x))


Unnamed: 0,id,title,description,raw_location,fmt_raw_location
0,greenhouse::xometry::4322444007,"Vice President, Software Engineering",Xometry (NASDAQ: XMTR) powers the industries o...,"North Bethesda, MD, Lexington, KY, Remote","north bethesda, md, lexington, ky, remote"
1,greenhouse::splitmetrics::4285324101,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote - United Kingdom,remote - united kingdom
2,greenhouse::splitmetrics::4277403101,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote,remote
3,ashbyhq::airwallex::6096edab-08a6-475a-aa0c-69...,"Vice President, Engineering","At Airwallex (airwallex.com), were building th...",US - San Francisco,us - san francisco
4,lever::centml::8397564a-9cf7-4491-bfd2-b425510...,VP Engineering,About Us We believe AI will fundamentally tran...,"Remote, USA","remote, usa"
...,...,...,...,...,...
5316,lever::tokenmetrics::f28bdf66-e2f6-48f6-a5fa-6...,Crypto QA Automation Engineer Intern (Sao Paul...,Token Metrics is looking for an experienced QA...,São Paulo,são paulo
5317,lever::tokenmetrics::4c479c69-0fae-4489-bd74-e...,Crypto Engineering Manager Intern (Bucharest-R...,We are looking for a trustworthy and proactive...,Bucharest,bucharest
5319,smartrecruiters::EndeavorITSolution::743999656...,Android Internship In Endevor IT Solutions,Company Description Bestowed with high profess...,"Indore, MP, in","indore, mp, in"
5321,workday::astrazeneca::wd3::Careers::astrazenec...,Graduate Automation Engineer,Graduate Engineer- AutomationWe have a great o...,Australia – New South Wales,australia – new south wales


In [222]:
new_df.to_csv('all_data_fmt_location.csv', index=False, header=True)

In [223]:
df = pd.read_csv('all_data_fmt_location.csv')
df

Unnamed: 0,id,title,description,raw_location,fmt_raw_location
0,greenhouse::xometry::4322444007,"Vice President, Software Engineering",Xometry (NASDAQ: XMTR) powers the industries o...,"North Bethesda, MD, Lexington, KY, Remote","north bethesda, md, lexington, ky, remote"
1,greenhouse::splitmetrics::4285324101,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote - United Kingdom,remote - united kingdom
2,greenhouse::splitmetrics::4277403101,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote,remote
3,ashbyhq::airwallex::6096edab-08a6-475a-aa0c-69...,"Vice President, Engineering","At Airwallex (airwallex.com), were building th...",US - San Francisco,us - san francisco
4,lever::centml::8397564a-9cf7-4491-bfd2-b425510...,VP Engineering,About Us We believe AI will fundamentally tran...,"Remote, USA","remote, usa"
...,...,...,...,...,...
5213,lever::tokenmetrics::f28bdf66-e2f6-48f6-a5fa-6...,Crypto QA Automation Engineer Intern (Sao Paul...,Token Metrics is looking for an experienced QA...,São Paulo,são paulo
5214,lever::tokenmetrics::4c479c69-0fae-4489-bd74-e...,Crypto Engineering Manager Intern (Bucharest-R...,We are looking for a trustworthy and proactive...,Bucharest,bucharest
5215,smartrecruiters::EndeavorITSolution::743999656...,Android Internship In Endevor IT Solutions,Company Description Bestowed with high profess...,"Indore, MP, in","indore, mp, in"
5216,workday::astrazeneca::wd3::Careers::astrazenec...,Graduate Automation Engineer,Graduate Engineer- AutomationWe have a great o...,Australia – New South Wales,australia – new south wales


# Model Train

## Load Label Studio Export Data

In [147]:
df = pd.read_json('label-studio-export.json')
df['label'] = df['label'].apply(lambda x: [] if x is np.nan else x)# Replace nans with an empty list 
df

Unnamed: 0,id,title,description,raw_location,fmt_raw_location,label,annotator,annotation_id,created_at,updated_at,lead_time
0,4155,"Vice President, Software Engineering",Xometry (NASDAQ: XMTR) powers the industries o...,"North Bethesda, MD, Lexington, KY, Remote","north bethesda, md, lexington, ky, remote","[{'start': 16, 'end': 18, 'text': 'md', 'label...",2,62,2024-04-09 21:34:21.146053+00:00,2024-04-09 21:34:21.146086+00:00,14.943
1,4156,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote - United Kingdom,remote - united kingdom,"[{'start': 9, 'end': 23, 'text': 'united kingd...",2,63,2024-04-09 21:34:34.088538+00:00,2024-04-09 21:34:34.088571+00:00,10.999
2,4157,VP of Product,"Hi! Its SplitMetrics, a remote-first team of e...",Remote,remote,"[{'start': 0, 'end': 6, 'text': 'remote', 'lab...",2,64,2024-04-09 21:34:39.463909+00:00,2024-04-09 21:34:39.463942+00:00,3.831
3,4158,"Vice President, Engineering","At Airwallex (airwallex.com), were building th...",US - San Francisco,us - san francisco,"[{'start': 5, 'end': 18, 'text': 'san francisc...",2,65,2024-04-09 21:34:49.208866+00:00,2024-04-09 21:34:49.208903+00:00,6.598
4,4159,VP Engineering,About Us We believe AI will fundamentally tran...,"Remote, USA","remote, usa","[{'start': 8, 'end': 11, 'text': 'usa', 'label...",2,66,2024-04-09 21:34:58.376343+00:00,2024-04-09 21:34:58.376374+00:00,6.864
...,...,...,...,...,...,...,...,...,...,...,...
111,4319,"VP Product, Hardware","Location: Berlin (On-site) At SumUp, we are mo...","Berlin, Germany","berlin, germany","[{'start': 0, 'end': 6, 'text': 'berlin', 'lab...",2,162,2024-04-09 22:04:30.532149+00:00,2024-04-09 22:04:30.532181+00:00,4.245
112,4322,"VP, Product Management - IT Leaders",This is a unique opportunity to make an impact...,2 Locations,2 locations,[],2,163,2024-04-09 22:04:35.792296+00:00,2024-04-09 22:04:35.792331+00:00,2.514
113,4328,"Vice President, GSI and Consulting Partnership",Sonar solves the trillion-dollar challenge of ...,Austin,austin,"[{'start': 0, 'end': 6, 'text': 'austin', 'lab...",2,175,2024-04-09 22:14:11.679114+00:00,2024-04-09 22:14:11.679147+00:00,2.546
114,4329,VP Engineering,About Safe:Safe is the account abstraction lea...,Berlin,berlin,"[{'start': 0, 'end': 6, 'text': 'berlin', 'lab...",2,176,2024-04-09 22:14:15.587134+00:00,2024-04-09 22:14:15.587156+00:00,2.196


In [148]:
df.iloc[0]['label']

[{'start': 16, 'end': 18, 'text': 'md', 'labels': ['state']},
 {'start': 31, 'end': 33, 'text': 'ky', 'labels': ['state']},
 {'start': 35, 'end': 41, 'text': 'remote', 'labels': ['remote']},
 {'start': 0, 'end': 14, 'text': 'north bethesda', 'labels': ['city']},
 {'start': 20, 'end': 29, 'text': 'lexington', 'labels': ['city']}]

## NER Helper Functions

In [158]:
def create_ner_label_mappings(token_labels):
    label2id = {'O': 0}
    id2label = {0: 'O'}
    index = 1  
    for label in token_labels:
        for prefix in ['B-', 'I-']:
            current_label = f"{prefix}{label.upper()}"
            label2id[current_label] = index
            id2label[index] = current_label
            index += 1
    print(f"Num Labels: {len(label2id.keys())}")
    print(f"label2id: {label2id}")
    print(f"id2label: {id2label}")
    return label2id, id2label

def format_rows_for_ner_train(df):
    final_df = df[['fmt_raw_location','label']].copy()
    final_df.rename(columns = {'fmt_raw_location':'text'}, inplace = True) 
    
    training_rows_DS = Dataset.from_pandas(final_df)
    training_rows_DS = training_rows_DS.train_test_split(train_size=.8, seed=42) 

    train_rows = pd.DataFrame(list(zip(training_rows_DS['train']['text'], training_rows_DS['train']['label'])), columns =['text','label'])
    test_rows = pd.DataFrame(list(zip(training_rows_DS['test']['text'], training_rows_DS['test']['label'])), columns =['text','label'])
    
    return training_rows_DS, train_rows, test_rows

def align_label(tokenized_inputs, label_list):
    if isinstance(tokenized_inputs['input_ids'], torch.Tensor):
        label_length = tokenized_inputs['input_ids'].size()[1]
    else:
        label_length = len(tokenized_inputs['input_ids'])
    
    example_labels = ['O'] * label_length # Initialize all tokens as 'O'

    if isinstance(tokenized_inputs['offset_mapping'], torch.Tensor):
        offset_mapping_list = tokenized_inputs['offset_mapping'][0] 
        offset_mapping_len  = offset_mapping_list.size()[0]  
    else:
        offset_mapping_list = tokenized_inputs['offset_mapping']
        offset_mapping_len  = len(offset_mapping_list)
        
    for label in label_list:
        label_start = label['start']
        label_end = label['end']
        label_type = label['labels'][0]

        # Assign B- and I- labels
        for idx, (start, end) in enumerate(offset_mapping_list):
            if idx == 0 or idx == offset_mapping_len - 1 or start == end:
                continue
            if start >= label_start and end <= label_end:
                if start == label_start:
                    example_labels[idx] = f"B-{label_type.upper()}"
                else:
                    example_labels[idx] = f"I-{label_type.upper()}"

    return example_labels

def align_labels_with_tokens_batched(batch, tokenizer, label2id, tokenizer_config, **kwargs):
    batch_input_ids, batch_attention_mask, batch_labels = [], [], []

    for i in range(len(batch['text'])):
        tokenized_inputs = tokenizer(batch['text'][i], **tokenizer_config)
        example_labels = align_label(tokenized_inputs, label_list = batch['label'][i])

        # Add tokenized inputs and labels for this example to the batch
        batch_input_ids.append(tokenized_inputs['input_ids'])
        batch_attention_mask.append(tokenized_inputs['attention_mask'])

        # label_ids = [label2id[label] for label in example_labels]# Old way where cls and sep were labeled as 0. New way is below. 
        
        # Convert labels to numerical IDs; label special tokens and padding as -100
        last_real_token_index = len(tokenized_inputs['attention_mask']) - 1 - tokenized_inputs['attention_mask'][::-1].index(1)    
        label_ids = [
            -100 if i == 0 or i == last_real_token_index or tokenized_inputs['attention_mask'][i] == 0 else label2id.get(label, label2id['O'])
            for i, label in enumerate(example_labels)
        ]
        
        batch_labels.append(label_ids)  # Convert labels to numerical IDs
        
        # Remove offset_mapping  
        tokenized_inputs.pop('offset_mapping', None)

    return {
        'input_ids': batch_input_ids,
        'attention_mask': batch_attention_mask,
        'labels': batch_labels
    }
    
def align_labels_with_tokens_single(example, tokenizer, label2id, tokenizer_config, **kwargs):
    """
    Function to align labels with tokens for a single example. Can handle returning pts from tokenizer or standard lists. 
    """
    tokenized_inputs = tokenizer(example['text'], **tokenizer_config)
    example_labels = align_label(tokenized_inputs, label_list = example['label'])
    # Remove offsets mapping 
    tokenized_inputs.pop("offset_mapping", None)
    # tokenized_inputs['labels'] = [label2id[label] for label in example_labels] # Old way where cls and sep were labeled as 0. New way is below. 


    # Convert labels to numerical IDs; label special tokens and padding as -100
    if isinstance(tokenized_inputs['attention_mask'], torch.Tensor):
        attention_mask_list = tokenized_inputs['attention_mask'][0].tolist()            
    else:
        attention_mask_list = tokenized_inputs['attention_mask']
        
    last_real_token_index = len(attention_mask_list) - 1 - attention_mask_list[::-1].index(1)
    tokenized_inputs['labels'] = [
        -100 if i == 0 or i == last_real_token_index or attention_mask_list[i] == 0 else label2id.get(label, label2id['O'])
        for i, label in enumerate(example_labels)
    ]
    return tokenized_inputs

def custom_data_collator(features):
    """
    Custom data collator that bypasses tokenization and padding.
    Assumes that all preprocessing, including tokenization and padding, has been handled.
    """
    for feature in features:
        for key, value in feature.items():
            if isinstance(value, list):
                feature[key] = torch.tensor(value)
    return default_collate(features)

def compute_metrics_token_classification(pred):
    predictions = np.argmax(pred.predictions, axis=2)
    true_labels = pred.label_ids

    pred_labels = [[id2label[p] for (p, l) in zip(prediction, true_label) if l != -100] for prediction, true_label in zip(predictions, true_labels)]
    true_labels = [[id2label[l] for (p, l) in zip(prediction, true_label) if l != -100] for prediction, true_label in zip(predictions, true_labels)]

    return {
        "precision": precision_score(true_labels, pred_labels),
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
        "report": classification_report(true_labels, pred_labels),
        "pred_labels": pred_labels,
        "true_labels": true_labels
    }

class selective_logging_callback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        # Filter out the  report and labels from logs before printing
        if logs:
            logs.pop("report", None)  # Remove report from logs
            logs.pop("pred_labels", None) 
            logs.pop("true_labels", None) 

## Model Function

In [160]:
def run_model_v10(train_rows, test_rows, preprocess_funct_batched, tokenizer, tokenizer_config, num_epochs, train_batch_size, test_batch_size, base_model, out_put_dir, compute_metrics,  should_return=False):
    
    train_rows_DS = Dataset.from_pandas(train_rows)
    test_rows_DS = Dataset.from_pandas(test_rows)
        
    train_rows_DS = train_rows_DS.map(preprocess_funct_batched, batched=True, remove_columns=["text", "label"])
    test_rows_DS = test_rows_DS.map(preprocess_funct_batched, batched=True, remove_columns=["text", "label"])

    # base_model = accelerator.prepare(base_model)
    
    training_args = TrainingArguments(
        output_dir=out_put_dir,
        overwrite_output_dir=True, 
        num_train_epochs=num_epochs, 
        per_device_train_batch_size=train_batch_size, 
        per_device_eval_batch_size=test_batch_size,  
        load_best_model_at_end=True,
        logging_steps=5,
        log_level='info',
        evaluation_strategy='epoch',
        save_strategy='epoch'
    )

    trainer = Trainer(
        model=base_model,
        args=training_args,
        train_dataset=train_rows_DS,
        eval_dataset=test_rows_DS,
        data_collator=custom_data_collator,
        compute_metrics=compute_metrics_token_classification,
        callbacks=[selective_logging_callback()]
    )
    
    eval_results_before = trainer.evaluate()
    
    train_results = trainer.train()
    
    eval_results_after = trainer.evaluate()
    
    trainer.save_model()
    return eval_results_after
    

## Setup Data for model train

In [151]:
training_rows_DS, train_rows, test_rows = format_rows_for_ner_train(df)

## Model Config

In [152]:
max_length=256
out_put_dir = f"./location_bert"
tokenizer_config = {'max_length': max_length,
                    'truncation': True,
                    'padding': 'max_length',
                    'return_offsets_mapping':True,
                    'is_split_into_words': False
                   }
token_labels = ['city', 'state', 'country', 'remote']
label2id, id2label = create_ner_label_mappings(token_labels)

train_batch = 4
test_batch = 4
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")



Num Labels: 9
label2id: {'O': 0, 'B-CITY': 1, 'I-CITY': 2, 'B-STATE': 3, 'I-STATE': 4, 'B-COUNTRY': 5, 'I-COUNTRY': 6, 'B-REMOTE': 7, 'I-REMOTE': 8}
id2label: {0: 'O', 1: 'B-CITY', 2: 'I-CITY', 3: 'B-STATE', 4: 'I-STATE', 5: 'B-COUNTRY', 6: 'I-COUNTRY', 7: 'B-REMOTE', 8: 'I-REMOTE'}
Device: cuda


## Base Model and tokenizer Load

In [161]:
tokenizer_funct = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
base_model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(id2label.keys()), id2label=id2label, label2id=label2id
)
base_model.to(device)

loading file vocab.txt from cache at /home/connor/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/vocab.txt
loading file tokenizer.json from cache at /home/connor/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/connor/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/tokenizer_config.json
loading configuration file config.json from cache at /home/connor/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout":

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

## Model Training 

In [162]:
# Add the correct agurements to align_labels_with_tokens_batched function 
preprocess_funct_batched = partial(align_labels_with_tokens_batched, tokenizer=tokenizer_funct, label2id=label2id, tokenizer_config=tokenizer_config)
# Train the model 
results = run_model_v10(train_rows, test_rows, preprocess_funct_batched, tokenizer_funct, tokenizer_config, epochs, train_batch, test_batch, base_model, out_put_dir, compute_metrics_token_classification,  True)
del base_model

Map: 100%|██████████| 92/92 [00:00<00:00, 2690.12 examples/s]
Map: 100%|██████████| 24/24 [00:00<00:00, 2955.47 examples/s]
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
***** Running Evaluation *****
  Num examples = 24
  Batch size = 4


***** Running training *****
  Num examples = 92
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 115
  Number of trainable parameters = 66,369,801


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Report,Pred Labels,True Labels
1,0.9959,0.569134,0.647059,0.75,0.694737,precision recall f1-score support  CITY 0.48 0.70 0.57 20  COUNTRY 1.00 0.67 0.80 12  REMOTE 1.00 0.67 0.80 3  STATE 0.75 1.00 0.86 9  micro avg 0.65 0.75 0.69 44  macro avg 0.81 0.76 0.76 44 weighted avg 0.71 0.75 0.71 44,"[['B-COUNTRY', 'O', 'B-CITY', 'B-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'B-CITY', 'O', 'B-STATE'], ['B-CITY', 'B-CITY', 'B-CITY', 'B-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-STATE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['O', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-STATE'], ['B-REMOTE'], ['B-CITY', 'B-CITY', 'O', 'B-STATE'], ['B-CITY', 'B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'B-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-STATE'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]","[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-COUNTRY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]"
2,0.3196,0.263317,0.930233,0.909091,0.91954,precision recall f1-score support  CITY 0.90 0.95 0.93 20  COUNTRY 1.00 0.83 0.91 12  REMOTE 1.00 0.67 0.80 3  STATE 0.90 1.00 0.95 9  micro avg 0.93 0.91 0.92 44  macro avg 0.95 0.86 0.90 44 weighted avg 0.94 0.91 0.92 44,"[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['O', 'B-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'O'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-STATE'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]","[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-COUNTRY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]"
3,0.0865,0.173407,0.931818,0.931818,0.931818,precision recall f1-score support  CITY 0.90 0.95 0.93 20  COUNTRY 1.00 0.83 0.91 12  REMOTE 1.00 1.00 1.00 3  STATE 0.90 1.00 0.95 9  micro avg 0.93 0.93 0.93 44  macro avg 0.95 0.95 0.95 44 weighted avg 0.94 0.93 0.93 44,"[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['O', 'B-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-STATE'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]","[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-COUNTRY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]"
4,0.0453,0.168735,0.931818,0.931818,0.931818,precision recall f1-score support  CITY 0.90 0.95 0.93 20  COUNTRY 1.00 0.83 0.91 12  REMOTE 1.00 1.00 1.00 3  STATE 0.90 1.00 0.95 9  micro avg 0.93 0.93 0.93 44  macro avg 0.95 0.95 0.95 44 weighted avg 0.94 0.93 0.93 44,"[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['O', 'B-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-STATE'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]","[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-COUNTRY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]"
5,0.0762,0.15887,0.931818,0.931818,0.931818,precision recall f1-score support  CITY 0.90 0.95 0.93 20  COUNTRY 1.00 0.83 0.91 12  REMOTE 1.00 1.00 1.00 3  STATE 0.90 1.00 0.95 9  micro avg 0.93 0.93 0.93 44  macro avg 0.95 0.95 0.95 44 weighted avg 0.94 0.93 0.93 44,"[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['O', 'B-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-STATE'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]","[['B-COUNTRY', 'O', 'B-CITY', 'I-CITY'], ['O', 'O', 'O', 'O', 'B-CITY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'I-CITY', 'I-CITY', 'O', 'B-STATE', 'O', 'B-CITY', 'O', 'B-STATE', 'O', 'B-REMOTE'], ['B-CITY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-COUNTRY', 'O', 'B-REMOTE'], ['B-CITY', 'O', 'B-COUNTRY', 'B-COUNTRY', 'O', 'B-COUNTRY'], ['B-CITY', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE', 'O', 'B-COUNTRY'], ['B-REMOTE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'I-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY'], ['B-CITY', 'I-CITY', 'O', 'B-STATE', 'O'], ['B-CITY', 'O', 'O', 'B-COUNTRY'], ['O', 'O'], ['B-CITY', 'O', 'B-STATE'], ['B-CITY', 'O', 'B-COUNTRY']]"


***** Running Evaluation *****
  Num examples = 24
  Batch size = 4
Saving model checkpoint to ./location_bert/checkpoint-23
Configuration saved in ./location_bert/checkpoint-23/config.json
Model weights saved in ./location_bert/checkpoint-23/model.safetensors
***** Running Evaluation *****
  Num examples = 24
  Batch size = 4
Saving model checkpoint to ./location_bert/checkpoint-46
Configuration saved in ./location_bert/checkpoint-46/config.json
Model weights saved in ./location_bert/checkpoint-46/model.safetensors
***** Running Evaluation *****
  Num examples = 24
  Batch size = 4
Saving model checkpoint to ./location_bert/checkpoint-69
Configuration saved in ./location_bert/checkpoint-69/config.json
Model weights saved in ./location_bert/checkpoint-69/model.safetensors
***** Running Evaluation *****
  Num examples = 24
  Batch size = 4
Saving model checkpoint to ./location_bert/checkpoint-92
Configuration saved in ./location_bert/checkpoint-92/config.json
Model weights saved in ./lo

Saving model checkpoint to ./location_bert
Configuration saved in ./location_bert/config.json
Model weights saved in ./location_bert/model.safetensors


In [156]:
results

{'eval_loss': 0.1730087250471115,
 'eval_precision': 0.9545454545454546,
 'eval_recall': 0.9545454545454546,
 'eval_f1': 0.9545454545454546,
 'eval_report': '              precision    recall  f1-score   support\n\n        CITY       0.90      0.95      0.93        20\n     COUNTRY       1.00      0.92      0.96        12\n      REMOTE       1.00      1.00      1.00         3\n       STATE       1.00      1.00      1.00         9\n\n   micro avg       0.95      0.95      0.95        44\n   macro avg       0.98      0.97      0.97        44\nweighted avg       0.96      0.95      0.95        44\n',
 'eval_runtime': 0.1048,
 'eval_samples_per_second': 228.978,
 'eval_steps_per_second': 57.244,
 'epoch': 5.0}

# Model Eval

## Load Existing Model

In [9]:
token_classifier = AutoModelForTokenClassification.from_pretrained(out_put_dir)
token_classifier.to(device)
tokenizer_funct = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Device: cuda


## Eval Functions

In [131]:
def eval_ner_example(example_inputs, classifier):
    with torch.no_grad():
        outputs = classifier(**example_inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    return predictions

def make_predictions(input_example, token_classifier, tokenizer, id2label):
    # move input_tensors onto Device
    inputs = {k: v.to(device) for k, v in input_example.items()} 
    # Make prediction
    result = eval_ner_example(inputs, token_classifier) # returns a tensor
    # Move prediction back to cpu
    predictions_np = result.cpu().numpy() # returns a nparray
    # Move attention_mask back to cpu
    attention_mask_np = input_example['attention_mask'].cpu().numpy()# returns a nparray
    # Replace the 1s in the attention mask with the actual returned prediction. replace the 0s with -100
    adjusted_predictions = np.where(attention_mask_np == 1, predictions_np, -100) 
    flat_predictions = adjusted_predictions.flatten() # remove the extra dimension  
    
    attention_mask_flat = attention_mask_np.flatten() # remove extra dimension on np array
    attention_mask_list = attention_mask_flat.tolist() # convert nparray to list 
    last_real_token_index = len(attention_mask_list) - 1 - attention_mask_list[::-1].index(1) # grab the index of the last real item
    filtered_predictions_ids = flat_predictions[:last_real_token_index+1] # use last index +1 to remove the -100s
    
    pred_label_names = [id2label[id] for id in filtered_predictions_ids] # Convert Ids to labels 
    inputs = {k: v.to('cpu') for k, v in inputs.items()} 
    input_ids_list = inputs['input_ids'].squeeze().tolist()  # Remove extra dimension and convert tensor to list  
    # Grab the tokens for the input example then conver them from ids to actual string token. then Remove the pad tokens. 
    tokens = tokenizer.convert_ids_to_tokens(input_ids_list) 
    tokens = [token for token in tokens if token != '[PAD]']
    
    del inputs # Free up memory  
    return pred_label_names, tokens, last_real_token_index
    
def eval_example_with_true_label(example, token_classifier, tokenizer, tokenizer_config, label2id, id2label, device):
    input_example = align_labels_with_tokens_single(example, tokenizer=tokenizer, label2id=label2id, tokenizer_config={'return_tensors':"pt", **tokenizer_config})
    true_label_ids = input_example.pop('labels') # the true labels for an example

    pred_label_names, tokens, last_real_token_index = make_predictions(input_example, token_classifier, tokenizer, id2label)
    
    # Grab the actual label name from the true labels
    true_label_names = [-100 if id == -100 else id2label[id] for id in true_label_ids[:last_real_token_index+1]] # Convert Ids to labels
    
    print(f"Raw Location: \n{example['text']}\n")
    print(f"{'Pred':<15}{'True':<15}{'Token':<10}")
    for pred, actual, token in zip(pred_label_names, true_label_names, tokens):
        print(f"{pred:<15}{actual:<15}{token:<10}")
    
    # Free up memory 
    if device == 'cuda':
        torch.cuda.empty_cache() 
    
    return {
        'predicted_labels': pred_label_names, 
        'actual_labels': true_label_names, 
        'tokens': tokens,
        'raw_string': example['text']
    }

def eval_example_no_label(example, token_classifier, tokenizer, tokenizer_config, label2id, id2label, device):
    input_example = tokenizer_funct(example, return_tensors="pt", **tokenizer_config)
    input_example.pop("offset_mapping", None)

    pred_label_names, tokens, _ = make_predictions(input_example, token_classifier, tokenizer, id2label)
        
    print(f"Raw Location: \n{example}\n")
    print(f"{'Pred':<15}{'Token':<10}")
    for pred, token in zip(pred_label_names, tokens):
        print(f"{pred:<15}{token:<10}")
    
    # Free up memory  
    if device == 'cuda':
        torch.cuda.empty_cache() 
    
    return {
        'predicted_labels': pred_label_names, 
        'tokens': tokens,
        'raw_string': example
    }

def check_test_preds(df, token_classifier, tokenizer_funct, tokenizer_config, label2id, id2label, device):
    should_break_loop = False
    for row_count, row in df.iterrows():
        obj_w_label = eval_example_with_true_label(row, token_classifier, tokenizer_funct, tokenizer_config, label2id, id2label, device)
        while True:
            print()
            user_input = input(" F to continue. P Escape")
            user_input = user_input.lower() 
            if user_input in ['f','p']:
                if user_input == 'p':
                    should_break_loop = True
                    break
                elif user_input == 'f': 
                    break

        clear_output()
        if should_break_loop :
            break

### Testing Eval Functions

In [14]:
example = test_rows.iloc[0]
# example = new_df.iloc[0]['fmt_raw_location']
example


text                                    us - san francisco
label    [{'end': 18, 'labels': ['city'], 'start': 5, '...
Name: 0, dtype: object

In [144]:
obj_w_label = eval_example_with_true_label(example, token_classifier, tokenizer_funct, tokenizer_config, label2id, id2label, device)
print()
obj_no_label = eval_example_no_label(example['text'], token_classifier, tokenizer_funct, tokenizer_config, label2id, id2label, device)

Raw Location: 
us - san francisco

Pred           True           Token     
O              -100           [CLS]     
B-COUNTRY      B-COUNTRY      us        
O              O              -         
B-CITY         B-CITY         san       
I-CITY         I-CITY         francisco 
I-REMOTE       -100           [SEP]     

Raw Location: 
us - san francisco

Pred           Token     
O              [CLS]     
B-COUNTRY      us        
O              -         
B-CITY         san       
I-CITY         francisco 
I-REMOTE       [SEP]     


## Checking Test Rows predictions

In [140]:
check_test_preds(test_rows, token_classifier, tokenizer_funct, tokenizer_config, label2id, id2label, device)

# Sand Box

In [97]:
input_example = align_labels_with_tokens_single(example, tokenizer=tokenizer_funct, label2id=label2id, tokenizer_config={**tokenizer_config})
input_example

{'input_ids': [101, 2149, 1011, 2624, 3799, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [50]:
example

text                                    us - san francisco
label    [{'end': 18, 'labels': ['city'], 'start': 5, '...
Name: 0, dtype: object

In [187]:
align_labels_with_tokens_single(example, tokenizer=tokenizer_funct, label2id=label2id, tokenizer_config=tokenizer_config)

{'input_ids': [101, 6556, 102, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 7, -100, -100, -100, -100, -100, -100, -100, -100]}

In [51]:
align_labels_with_tokens_single(example, tokenizer=tokenizer_funct, label2id=label2id, tokenizer_config=tokenizer_config)

[{'end': 18, 'labels': ['city'], 'start': 5, 'text': 'san francisco'}, {'end': 2, 'labels': ['country'], 'start': 0, 'text': 'us'}]


{'input_ids': [101, 2149, 1011, 2624, 3799, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Old Helper Functions

In [117]:
def compute_metrics_classification(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


def build_confusion(y_true, y_pred, classes):
    cf_matrix = confusion_matrix(y_true, y_pred)
    make_confusion_matrix(cf_matrix, categories=classes, figsize=(8,6), cbar=False)

def eval_model(test_rows, classifier, id2label, token_max):
    tokenizer_args= {'truncation':True, 'max_length':token_max, 'padding':True}
    y_pred = []
    y_true = []
    for _, row in test_rows.iterrows():
        y_pred.append(classifier(preprocess_text(row['text']), **tokenizer_args)[0]['label'])
        y_true.append(id2label[row['label']])  
    return y_true, y_pred

# def eval_example(example, tokenizer_args, classifier):
#     val = classifier(preprocess_text(example), **tokenizer_args)
#     return val

def calc_accuracy(y_true,y_pred):
    accuracy = 0
    for x in range(len(y_true)):
        if y_pred[x]==y_true[x]:
            accuracy+=1
    print(f'Accuracy: {(accuracy/len(y_true))*100:.2f}%')
    
def make_confusion_matrix(cf,
                          categories='auto',
                          cbar=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    
    group_labels = ['' for i in range(cf.size)]
    group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf.flatten()/np.sum(cf)]
    

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])

    accuracy  = np.trace(cf) / float(np.sum(cf))
    stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    
    if figsize==None:
        figsize = plt.rcParams.get('figure.figsize')

    plt.figure(figsize=figsize)
    sn.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    plt.ylabel('True label')
    plt.xlabel('Predicted label' + stats_text)
    if title:
        plt.title(title)

def print_GPU():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def format_rows_for_train(df, label2id, preprocess_text=None):
    newdf = df.loc[df['label'].notnull()].copy()
    if preprocess_text:
        newdf['description'] = newdf['description'].apply(lambda x: preprocess_text(x))
    
    newdf['text'] = newdf[['title', 'description']].apply(lambda x: f"title {x['title'].lower()} description {x['description']}", axis=1)
    newdf['label'] = newdf['label'].apply(lambda x: label2id[x])
    
    final_df = newdf[['text','label']].copy()
    training_rows_DS = Dataset.from_pandas(final_df)
    training_rows_DS = training_rows_DS.train_test_split(train_size=.8, seed=42) 

    train_rows = pd.DataFrame(list(zip(training_rows_DS['train']['text'], training_rows_DS['train']['label'])), columns =['text','label'])
    test_rows = pd.DataFrame(list(zip(training_rows_DS['test']['text'], training_rows_DS['test']['label'])), columns =['text','label'])
    
    return training_rows_DS, train_rows, test_rows

def print_label_counts(split_datasets, label_column='label'):
    for split in ['train', 'test']:
        dataset = split_datasets[split] 
        labels = dataset[label_column]  
        
        if isinstance(labels, pd.Series):
            label_counts = labels.value_counts()
        else:
            label_counts = pd.Series(labels).value_counts()
        
        print(f"Label counts for {split} set:")
        print_string = ''
        for idx,[key,value] in enumerate(label_counts.items()):
            print_string += f"{key}: {value} "
            if idx != len(label_counts.keys())-1:
                print_string += "| "
            
        print(print_string)



# Old Data Explore

In [None]:
e_df = pd.read_csv('MASTER_DF.csv', index_col=0)

In [None]:
e_df

In [None]:
null_counts = e_df.isnull().sum()
print(null_counts)

In [None]:
def calculate_total_token_count(text, tokenizer_funct):
    encoded_dict = tokenizer_funct.encode_plus(
        text,
        add_special_tokens=True, 
        max_length=512,  
        truncation=True,
        return_overflowing_tokens=True,  
    )
    total_token= 0
    for x in range(len(encoded_dict['input_ids'])):
        total_token += len(encoded_dict['input_ids'][x])
        if x !=0:
            total_token-=2
    
    return total_token


In [None]:
e_df['tk_count_description'] = e_df['description'].apply(lambda x: calculate_total_token_count(x, tokenizer_funct))
e_df['tk_count_title'] = e_df['title'].apply(lambda x: calculate_total_token_count(x.lower(), tokenizer_funct))
e_df['processed_description'] = e_df['description'].apply(lambda x: preprocess_text(x))
e_df['tk_count_pre_description'] = e_df['processed_description'].apply(lambda x: calculate_total_token_count(x, tokenizer_funct))

e_df['fmt_text'] = e_df[['title', 'processed_description']].apply(lambda x: f"title {x['title'].lower()} description {x['processed_description']}", axis=1)

e_df['tk_count_fmt_text'] = e_df['fmt_text'].apply(lambda x: calculate_total_token_count(x, tokenizer_funct))


In [None]:
def describe_df(df):

    mean_token_fmt = df['tk_count_fmt_text'].mean()    
    max_token_fmt = df['tk_count_fmt_text'].max()
    
    mean_token_title = df['tk_count_title'].mean()    
    max_token_title = df['tk_count_title'].max()
    
    mean_token_p_description = df['tk_count_pre_description'].mean()    
    max_token_p_description = df['tk_count_pre_description'].max()
    
    count_fmt_rows_above_max_length = (df['tk_count_fmt_text'] > 512).sum()
    count_title_rows_above_mean = (df['tk_count_title'] > int(mean_token_title)).sum()

    
    
    print(f"Full text: Mean: {mean_token_fmt}, Max: {max_token_fmt}")
    print(f"Title text: Mean: {mean_token_title}, Max: {max_token_title}")
    print(f"Processed Description text: Mean: {mean_token_p_description}, Max: {max_token_p_description}")
    
    print(f"Title above mean title size: {count_title_rows_above_mean}, or {(count_title_rows_above_mean/len(df))*100:.2f}%")
    print(f"Processed Description text above 512: {count_fmt_rows_above_max_length}, or {(count_fmt_rows_above_max_length/len(df))*100:.2f}%")
    
    

In [None]:
describe_df(e_df)

In [None]:
[{"start": 1909, "end": 1917, "text": "2+ years", "labels": ["exp"]}, 
 {"start": 2035, "end": 2043, "text": "3+ years", "labels": ["exp"]}, 
 {"start": 2605, "end": 2613, "text": "1+ years", "labels": ["exp"]}, 
 {"start": 1703, "end": 1708, "text": "Azure", "labels": ["technology"]}, 
 {"start": 1958, "end": 1965, "text": "Windows", "labels": ["technology"]}, 
 {"start": 1973, "end": 1978, "text": "Linux", "labels": ["technology"]}, 
 {"start": 1735, "end": 1750, "text": "CI/CI pipelines", "labels": ["technology"]}, 
 {"start": 2522, "end": 2532, "text": "PowerShell", "labels": ["technology"]}, 
 {"start": 2534, "end": 2538, "text": "Bash", "labels": ["technology"]}, 
 {"start": 2540, "end": 2546, "text": "Python", "labels": ["technology"]}, 
 {"start": 2548, "end": 2554, "text": "Groovy", "labels": ["technology"]}, 
 {"start": 2377, "end": 2386, "text": "Terraform", "labels": ["technology"]}, 
 {"start": 2388, "end": 2395, "text": "Ansible", "labels": ["technology"]}, 
 {"start": 2397, "end": 2408, "text": "Chef/Puppet", "labels": ["technology"]}, 
 {"start": 2410, "end": 2416, "text": "Docker", "labels": ["technology"]}, 
 {"start": 2424, "end": 2434, "text": "Kubernetes", "labels": ["technology"]}, 
 {"start": 2198, "end": 2210, "text": "Azure DevOps", "labels": ["technology"]}, 
 {"start": 2223, "end": 2230, "text": "Jenkins", "labels": ["technology"]}, 
 {"start": 2239, "end": 2245, "text": "Bamboo", "labels": ["technology"]}, 
 {"start": 2642, "end": 2657, "text": "Microsoft Azure", "labels": ["technology"]}, 
 {"start": 2726, "end": 2729, "text": "AWS", "labels": ["technology"]}, 
{"start": 2733, "end": 2736, "text": "GCP", "labels": ["technology"]}, 
{"start": 3078, "end": 3101, "text": "Microsoft Certification", "labels": ["degree"]}]



Are you looking for a role that motivates and challenges you? Are you ready for an opportunity for growth? Do you want to work on teams where people roll up their sleeves to take on tough problems together, and regularly blow the doors off our clients with their outstanding teamwork? If you answered yes to those questions, 3Cloud might just be for you! At 3Cloud, we hire people who aren’t afraid to experiment or fail. We hire people who are willing to give direct and candid feedback to their managers, leaders, and team members. We hire people who jump at those opportunities because they care about our collective growth and success. We hire people who challenge and hold each other accountable for living 3Cloud’s core values because they know that it will result in amazing experiences and solutions for our clients and each other. As a DevOps Engineer, your primary responsibility will be to implement DevOps solutions in Azure. The role is a technical, consultative, and client facing role that will be accountable for client solution development, delivery and support. The ideal candidate will have experience in consulting and have demonstrated success in technical implementations and project work. Key Responsibilities  As a DevOps Engineer you will:  Participate in technical envisioning, technical design, and delivery of assigned projects. Work with 3Cloud Architects to support project efforts from a technical perspective. Execute the implementation of designed solutions into client deliverables. Assist with design and deployment of client workloads into Azure. Providing technical expertise and support across the following four areas of specialization:  Datacenter Transformation Azure Infrastructure DevOps and CI/CI pipelines Cloud automation      Be an essential part of the team responsible for crafting and delivering creative cloud solutions for our clients. Experience Required  2+ years of experience in system administration (Windows and/or Linux), DevOps and/or software development and IT Operations. 3+ years project experience migrating and deploying cloud-based solutions. Hands-on implementation experience of Continuous Integration and Continuous Delivery in Azure DevOps (VSTS/TFS), Jenkins, and/or Bamboo (or another similar tool) Experience with various cloud deployment methodologies. Hands-on experience with some of the following:  Terraform, Ansible, Chef/Puppet, Docker and/or Kubernetes is preferred, but not required. Automation experience - very good scripting knowledge (PowerShell, Bash, Python, Groovy, etc.).  Experience can be from any public cloud. 1+ years implementing and supporting Microsoft Azure infrastructure and topologies (or demonstrated experience form with AWS or GCP) Understanding of cloud ecosystem and leading-edge cloud emerging technologies. Performance analysis, troubleshooting and remediation techniques. Strong analytical problem-solving ability. Strong presentation, written and verbal communication skills. Self-starter with the ability to work independently or as part of a virtual project team. Microsoft Certification a plus.  Don’t meet every single requirement? At 3Cloud we are dedicated to building a diverse, inclusive and authentic workplace, so if you’re excited about this role but your past experience doesn’t align perfectly with every qualification in the job description, we encourage you to apply anyway. At this time, we cannot sponsor applicants for work visas.,33,DevOps Engineer,2024-04-09T02:49:25.226453Z,Remote - Philippines,340.176,2,2024-04-09T02:49:25.226486Z,3cloud