# BLS SOC Manual - Data Exploration

In [18]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Your text
text = """
Computer and Mathematical Occupations -> not found -> Software and Web Developers, Programmers, and Testers -> Web and Digital Interface Designers -> Graphic Web Designer	Design digital user interfaces or websites. Develop and test layouts, interfaces, functionality, and navigation menus to ensure compatibility and usability across browsers or devices. May use web framework applications as well as client-side code and processes. May evaluate web design following web and accessibility standards, and may analyze web use metrics and optimize websites for marketability and search engine ranking. May design and test interfaces that facilitate the human-computer interaction and maximize the usability of digital devices, websites, and software with a focus on aesthetics and design. May create graphics used in websites and manage website content and links. Excludes “Special Effects Artists and Animators” (27-1014) and “Graphic Designers” (27-1024).	15-1255
"""

# Tokenize the text
tokens = tokenizer.tokenize(text)

# Print the number of tokens
print(f'The text contains {len(tokens)} tokens.')

# If you want to see the tokens, you can print them too:
print(f'They are: {tokens}')


The text contains 188 tokens.
They are: ['computer', 'and', 'mathematical', 'occupations', '-', '>', 'not', 'found', '-', '>', 'software', 'and', 'web', 'developers', ',', 'programmers', ',', 'and', 'test', '##ers', '-', '>', 'web', 'and', 'digital', 'interface', 'designers', '-', '>', 'graphic', 'web', 'designer', 'design', 'digital', 'user', 'interfaces', 'or', 'websites', '.', 'develop', 'and', 'test', 'layout', '##s', ',', 'interfaces', ',', 'functionality', ',', 'and', 'navigation', 'menu', '##s', 'to', 'ensure', 'compatibility', 'and', 'usa', '##bility', 'across', 'browser', '##s', 'or', 'devices', '.', 'may', 'use', 'web', 'framework', 'applications', 'as', 'well', 'as', 'client', '-', 'side', 'code', 'and', 'processes', '.', 'may', 'evaluate', 'web', 'design', 'following', 'web', 'and', 'accessibility', 'standards', ',', 'and', 'may', 'analyze', 'web', 'use', 'metric', '##s', 'and', 'opt', '##imi', '##ze', 'websites', 'for', 'market', '##ability', 'and', 'search', 'engine', 'ra

# Space Laser - Initialization

In [19]:
# Import necessary libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling

# Read in the data
df = pd.read_csv('C://Offline_Storage//radiantClass//JobDesc_BlobText.csv')

# Check for non-string rows
non_string_rows = df[df['JobDesc_BlobText'].apply(lambda x: not isinstance(x, str))]
print(non_string_rows)

# Check for NaN in 'JobDesc_BlobText' column
print(df[df['JobDesc_BlobText'].isna()])

# Drop NaN rows
df = df.dropna(subset=['JobDesc_BlobText'])

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_pt=True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Tokenize data
inputs = tokenizer(df['JobDesc_BlobText'].tolist(), return_tensors="pt", truncation=True, padding=True, max_length=512)

# Create data collator. This will automatically mask tokens 15% of the time.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

print(f"Train size: {len(inputs['input_ids'])}")


Empty DataFrame
Columns: [JobDesc_BlobText]
Index: []
Empty DataFrame
Columns: [JobDesc_BlobText]
Index: []


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Train size: 6520


In [20]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

batch_size = 32  # Adjust as per your GPU memory

# Create a TensorDataset from the tokenized data
data = TensorDataset(inputs['input_ids'], inputs['attention_mask'])

# Create a DataLoader
dataloader = DataLoader(data, sampler=RandomSampler(data), batch_size=batch_size)


In [21]:
from torch.optim import AdamW

# Define the optimizer. AdamW is the Adam optimizer with weight decay fix.
optimizer = AdamW(model.parameters(), lr=1e-4)  # You can adjust the learning rate


In [22]:
# Function to print current activity and time
from datetime import datetime

def print_activity(activity):
    current_time = datetime.now().strftime('%I:%M %p') # This will give you time in the format '01:37 PM'
    print(f"Starting {activity} at {current_time}...")


# Verify CUDA is available

In [23]:
print(f"Is Cuda available?: {torch.cuda.is_available()}! :-)")
print(f"CUDA version: {torch.version.cuda}")

Is Cuda available?: True! :-)
CUDA version: 11.8


# Pre-Training Loop (~X min on first pass)

In [24]:

epochs = 2  # Number of training epochs. You can adjust this.
batchcount = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):

    current_loopTime = datetime.now().strftime('%I:%M %p')
    print(f"Starting Pre-training Epoch #{epoch} at {current_loopTime} <3")
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(dataloader):

        # Load batch to device
        batchcount += 1
        b_input_ids, b_input_mask = batch[0].to(device), batch[1].to(device)
        current_batchTime = datetime.now().strftime('%I:%M %p')
        if batchcount % 5 == 0:
            print(f"Batch {batchcount} loaded at {current_batchTime}! :-)")
        
        # Clear out the gradients
        model.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_input_ids) # labels for MLM is the same input_ids
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

    print("Pre-training Epoch Complete")
    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")
    
    # Start validation loop
    #model.eval()  # Set the model to evaluation mode
    
    #val_total_loss = 0
    
    #for batch in val_dataloader:
        #b_input_ids, b_input_mask = batch["input_ids"].to(device), batch["attention_mask"].to(device)
        
        #with torch.no_grad():  # Don't compute gradients for validation
            #outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_input_ids)
            #val_loss = outputs.loss
            #val_total_loss += val_loss.item()

    #val_avg_loss = val_total_loss / len(val_dataloader)
    #print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_avg_loss:.4f}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 8.00 GiB total capacity; 13.99 GiB already allocated; 0 bytes free; 14.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF