# Normalizing Job Titles

In [16]:
# Import necessary libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import Vocab
from collections import Counter
import spacy
from torch.nn.utils.rnn import pad_sequence

# Load the Spacy tokenizer
spacy_en = spacy.load('en_core_web_sm')

# Load the data
normalized_titles_df = pd.read_pickle('../../Data/app_opp_normalized.pkl')[['NormalizedTitle', 'Title']]
sample_df = pd.read_parquet('../../Data/split_1.parquet')
training_data = pd.concat([normalized_titles_df[['Title', 'NormalizedTitle']], sample_df[['Title']].dropna()], ignore_index=True)
# Drop rows with missing values in 'Title' or 'NormalizedTitle'
training_data.dropna(subset=['Title', 'NormalizedTitle'], inplace=True)
#make all cells strings
training_data['Title'] = training_data['Title'].astype(str)
training_data['NormalizedTitle'] = training_data['NormalizedTitle'].astype(str)


In [17]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_data, validation_data = train_test_split(training_data, test_size=0.1) # Adjust the test_size as needed

In [18]:
from collections import Counter

# Tokenization function
def tokenize(text):
    return [token.text for token in spacy_en.tokenizer(text)]

# Vocabulary building function
def build_vocab(texts):
    counter = Counter()
    for text in texts:
        counter.update(tokenize(text))
    return counter


In [19]:
# Build vocabularies
title_vocab = build_vocab(training_data['Title'])
normalized_title_vocab = build_vocab(training_data['NormalizedTitle'])

# Convert Counters to Vocab objects
title_vocab = Vocab(title_vocab)
normalized_title_vocab = Vocab(normalized_title_vocab)

# Numerical encoding function
def numericalize(text, vocab):
    return [vocab[token] for token in tokenize(text)]

# Apply numerical encoding to data
training_data['Title'] = training_data['Title'].apply(lambda x: numericalize(x, title_vocab))
training_data['NormalizedTitle'] = training_data['NormalizedTitle'].apply(lambda x: numericalize(x, normalized_title_vocab))


In [20]:
from torch.utils.data import Dataset, DataLoader

class TitleDataset(Dataset):
    def __init__(self, titles, normalized_titles):
        self.titles = titles
        self.normalized_titles = normalized_titles
    
    def __len__(self):
        return len(self.titles)
    
    def __getitem__(self, idx):
        return torch.tensor(self.titles[idx]), torch.tensor(self.normalized_titles[idx])

# Create datasets
train_dataset = TitleDataset(training_data['Title'].tolist(), training_data['NormalizedTitle'].tolist())

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


In [21]:
def collate_batch(batch):
    titles, normalized_titles = zip(*batch)
    titles = [title.clone().detach() for title in titles]
    normalized_titles = [ntitle.clone().detach() for ntitle in normalized_titles]
    titles_padded = pad_sequence(titles, batch_first=True, padding_value=0)
    normalized_titles_padded = pad_sequence(normalized_titles, batch_first=True, padding_value=0)
    return titles_padded, normalized_titles_padded

In [22]:
import torch.nn as nn

class Seq2SeqModel(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, hidden_dim):
        super(Seq2SeqModel, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_vocab_size)
        
    def forward(self, x, y):
        # Encode
        x = self.embedding(x)
        outputs, (hidden, cell) = self.encoder(x)
        
        # Decode
        y = self.embedding(y)
        outputs, _ = self.decoder(y, (hidden, cell))
        predictions = self.fc_out(outputs)
        
        return predictions

# Create model
model = Seq2SeqModel(len(title_vocab), len(normalized_title_vocab), embedding_dim=256, hidden_dim=512)

In [23]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)


In [24]:
import torch.optim as optim

# Loss function (ignoring the padding index)
loss_function = nn.CrossEntropyLoss(ignore_index=0) # Assuming padding index is 0

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [25]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_idx, (titles, normalized_titles) in enumerate(train_loader):
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(titles, normalized_titles)
        
        # Reshape for loss calculation
        predictions = predictions.view(-1, predictions.shape[-1])
        targets = normalized_titles.view(-1)
        
        # Calculate loss
        loss = loss_function(predictions, targets)
        
        # Backward pass
        loss.backward()
        
        # Optimization step
        optimizer.step()
        
        # Accumulate loss
        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")

Epoch [1/10], Loss: 3.5429
Epoch [2/10], Loss: 1.0027
Epoch [3/10], Loss: 0.2334
Epoch [4/10], Loss: 0.0639
Epoch [5/10], Loss: 0.0240
Epoch [6/10], Loss: 0.0124
Epoch [7/10], Loss: 0.0077
Epoch [8/10], Loss: 0.0056
Epoch [9/10], Loss: 0.0045
Epoch [10/10], Loss: 0.0038


In [36]:
# Create a validation dataset and DataLoader
validation_dataset = TitleDataset(validation_data['Title'].tolist(), validation_data['NormalizedTitle'].tolist())
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)


In [37]:
# Switch to evaluation mode
model.eval()

# Lists to collect predictions and targets
predictions, targets = [], []

In [38]:
# Loop through validation data
with torch.no_grad():
    for titles, normalized_titles in validation_loader:
        pred = model(titles, normalized_titles[:,:-1]) # Exclude the last token as input to the decoder
        pred_indices = torch.argmax(pred, dim=-1)
        predictions.extend(pred_indices.tolist())
        targets.extend(normalized_titles[:,1:].tolist()) # Exclude the first token as it's the start token

In [41]:
type(normalized_title_vocab)

torchtext.vocab.vocab.Vocab

In [43]:
# Get integer-to-string mapping for normalized titles
normalized_title_itos = normalized_title_vocab.get_itos()


AttributeError: 'Counter' object has no attribute 'get_itos'