### Salary prediction



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.functional as F
import nltk
%matplotlib inline

In [None]:
# Notebook-wide constants.
BATCH_SIZE = 16  # mini-batch size used by the training loop and as print_metrics' default
min_count = 10  # vocabulary filter: tokens seen fewer times than this are dropped
max_count = 200000  # vocabulary filter: tokens seen more times than this are dropped
padding_title = 10  # fixed token length each "Title" row is padded/truncated to
padding_descr = 200  # fixed token length each "FullDescription" row is padded/truncated to

In [None]:
# Download the training archive (IPython shell escape, so this line only works in a notebook).
!wget https://ysda-seminars.s3.eu-central-1.amazonaws.com/Train_rev1.zip
# pandas reads the CSV directly from the zip archive.
data = pd.read_csv("./Train_rev1.zip", compression='zip', index_col=None)
# Regression target: log(1 + SalaryNormalized), stored as float32.
data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')

Preprocess

In [None]:
text_columns = ["Title", "FullDescription"]
categorical_columns = ["Category", "Company", "LocationNormalized", "ContractType", "ContractTime"]
TARGET_COLUMN = "Log1pSalary"
# Missing categorical values become the literal string "NaN" so that they are
# treated as one more category.
data[categorical_columns] = data[categorical_columns].fillna('NaN')

# Lower-case and tokenize every free-text column, storing the tokens back as a
# single space-separated string. Both columns go through the same str() guard,
# so a missing value (float NaN) no longer raises AttributeError on `.lower()`
# in "FullDescription" while being tolerated in "Title".
tokenizer = nltk.tokenize.WordPunctTokenizer()
for text_column in text_columns:
    data[text_column] = data[text_column].apply(
        lambda x: ' '.join(tokenizer.tokenize(str(x).lower())))

In [None]:
import string
def punct_check(text):
    """Tell whether *text* contains at least one ASCII punctuation character.

    :param text: string to inspect
    :returns: True if any character of ``string.punctuation`` occurs in
        *text*, otherwise False (always a real bool, never None)
    """
    return any(symbol in text for symbol in string.punctuation)

In [None]:
# Occurrence counts of every token over "FullDescription" and "Title" together.
token_counts = Counter()

for description_text, title_text in zip(data['FullDescription'].values, data['Title'].values):
    token_counts.update(description_text.split())
    token_counts.update(title_text.split())

# Vocabulary: tokens whose total count lies in [min_count, max_count], that
# contain no punctuation and are not purely digits, in sorted order.
tokens = sorted(
    token
    for token, count in token_counts.items()
    if min_count <= count <= max_count
    and not punct_check(token)
    and not token.isdigit()
)

In [None]:
import gensim.downloader 
# Load the pretrained "glove-wiki-gigaword-100" word vectors through gensim's downloader API.
embeddings = gensim.downloader.load("glove-wiki-gigaword-100")

In [None]:
# Build the embedding matrix row-aligned with the vocabulary.
# `token in embeddings` works on both gensim 3.x and 4.x, whereas the former
# `embeddings.vocab` attribute was removed in gensim 4. Tokens without a
# pretrained vector get an all-zero row of the model's own dimensionality.
embs_npa = np.array([
    embeddings.get_vector(token) if token in embeddings else np.zeros(embeddings.vector_size)
    for token in tokens
])
# NOTE(review): nothing visible below this cell reads `embeddings` again, so it
# can presumably be released here to free memory - confirm before doing so.

# Prepend the two special tokens. Concatenating (rather than np.insert into an
# existing fixed-width string array) cannot truncate the special tokens.
vocab_npa = np.concatenate((np.array(['<pad>', '<unk>']), np.array(tokens)))
print(vocab_npa[:10])

pad_emb_npa = np.zeros((1, embs_npa.shape[1]))  # embedding for the '<pad>' token
unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)  # embedding for '<unk>': mean of all rows

# Stack so that row 0 is '<pad>', row 1 is '<unk>', then one row per token.
embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))
print(embs_npa.shape)

['<pad>' '<unk>' '000company' '000market' '000multi' '000my' '000the'
 '00am' '00am2' '00am3']
(33002, 100)


In [None]:
# Map every vocabulary entry to its position in vocab_npa.
token_to_id = dict(zip(vocab_npa, range(len(vocab_npa))))

In [None]:
# Ids of the two special tokens: unknown word and padding.
UNK_IX = token_to_id.get('<unk>')
PAD_IX = token_to_id.get('<pad>')

def as_matrix(sequences, max_len=None):
    """Convert a batch of token sequences into a padded matrix of token ids.

    :param sequences: sequence of rows; each row is either a whitespace-separated
        string or an already split list of tokens (decided from the first row)
    :param max_len: number of columns of the result; longer rows are truncated,
        shorter rows are right-padded with PAD_IX. When falsy (None or 0) the
        length of the longest row is used
    :returns: integer array of shape [len(sequences), max_len]; tokens missing
        from token_to_id are encoded as UNK_IX
    """
    # Empty input: nothing to inspect or measure, return an empty matrix
    # instead of failing on sequences[0] / max() of an empty iterable.
    if len(sequences) == 0:
        return np.full((0, max_len or 0), np.int32(PAD_IX))

    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))

    if not max_len:
        max_len = max(map(len, sequences))

    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))
    for i, seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

In [None]:
# we only consider top-1k most frequent companies to minimize memory usage
top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))
recognized_companies = set(top_companies)
data["Company"] = data["Company"].apply(lambda comp: comp if comp in recognized_companies else "Other")

categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)
categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))

DictVectorizer(dtype=<class 'numpy.float32'>, sparse=False)

In [None]:
# Hold out 20% of the rows for validation with a fixed seed, then renumber the
# rows of each part from 0.
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)
data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  195814
Validation size =  48954


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


def to_tensors(batch, device):
    """Turn a dict of arrays into a dict of torch tensors placed on *device*.

    The "FullDescription" and "Title" entries are converted to int64 tensors;
    every other entry keeps the dtype that torch.tensor infers for it.

    :param batch: mapping from field name to array-like data
    :param device: device the tensors are created on
    :returns: a new dict with the same keys, in the same order
    """
    token_id_keys = ("FullDescription", "Title")
    return {
        key: (
            torch.tensor(arr, device=device, dtype=torch.int64)
            if key in token_id_keys
            else torch.tensor(arr, device=device)
        )
        for key, arr in batch.items()
    }


def make_batch(data, max_len=None, word_dropout=0, device=device):
    """
    Build one model-ready batch (a dict of torch tensors) from dataframe rows.

    :param data: dataframe slice with the "Title" and "FullDescription" text
        columns and the categorical columns; the target column is optional
    :param max_len: pair (title_len, description_len) of fixed matrix widths
        passed to as_matrix; None (the default) lets as_matrix size each
        matrix by the longest row in the batch
    :param word_dropout: probability with which a non-padding token id of
        "FullDescription" is replaced by UNK_IX ("Title" is left untouched)
    :param device: device the resulting tensors are created on
    :returns: a dict with int64 "Title" and "FullDescription" token-id tensors,
        a "Categorical" tensor and, when present in *data*, the target column
    """
    # The default max_len=None used to crash on max_len[0]; treat it as
    # "no fixed length" for both text fields.
    title_len, descr_len = (None, None) if max_len is None else max_len

    batch = {}
    batch["Title"] = as_matrix(data["Title"].values, title_len)
    batch["FullDescription"] = as_matrix(data["FullDescription"].values, descr_len)
    batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))

    if word_dropout != 0:
        # apply_word_dropout takes the probability of KEEPING a token.
        batch["FullDescription"] = apply_word_dropout(batch["FullDescription"], 1. - word_dropout)

    if TARGET_COLUMN in data.columns:
        batch[TARGET_COLUMN] = data[TARGET_COLUMN].values

    return to_tensors(batch, device)

def apply_word_dropout(matrix, keep_prop, replace_with=UNK_IX, pad_ix=PAD_IX,):
    """Randomly replace token ids in *matrix* with *replace_with*.

    :param matrix: integer array of token ids
    :param keep_prop: probability that a token is kept unchanged
    :param replace_with: id written in place of a dropped token
    :param pad_ix: id of the padding token; padding positions are never replaced
    :returns: a new array shaped like *matrix*
    """
    drop_prob = 1 - keep_prop
    # 1 marks a position selected for replacement, 0 a position that is kept.
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, drop_prob])
    dropout_mask &= matrix != pad_ix
    replacement = np.full_like(matrix, replace_with)
    return np.where(dropout_mask == 1, replacement, matrix)

In [None]:
def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, device=device, **kwargs):
    """Yield mini-batches built by make_batch from consecutive slices of *data*.

    :param data: dataframe to draw rows from
    :param batch_size: maximum number of rows per batch (the last one may be smaller)
    :param shuffle: visit the rows in a fresh random order on every pass;
        when False the rows are visited in their stored order
    :param cycle: start another pass after the data is exhausted instead of stopping
    :param device: device the batch tensors are created on
    :param kwargs: forwarded to make_batch (e.g. word_dropout). ``max_len`` may
        be given here too; it defaults to (padding_title, padding_descr)
    """
    # With cycle=True an empty dataframe would otherwise loop forever
    # without ever yielding a batch.
    if len(data) == 0:
        return

    # A caller-supplied max_len used to collide with the hard-coded keyword
    # below and raise TypeError; now it simply overrides the default.
    kwargs.setdefault('max_len', (padding_title, padding_descr))

    while True:
        indices = np.arange(len(data))
        if shuffle:
            indices = np.random.permutation(indices)

        for start in range(0, len(indices), batch_size):
            rows = data.iloc[indices[start: start + batch_size]]
            yield make_batch(rows, device=device, **kwargs)

        if not cycle:
            break

In [None]:
def print_metrics(model, data, batch_size=BATCH_SIZE, name="", device=torch.device('cpu'), **kw):
    """Evaluate *model* on *data*, print MSE and MAE, and return the MAE.

    The model is switched to eval mode and is left in eval mode afterwards.

    :param model: callable taking a batch dict and returning one prediction per row
    :param data: dataframe to evaluate on (must contain the target column)
    :param batch_size: evaluation batch size
    :param name: label printed in front of the results
    :param device: device the batches are created on
    :param kw: forwarded to iterate_minibatches
    :returns: mean absolute error as a float (NaN when *data* has no rows)
    """
    squared_error = abs_error = 0.0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for batch in iterate_minibatches(data, batch_size=batch_size, shuffle=False, device=device, **kw):
            batch_pred = model(batch)
            residual = batch_pred - batch[TARGET_COLUMN]
            squared_error += torch.sum(torch.square(residual))
            abs_error += torch.sum(torch.abs(residual))
            num_samples += len(batch_pred)

    if num_samples:
        # float() accepts both a plain float and a 0-d tensor on any device.
        mse = float(squared_error) / num_samples
        mae = float(abs_error) / num_samples
    else:
        # No batch was produced: the accumulators are still plain floats and
        # there is nothing to average, so report NaN instead of crashing.
        mse = mae = float('nan')

    print("%s results:" % (name or ""))
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    return mae


In [None]:
class SalaryPredictorConv(nn.Module):
    """Convolutional salary regressor over title, description and categorical features.

    Title and description token ids go through one shared embedding table
    (initialised from ``embs_npa`` and trainable) and one shared two-layer
    Conv1d stack, and are then average-pooled to a single vector each. The
    categorical vector goes through a linear layer + ReLU. The three vectors
    are concatenated and fed to a small MLP that outputs one value per row.

    :param n_tokens: accepted for interface compatibility; not used by the body
    :param n_cat_features: width of the categorical input vector
    :param emb_len: number of input channels of the first convolution
    :param padding_title: pooling window for the title
    :param padding_descr: pooling window for the description
    :param conv_kernel: number of output channels of the text branch and width
        of the categorical branch
    """

    def __init__(self, n_tokens=len(tokens),
                 n_cat_features=len(categorical_vectorizer.vocabulary_),
                 emb_len=100,
                 padding_title=10,
                 padding_descr=200,
                 conv_kernel=32):
        super().__init__()
        self.emb_len = emb_len
        self.padding_title = padding_title
        self.padding_descr = padding_descr
        self.conv_kernel = conv_kernel

        # Pretrained table, unfrozen so that it is fine-tuned during training.
        pretrained_weights = torch.from_numpy(embs_npa).float()
        self.emb = nn.Embedding.from_pretrained(pretrained_weights)
        self.emb.requires_grad_(True)

        wide_channels = conv_kernel * 2
        self.text = nn.Sequential(
            nn.Conv1d(emb_len, wide_channels, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.Conv1d(wide_channels, conv_kernel, kernel_size=6, padding='same'),
            nn.ReLU(),
        )
        # Head over the concatenation [description, title, categorical].
        self.linear = nn.Sequential(
            nn.Linear(conv_kernel * 3, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )
        self.pooling_title = nn.AvgPool1d(padding_title, count_include_pad=False)
        self.pooling_descr = nn.AvgPool1d(padding_descr, count_include_pad=False)
        self.fc_cat = nn.Linear(n_cat_features, conv_kernel)
        self.relu = nn.ReLU()

    def _encode_text(self, token_ids, pooling):
        """Embed, convolve and pool one text field into a [batch, conv_kernel] tensor."""
        # Move the embedding axis in front of the sequence axis for Conv1d.
        embedded = self.emb(token_ids).permute(0, 2, 1)
        features = self.text(embedded)
        return pooling(features).reshape(len(features), self.conv_kernel)

    def forward(self, batch):
        """Predict one value per row of *batch*.

        :param batch: dict with 'FullDescription', 'Title' and 'Categorical' tensors
        :returns: 1-D tensor with one prediction per row
        """
        descr_vec = self._encode_text(batch['FullDescription'], self.pooling_descr)
        title_vec = self._encode_text(batch['Title'], self.pooling_title)
        cat_vec = self.relu(self.fc_cat(batch['Categorical']))

        x = torch.cat((descr_vec, title_vec, cat_vec), dim=1)
        return self.linear(x).reshape(len(x))

In [None]:
from torch.cuda.random import device_count
class SalaryPredictorRecurent(nn.Module):
    """LSTM-based salary regressor over title, description and categorical features.

    Title and description token ids share one embedding table (initialised
    from ``embs_npa`` and trainable) and one unidirectional LSTM; the LSTM
    output at the last time step represents each text. The categorical vector
    goes through a linear layer + ReLU. The three vectors are concatenated and
    passed through a two-layer MLP that outputs one value per row.

    :param n_tokens: accepted for interface compatibility; not used by the body
    :param n_cat_features: width of the categorical input vector
    :param emb_len: LSTM input size
    :param padding_title: stored on the instance; not used by forward
    :param padding_descr: stored on the instance; not used by forward
    :param rec: LSTM hidden size and width of the categorical branch
    :param num_layers: number of stacked LSTM layers
    """

    def __init__(self, n_tokens=len(tokens),
                 n_cat_features=len(categorical_vectorizer.vocabulary_),
                 emb_len=100,
                 padding_title=10,
                 padding_descr=200,
                 rec=32,
                 num_layers=2):
        super().__init__()
        self.emb_len = emb_len
        self.padding_title = padding_title
        self.padding_descr = padding_descr
        self.rec = rec
        self.num_layers = num_layers
        # Pretrained table, unfrozen so that it is fine-tuned during training.
        self.emb = nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float()).requires_grad_(True)
        self.lstm = nn.LSTM(input_size=emb_len,
                            hidden_size=rec,
                            num_layers=num_layers,
                            batch_first=True, bidirectional=False)
        self.fc_cat = nn.Linear(n_cat_features, rec)
        self.fc1 = nn.Linear(rec * 3, 32)
        self.fc2 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def _encode_text(self, token_ids):
        """Return the LSTM output at the last time step, shape [batch, rec]."""
        # No explicit initial state is passed, so the LSTM starts from its
        # default zero state. The former torch.randn states made every forward
        # pass (including evaluation) non-deterministic and were moved to the
        # notebook-global device instead of the device of the input.
        outputs, _ = self.lstm(self.emb(token_ids))
        # NOTE(review): for right-padded inputs the last step is a padding
        # position; packing the sequences may give a better summary - confirm.
        return outputs[:, -1]

    def forward(self, batch):
        """Predict one value per row of *batch*.

        :param batch: dict with 'FullDescription', 'Title' and 'Categorical' tensors
        :returns: 1-D tensor with one prediction per row
        """
        descr = self._encode_text(batch['FullDescription'])
        title = self._encode_text(batch['Title'])
        cat = self.relu(self.fc_cat(batch['Categorical']))

        x = torch.cat((descr, title, cat), dim=1)
        return self.fc2(self.relu(self.fc1(x))).reshape(len(x))

In [None]:
from tqdm.auto import tqdm
EPOCHS = 15  # maximum number of training epochs (early stopping may end training sooner)
LR = 5e-4  # initial learning rate handed to the Adam optimizer

In [None]:
# Model under training: the recurrent predictor with its default hyper-parameters.
model = SalaryPredictorRecurent().to(device)
# Mean absolute error over the batch.
criterion = nn.L1Loss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Scales the initial learning rate by 0.95**epoch; train() steps it once per epoch.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.95**epoch)
def train(model, criterion, optimizer, scheduler, data_train):
    """Train *model* for up to EPOCHS epochs with early stopping on validation MAE.

    After every epoch the model is evaluated on ``data_val`` with
    print_metrics. When the validation MAE improves on the best value so far
    by more than 0.001, the weights are saved to the file 'model'; otherwise a
    counter is incremented and training stops once it reaches the patience (2).

    :param model: module called as ``model(batch)``
    :param criterion: loss called as ``criterion(pred, target)``
    :param optimizer: optimizer over the model parameters
    :param scheduler: learning-rate scheduler, stepped once per epoch
    :param data_train: training dataframe
    """
    # NOTE(review): mae_train / mae_val collect the per-epoch history, but
    # nothing in the visible code returns or plots them - confirm intent.
    prev_loss = float('inf')  # best validation MAE seen so far
    patience = 2
    trigger_times = 0
    mae_val = []
    mae_train = []
    # Ceil division: the last, possibly smaller, batch counts too, so the
    # progress bar total matches the real number of iterations.
    batches_per_epoch = (len(data_train) + BATCH_SIZE - 1) // BATCH_SIZE
    for epoch in range(EPOCHS):
        print(f"epoch: {epoch}")
        model.train()
        temp = []
        for batch in tqdm(
                iterate_minibatches(data_train, batch_size=BATCH_SIZE, device=device),
                total=batches_per_epoch
            ):
            pred = model(batch)
            loss = criterion(pred, batch[TARGET_COLUMN])
            temp.append(float(loss))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        mae_train.append(np.mean(temp))
        curr_loss = print_metrics(model, data_val, device=device)
        mae_val.append(curr_loss)
        if curr_loss + 0.001 > prev_loss:
            trigger_times += 1
            print('trigger times:', trigger_times)
            if trigger_times >= patience:
                print('Early stopping!\nStart to test process.')
                break
        else:
            print('trigger times: 0')
            trigger_times = 0
            torch.save(model.state_dict(), 'model')
            prev_loss = curr_loss