### Salary prediction, episode II: make it actually work (4 points)

Your main task is to use some of the tricks you've learned on the network and analyze if you can improve __validation MAE__. Try __at least 3 options__ from the list below for a passing grade. Write a short report about what you have tried. More ideas = more bonus points. 

__Please be serious:__ " plot learning curves in MAE/epoch, compare models based on optimal performance, test one change at a time. You know the drill :)

You can use either __pytorch__ or __tensorflow__ or any other framework (e.g. pure __keras__). Feel free to adapt the seminar code for your needs. For tensorflow version, consider `seminar_tf2.ipynb` as a starting point.


In [1]:
import torch
import pandas as pd

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
UNK, PAD = "UNK", "PAD"

In [4]:
data = pd.read_csv("./Train_rev1.zip", compression='zip', index_col=None)
data.shape

(244768, 12)

In [5]:
data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')

In [6]:
text_columns = ["Title", "FullDescription"]
categorical_columns = ["Category", "Company", "LocationNormalized", "ContractType", "ContractTime"]
TARGET_COLUMN = "Log1pSalary"

data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast missing values to string "NaN"

In [7]:
import nltk

def tokenize(text):
    if not text or pd.isna(text):
        return ''
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    try:
        tokens = tokenizer.tokenize(text)
    except:
        print(text)
    return ' '.join(tokens).lower()


data['FullDescription'] = data['FullDescription'].apply(tokenize)
data['Title'] = data['Title'].apply(tokenize)


In [8]:
from collections import Counter
token_counts = Counter()

for desc in data["FullDescription"]:
    for token in desc.split():
        token_counts[token] += 1
for desc in data["Title"]:
    for token in desc.split():
        token_counts[token] += 1

In [9]:
min_count = 10

# tokens from token_counts keys that had at least min_count occurrences throughout the dataset
tokens = sorted(t for t, c in token_counts.items() if c >= min_count)#TODO<YOUR CODE HERE>

# Add a special tokens for unknown and empty words
UNK, PAD = "UNK", "PAD"
tokens = [UNK, PAD] + tokens

In [10]:
token_to_id = {token: idx for idx, token in enumerate(tokens)}

In [11]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, max_len=None, min_len=None):
    """ Convert a list of tokens into a matrix with padding """
    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))
        
    max_len = min(max(map(len, sequences)), max_len or float('inf'))

    if min_len is not None and min_len > max_len:
        max_len = min_len
    
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))
    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix
    
    return matrix

In [12]:
from sklearn.feature_extraction import DictVectorizer

# we only consider top-1k most frequent companies to minimize memory usage
top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))
recognized_companies = set(top_companies)
data["Company"] = data["Company"].apply(lambda comp: comp if comp in recognized_companies else "Other")

categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)
categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))

DictVectorizer(dtype=<class 'numpy.float32'>, sparse=False)

In [13]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)
data_train.index = range(len(data_train))
data_val.index = range(len(data_val))

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  195814
Validation size =  48954


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F


device = 'cuda' if torch.cuda.is_available() else 'cpu'


def to_tensors(batch, device):
    batch_tensors = dict()
    for key, arr in batch.items():
        if key in ["FullDescription", "Title"]:
            batch_tensors[key] = torch.tensor(arr, device=device, dtype=torch.int64)
        else:
            batch_tensors[key] = torch.tensor(arr, device=device)
    return batch_tensors


def make_batch(data, max_len=None, min_len=None, word_dropout=0, device=device):
    """
    Creates a keras-friendly dict from the batch data.
    :param word_dropout: replaces token index with UNK_IX with this probability
    :returns: a dict with {'title' : int64[batch, title_max_len]
    """
    batch = {}
    batch["Title"] = as_matrix(data["Title"].values, max_len, min_len)
    batch["FullDescription"] = as_matrix(data["FullDescription"].values, max_len, min_len)
    batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))
    
    if word_dropout != 0:
        batch["FullDescription"] = apply_word_dropout(batch["FullDescription"], 1. - word_dropout)
    
    if TARGET_COLUMN in data.columns:
        batch[TARGET_COLUMN] = data[TARGET_COLUMN].values
    
    return to_tensors(batch, device)

def apply_word_dropout(matrix, keep_prop, replace_with=UNK_IX, pad_ix=PAD_IX,):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1 - keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

In [15]:
def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, device=device, **kwargs):
    """ iterates minibatches of data in random order """
    while True:
        indices = np.arange(len(data))
        if shuffle:
            indices = np.random.permutation(indices)

        for start in range(0, len(indices), batch_size):
            batch = make_batch(data.iloc[indices[start : start + batch_size]], device=device, **kwargs)
            yield batch
        
        if not cycle: break

In [16]:
def print_metrics(model, data, batch_size, name="", device=torch.device('cpu'), **kw):
    squared_error = abs_error = num_samples = 0.0
    model.eval()
    with torch.no_grad():
        for batch in iterate_minibatches(data, batch_size=batch_size, shuffle=False, device=device, **kw):
            batch_pred = model(batch)
            squared_error += torch.sum(torch.square(batch_pred - batch[TARGET_COLUMN]))
            abs_error += torch.sum(torch.abs(batch_pred - batch[TARGET_COLUMN]))
            num_samples += len(batch_pred)
    mse = squared_error.detach().cpu().numpy() / num_samples
    mae = abs_error.detach().cpu().numpy() / num_samples
    print("%s results:" % (name or ""))
    print("Mean square error: %.5f" % mse)
    print("Mean absolute error: %.5f" % mae)
    return mse, mae


In [17]:
from tqdm.auto import tqdm

def train(epochs, model, batch_size, data_train, data_val):
    criterion = nn.MSELoss(reduction='sum')
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        print(f"epoch: {epoch}")
        model.train()
        for i, batch in tqdm(enumerate(
                iterate_minibatches(data_train, batch_size=batch_size, device=device)),
                total=len(data_train) // batch_size
            ):
            pred = model(batch)
            loss = criterion(pred, batch[TARGET_COLUMN])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print_metrics(model, data_val, batch_size, device=device)

In [18]:
def get_params_number(model):
    params_num = 0

    for parameter in model.parameters():
        params_num += np.prod(parameter.size())

    return params_num

### A short report

Please tell us what you did and how did it work.

`<YOUR_TEXT_HERE>`, i guess...

## Recommended options

#### A) CNN architecture

All the tricks you know about dense and convolutional neural networks apply here as well.
* Dropout. Nuff said.
* Batch Norm. This time it's `nn.BatchNorm*`/`L.BatchNormalization`
* Parallel convolution layers. The idea is that you apply several nn.Conv1d to the same embeddings and concatenate output channels.
* More layers, more neurons, ya know...


#### B) Play with pooling

There's more than one way to perform pooling:
* Max over time (independently for each feature)
* Average over time (excluding PAD)
* Softmax-pooling:
$$ out_{i, t} = \sum_t {h_{i,t} \cdot {{e ^ {h_{i, t}}} \over \sum_\tau e ^ {h_{j, \tau}} } }$$

* Attentive pooling
$$ out_{i, t} = \sum_t {h_{i,t} \cdot Attn(h_t)}$$

, where $$ Attn(h_t) = {{e ^ {NN_{attn}(h_t)}} \over \sum_\tau e ^ {NN_{attn}(h_\tau)}}  $$
and $NN_{attn}$ is a dense layer.

The optimal score is usually achieved by concatenating several different poolings, including several attentive pooling with different $NN_{attn}$ (aka multi-headed attention).

The catch is that keras layers do not inlude those toys. You will have to [write your own keras layer](https://keras.io/layers/writing-your-own-keras-layers/). Or use pure tensorflow, it might even be easier :)

#### C) Fun with words

It's not always a good idea to train embeddings from scratch. Here's a few tricks:

* Use a pre-trained embeddings from `gensim.downloader.load`. See last lecture.
* Start with pre-trained embeddings, then fine-tune them with gradient descent. You may or may not download pre-trained embeddings from [here](http://nlp.stanford.edu/data/glove.6B.zip) and follow this [manual](https://keras.io/examples/nlp/pretrained_word_embeddings/) to initialize your Keras embedding layer with downloaded weights.
* Use the same embedding matrix in title and desc vectorizer


#### D) Going recurrent

We've already learned that recurrent networks can do cool stuff in sequence modelling. Turns out, they're not useless for classification as well. With some tricks of course..

* Like convolutional layers, LSTM should be pooled into a fixed-size vector with some of the poolings.
* Since you know all the text in advance, use bidirectional RNN
  * Run one LSTM from left to right
  * Run another in parallel from right to left 
  * Concatenate their output sequences along unit axis (dim=-1)

* It might be good idea to mix convolutions and recurrent layers differently for title and description


#### E) Optimizing seriously

* You don't necessarily need 100 epochs. Use early stopping. If you've never done this before, take a look at [early stopping callback(keras)](https://keras.io/callbacks/#earlystopping) or in [pytorch(lightning)](https://pytorch-lightning.readthedocs.io/en/latest/common/early_stopping.html).
  * In short, train until you notice that validation
  * Maintain the best-on-validation snapshot via `model.save(file_name)`
  * Plotting learning curves is usually a good idea
  
Good luck! And may the force be with you!

In [23]:
batch = make_batch(data_train[:20])

# 1.Cnn arhictecture

In [None]:
# import embbedding
from torch.nn import Embedding
from torch.nn import Conv1d
from torch.nn import BatchNorm1d
from torch.nn import Dropout
from torch.nn import Linear
from torch.nn import ReLU

class WordEmbedding(nn.Module):
    def __init__(self, n_tokens=len(tokens), embed_size=50) -> None:
        super().__init__()
        self.embed = Embedding(n_tokens, embed_size)

    def forward(self, batch):
        return self.embed(batch)

class TitleEncoder(nn.Module):
    def __init__(self, embed: WordEmbedding, embed_size=50, encoding_dim=32) -> None:
        super().__init__()
        self.embed = embed
        self.conv1 = Conv1d(in_channels=embed_size, out_channels=encoding_dim, kernel_size=3)
    
    def forward(self, batch):
        batch = self.embed(batch)
        batch = self.conv1(batch.permute(0, 2, 1))
        return torch.max(batch, dim=2).values

class DescriptionEncoder(nn.Module):
    def __init__(self, embed: WordEmbedding, embed_size=50, encoding_dim=32) -> None:
        super().__init__()
        self.embed = embed
        self.conv1 = Conv1d(in_channels=embed_size, out_channels=encoding_dim, kernel_size=3)
    
    def forward(self, batch):
        batch = self.embed(batch)
        batch = self.conv1(batch.permute(0, 2, 1))
        return torch.max(batch, dim=2).values
    
class CetegoricalEncoder(nn.Module):
    def __init__(self, n_cat_features, encoding_dim=64) -> None:
        super().__init__()
        self.linear = Linear(in_features=n_cat_features, out_features=encoding_dim)
    
    def forward(self, batch):
        return self.linear(batch)


class SalaryPredictor(nn.Module):
    def __init__(self, n_tokens=len(tokens), 
                word_embed_size=50, 
                hidden_dim=32, 
                title_encoding_dim=32, 
                description_encoding_dim=32,
                cat_encoding_dim=32,
                dropout=0.1,
                n_cat_features=len(categorical_vectorizer.vocabulary_)):
        super().__init__()
        self.embed = WordEmbedding(n_tokens, word_embed_size)
        self.title_encoder = TitleEncoder(self.embed, word_embed_size, title_encoding_dim)
        self.description_encoder = DescriptionEncoder(self.embed, word_embed_size, description_encoding_dim)
        self.categorical_encoder = CetegoricalEncoder(n_cat_features, cat_encoding_dim)
        self.relu1 = ReLU()
        self.normalization = BatchNorm1d(title_encoding_dim + description_encoding_dim + cat_encoding_dim)
        self.linear = Linear(
            in_features=title_encoding_dim + description_encoding_dim + cat_encoding_dim, 
            out_features=hidden_dim
        )
        self.relu2 = ReLU()
        self.dropout = Dropout(dropout)
        self.out = Linear(in_features=hidden_dim, out_features=1)

    def forward(self, batch):
        title_encoding = self.title_encoder(batch['Title'])
        description_encoding = self.description_encoder(batch['FullDescription'])
        category_encoding = self.categorical_encoder(batch['Categorical'])
        full_encoding = torch.cat([title_encoding, description_encoding, category_encoding], dim=1)
        full_encoding = self.normalization(self.relu1(full_encoding))
        full_encoding = self.relu2(self.linear(full_encoding))
        full_encoding = self.dropout(full_encoding)
        return self.out(full_encoding).squeeze(1)
        

In [None]:
model = SalaryPredictor(dropout=0.4, hidden_dim=64, description_encoding_dim=64).to(device)

In [None]:
print("The number of parameters:", get_params_number(model))

The number of parameters: 1851581


In [None]:
train(5, model, 16, data_train, data_val)

epoch: 0


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.13340
Mean absolute error: 0.28107
epoch: 1


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.12495
Mean absolute error: 0.26992
epoch: 2


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.10835
Mean absolute error: 0.25048
epoch: 3


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.10460
Mean absolute error: 0.24603
epoch: 4


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.10506
Mean absolute error: 0.24545


# 2. Recurrent

In [None]:
# import embbedding
from torch.nn import Embedding
from torch.nn import LSTM
from torch.nn import BatchNorm1d
from torch.nn import Dropout
from torch.nn import Linear
from torch.nn import ReLU

class WordEmbedding(nn.Module):
    def __init__(self, n_tokens=len(tokens), embed_size=50) -> None:
        super().__init__()
        self.embed = Embedding(n_tokens, embed_size)

    def forward(self, batch):
        return self.embed(batch)

class TitleEncoder(nn.Module):
    def __init__(self, embed: WordEmbedding, embed_size=50, encoding_dim=32, dropout=0.5) -> None:
        super().__init__()
        self.embed = embed
        self.lstm = LSTM(
            dropout=dropout,
            input_size=embed_size, 
            hidden_size=encoding_dim, 
            bidirectional=True,
            batch_first=True
        )
    
    def forward(self, batch):
        batch = self.embed(batch)
        output, (hidden, cell) = self.lstm(batch)
        return torch.cat([hidden[0], hidden[1]], dim=1)

class DescriptionEncoder(nn.Module):
    def __init__(self, embed: WordEmbedding, embed_size=50, encoding_dim=32, dropout=0.5) -> None:
        super().__init__()
        self.embed = embed
        self.lstm = LSTM(
            dropout=dropout,
            input_size=embed_size, 
            hidden_size=encoding_dim, 
            bidirectional=True,
            batch_first=True
        )
    
    def forward(self, batch):
        batch = self.embed(batch)
        output, (hidden, cell) = self.lstm(batch)
        return torch.cat([hidden[0], hidden[1]], dim=1)
    
class CetegoricalEncoder(nn.Module):
    def __init__(self, n_cat_features, encoding_dim=64) -> None:
        super().__init__()
        self.linear = Linear(in_features=n_cat_features, out_features=encoding_dim)
    
    def forward(self, batch):
        return self.linear(batch)


class SalaryPredictor(nn.Module):
    def __init__(self, n_tokens=len(tokens), 
                word_embed_size=50, 
                hidden_dim=32, 
                title_encoding_dim=32, 
                description_encoding_dim=32,
                cat_encoding_dim=32,
                dropout=0.1,
                n_cat_features=len(categorical_vectorizer.vocabulary_)):
        super().__init__()
        self.embed = WordEmbedding(n_tokens, word_embed_size)
        self.title_encoder = TitleEncoder(self.embed, word_embed_size, title_encoding_dim, dropout)
        self.description_encoder = DescriptionEncoder(self.embed, word_embed_size, description_encoding_dim, dropout)
        self.categorical_encoder = CetegoricalEncoder(n_cat_features, cat_encoding_dim)
        self.relu1 = ReLU()
        self.normalization = BatchNorm1d(title_encoding_dim*2 + description_encoding_dim*2 + cat_encoding_dim)
        self.linear = Linear(
            in_features=title_encoding_dim*2 + description_encoding_dim*2 + cat_encoding_dim, 
            out_features=hidden_dim
        )
        self.relu2 = ReLU()
        self.dropout = Dropout(dropout)
        self.out = Linear(in_features=hidden_dim, out_features=1)

    def forward(self, batch):
        title_encoding = self.title_encoder(batch['Title'])
        description_encoding = self.description_encoder(batch['FullDescription'])
        category_encoding = self.categorical_encoder(batch['Categorical'])
        full_encoding = torch.cat([title_encoding, description_encoding, category_encoding], dim=1)
        full_encoding = self.normalization(self.relu1(full_encoding))
        full_encoding = self.relu2(self.linear(full_encoding))
        full_encoding = self.dropout(full_encoding)
        return self.out(full_encoding).squeeze(1)
        

In [None]:
model = SalaryPredictor(
    dropout=0.4, 
    hidden_dim=64, 
    description_encoding_dim=128,  
    title_encoding_dim=64,
    cat_encoding_dim=32
).to(device)



In [None]:
print("The number of parameters:", get_params_number(model))

The number of parameters: 2099805


In [None]:
train(5, model, 16, data_train, data_val)

epoch: 0


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.19278
Mean absolute error: 0.32592
epoch: 1


  0%|          | 0/12238 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 3. Pooling with attention

In [53]:
# import embbedding
from torch.nn import Embedding
from torch.nn import Conv1d
from torch.nn import BatchNorm1d
from torch.nn import Dropout
from torch.nn import Linear
from torch.nn import ReLU

class AttentionPooling(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.linear = Linear(in_features=in_features, out_features=1)
        self.relu = ReLU()
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, batch):
        attent_score = self.linear(batch).squeeze(2)
        attent_score = self.relu(batch)
        attent_score = self.softmax(batch)
        return torch.sum(batch * attent_score, dim=1)

class WordEmbedding(nn.Module):
    def __init__(self, n_tokens=len(tokens), embed_size=50) -> None:
        super().__init__()
        self.embed = Embedding(n_tokens, embed_size)

    def forward(self, batch):
        return self.embed(batch)

class TitleEncoder(nn.Module):
    def __init__(self, embed: WordEmbedding, embed_size=50, encoding_dim=32) -> None:
        super().__init__()
        self.embed = embed
        self.conv1 = Conv1d(in_channels=embed_size, out_channels=encoding_dim, kernel_size=3)
        self.attention1 = AttentionPooling(in_features=encoding_dim)
        self.attention2 = AttentionPooling(in_features=encoding_dim)

    def forward(self, batch):
        batch = self.embed(batch)
        batch = self.conv1(batch.permute(0, 2, 1))
        batch = batch.permute(0, 2, 1)

        attention_1 = self.attention1(batch)
        attention_2 = self.attention2(batch)

        return torch.cat([attention_1, attention_2], dim=1)

class DescriptionEncoder(nn.Module):
    def __init__(self, embed: WordEmbedding, embed_size=50, encoding_dim=32) -> None:
        super().__init__()
        self.embed = embed
        self.conv1 = Conv1d(in_channels=embed_size, out_channels=encoding_dim, kernel_size=3)
        self.attention1 = AttentionPooling(in_features=encoding_dim)
        self.attention2 = AttentionPooling(in_features=encoding_dim)
    
    def forward(self, batch):
        batch = self.embed(batch)
        batch = self.conv1(batch.permute(0, 2, 1))
        batch = batch.permute(0, 2, 1)

        attention_1 = self.attention1(batch)
        attention_2 = self.attention2(batch)

        return torch.cat([attention_1, attention_2], dim=1)
    
class CetegoricalEncoder(nn.Module):
    def __init__(self, n_cat_features, encoding_dim=64) -> None:
        super().__init__()
        self.linear = Linear(in_features=n_cat_features, out_features=encoding_dim)
    
    def forward(self, batch):
        batch = self.linear(batch)
        return batch


class SalaryPredictor(nn.Module):
    def __init__(self, n_tokens=len(tokens), 
                word_embed_size=50, 
                hidden_dim=32, 
                title_encoding_dim=32, 
                description_encoding_dim=32,
                cat_encoding_dim=32,
                dropout=0.1,
                n_cat_features=len(categorical_vectorizer.vocabulary_)):
        super().__init__()
        self.embed = WordEmbedding(n_tokens, word_embed_size)
        self.title_encoder = TitleEncoder(self.embed, word_embed_size, title_encoding_dim)
        self.description_encoder = DescriptionEncoder(self.embed, word_embed_size, description_encoding_dim)
        self.categorical_encoder = CetegoricalEncoder(n_cat_features, cat_encoding_dim)
        self.relu1 = ReLU()
        self.normalization = BatchNorm1d(title_encoding_dim*2 + description_encoding_dim*2 + cat_encoding_dim)
        self.linear = Linear(
            in_features=title_encoding_dim*2 + description_encoding_dim*2 + cat_encoding_dim, 
            out_features=hidden_dim
        )
        self.relu2 = ReLU()
        self.dropout = Dropout(dropout)
        self.out = Linear(in_features=hidden_dim, out_features=1)

    def forward(self, batch):
        title_encoding = self.title_encoder(batch['Title'])
        description_encoding = self.description_encoder(batch['FullDescription'])
        category_encoding = self.categorical_encoder(batch['Categorical'])
        full_encoding = torch.cat([title_encoding, description_encoding, category_encoding], dim=1)
        full_encoding = self.normalization(self.relu1(full_encoding))
        full_encoding = self.relu2(self.linear(full_encoding))
        full_encoding = self.dropout(full_encoding)
        return self.out(full_encoding).squeeze(1)
        

In [54]:
model = SalaryPredictor(dropout=0.4, hidden_dim=64, description_encoding_dim=64).to(device)

In [57]:
print("The number of parameters:", get_params_number(model))

The number of parameters: 1858113


In [59]:
train(1, model, 16, data_train, data_val)

epoch: 0


  0%|          | 0/12238 [00:00<?, ?it/s]

 results:
Mean square error: 0.12346
Mean absolute error: 0.27296
