In [None]:
import transformers
import numpy as np
import pandas as pd

import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm.notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
import re

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Covers Python's `random`, the `PYTHONHASHSEED` environment variable,
    NumPy's global generator and PyTorch (CPU and all CUDA devices), and puts
    cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run;
    # switching it off is required alongside `deterministic` for repeatable results.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [None]:
# Load the training table directly from the zipped CSV.
df_train = pd.read_csv(
    '../input/job-salary-prediction/Train_rev1.zip',
    sep=',',
    quotechar='"',
    header=0,
    compression='zip',
)

In [None]:
# Show the loaded training frame as this cell's output.
df_train

In [None]:
# Keep only the columns used downstream (selected by position).
# `.copy()` makes this an independent frame: later cells add columns to it and
# fill missing values, which on a bare slice of `df_train` triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object gets modified.
processed_df = df_train.iloc[:, [1, 2, 4, 6, 7, 8, 10]].copy()
processed_df.head()

In [None]:
# Rough description length: number of pieces obtained by splitting on single spaces.
processed_df['description_length'] = processed_df['FullDescription'].map(
    lambda description: len(description.split(' '))
)

In [None]:
# Replace every missing value with an empty string so the text fields can be
# joined later. Reassigning (rather than `inplace=True`) avoids mutating a frame
# that was derived from a slice of `df_train`.
processed_df = processed_df.fillna('')

In [None]:
# Count the remaining missing values in each column.
processed_df.isna().sum()

In [None]:
# Summary statistics, including extra upper-tail percentiles (90th, 98th, 99th).
processed_df.describe(percentiles=[0.25,0.5,.75,.9,.98,.99])

In [None]:
def process_row(row):
    """Merge the text fields of one job posting into a single lower-cased string.

    Fields are joined with single spaces in this order: title, location,
    contract time, company, category, full description.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with string values under
            'Title', 'FullDescription', 'LocationNormalized', 'ContractTime',
            'Company' and 'Category'.

    Returns:
        The concatenated, lower-cased text.
    """
    field_order = (
        'Title',
        'LocationNormalized',
        'ContractTime',
        'Company',
        'Category',
        'FullDescription',
    )
    combined = " ".join(row[field] for field in field_order)
    return combined.lower()

# Build the model input: axis=1 hands each row to process_row, which returns one combined string.
processed_df['X'] = processed_df.apply(process_row, axis=1)

In [None]:
# Features are the combined text column; the regression target is SalaryNormalized.
X = processed_df['X']
y = processed_df['SalaryNormalized']

In [None]:
# Summarise the combined text column.
X.describe()

In [None]:
# Pieces per document when splitting on single spaces: one more than the number of spaces.
lens = X.map(lambda doc: doc.count(' ') + 1)
lens.describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the remaining 70% form the training set.
# `random_state` pins the shuffle (same value as seed_everything's default) so
# re-running this cell reproduces the same split instead of depending on how
# far the global NumPy generator has advanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X, y, test_size=.3, random_state=1234
)

In [None]:
# Split the held-out rows in half: one half stays validation, the other becomes test.
# `random_state` pins the shuffle so the partition is reproducible.
# NOTE(review): this overwrites val_texts/val_labels, so executing the cell a
# second time without re-running the previous cell halves the validation set again.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    val_texts, val_labels, test_size=.5, random_state=1234
)

In [None]:
# Write the training texts to a one-column CSV.
# NOTE(review): the next cell starts with this exact same write, so this cell looks redundant.
train_texts.to_csv('train_texts.csv',header=['train_texts'], index=False)

In [None]:
# Persist every split as a one-column CSV whose file name and header match the variable name.
for _split_name, _split in [
    ('train_texts', train_texts),
    ('train_labels', train_labels),
    ('val_texts', val_texts),
    ('val_labels', val_labels),
    ('test_texts', test_texts),
    ('test_labels', test_labels),
]:
    _split.to_csv(f'{_split_name}.csv', header=[_split_name], index=False)

In [None]:
# Locations of the two pre-trained word-vector files loaded by build_matrix.
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'

# Hidden size of each direction of the LSTMs in NeuralNet.
LSTM_UNITS = 256
# NeuralNet concatenates max- and mean-pooled outputs of a bidirectional LSTM
# (2 * LSTM_UNITS each), which gives 4 * LSTM_UNITS features for the dense layers.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Value passed as `maxlen` to pad_sequences for every split.
MAX_LEN = 512

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its vector components converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields of the line, each convertible to float.

    Returns:
        Tuple `(word, vector)` where `vector` is a 1-D float32 NumPy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(path):
    """Read a text-format word-vector file into a `{word: vector}` dict.

    Each line is stripped, split on single spaces and passed to `get_coefs`;
    the first field is the word, the rest are the vector components.

    Args:
        path: Path to the embedding file.

    Returns:
        Dict mapping each word to the array produced by `get_coefs`.
    """
    # Decode explicitly as UTF-8: the vocabulary holds non-ASCII tokens, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path, embed_size=300):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Row `i` holds the pre-trained vector of the word whose index is `i`.
    Words without a usable vector keep an all-zero row and are reported back.

    Args:
        word_index: Mapping `{word: integer index}`.
        path: Embedding file handed to `load_embeddings`.
        embed_size: Expected length of every vector (defaults to 300, the value
            that was previously hard-coded).

    Returns:
        Tuple `(embedding_matrix, unknown_words)` where `embedding_matrix` has
        shape `(len(word_index) + 1, embed_size)` and `unknown_words` lists the
        words for which no vector of the expected length was found.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
    unknown_words = []

    for word, i in word_index.items():
        vector = embedding_index.get(word)
        # Reject entries of the wrong length as well as missing ones. A header
        # line such as "2000000 300" is parsed as the word "2000000" with a
        # one-element vector, which NumPy would otherwise broadcast across the
        # whole row without any error.
        if vector is None or vector.shape != (embed_size,):
            unknown_words.append(word)
        else:
            embedding_matrix[i] = vector
    return embedding_matrix, unknown_words

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise via NumPy."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

def train_model(model, train, valid, loss_fn, output_dim, lr=0.001,
                batch_size=64, n_epochs=10,):
    """Train `model` with Adam and an exponentially decaying learning rate.

    After every epoch the mean training loss, the mean validation loss and the
    elapsed time are printed. Weights are written to `model_weights_{epoch}.pt`
    (current working directory) on every even-numbered epoch index.

    Args:
        model: Module to optimise in place; called as `model(*inputs)`.
        train: Dataset whose items are tuples ending with the target tensor.
        valid: Dataset with the same layout, used only for evaluation.
        loss_fn: Callable `(predictions, targets) -> scalar loss tensor`.
        output_dim: Not used by this function; kept for caller compatibility.
        lr: Initial learning rate.
        batch_size: Batch size for both loaders.
        n_epochs: Number of passes over `train`.
    """
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    # Learning rate for epoch index e is lr * 0.6 ** e.
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.

        # `batch` rather than `data`, so the `torch.utils.data` module imported
        # under that name is not shadowed.
        for batch in tqdm(train_loader, disable=False):
            x_batch = batch[:-1]
            y_batch = batch[-1]

            y_pred = model(*x_batch)
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        valid_loss = 0.
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for batch in valid_loader:
                x_batch = batch[:-1]
                y_batch = batch[-1]

                y_pred = model(*x_batch)

                loss = loss_fn(y_pred, y_batch)
                valid_loss += loss.item() / len(valid_loader)

        # Advance the schedule only after this epoch's optimizer steps. Stepping
        # at the top of the loop makes PyTorch (>= 1.1) skip the first value of
        # the schedule, so epoch 0 would already train at 0.6 * lr.
        scheduler.step()

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s \t validation loss={:.4f}'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time, valid_loss))

        if epoch % 2 == 0:
            torch.save(model.state_dict(), f"model_weights_{epoch}.pt")

In [None]:
class NeuralNet(nn.Module):
    """Two stacked bidirectional LSTMs over pre-trained embeddings with a scalar output.

    The sequence output of the second LSTM is max- and mean-pooled over the
    sequence dimension, passed through two parallel dense layers that are added
    back onto the pooled features, and projected to a single value.

    Relies on the notebook-level constants `LSTM_UNITS` and `DENSE_HIDDEN_UNITS`.
    """

    def __init__(self, embedding_matrix):
        """
        Args:
            embedding_matrix: 2-D array `(vocabulary size, embedding size)` used
                to initialise the embedding weights. The weights stay trainable.
        """
        super(NeuralNet, self).__init__()
        # Take both dimensions from the matrix itself. The layer used to be sized
        # from the notebook global `max_features`, which is defined in a later
        # cell and could disagree with the weights assigned just below.
        num_embeddings, embed_size = embedding_matrix.shape

        self.embedding = nn.Embedding(num_embeddings, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding_dropout = nn.Dropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        # Input is 2 * LSTM_UNITS wide because lstm1 is bidirectional.
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)

    def forward(self, x):
        """Compute one output value per input sequence.

        Args:
            x: Integer tensor of token ids, batch first.

        Returns:
            Tensor with one column per row of `x`.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling over the sequence dimension
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling over the sequence dimension
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        # Both dense layers read the pooled features directly (parallel, not stacked).
        h_conc_linear1 = F.relu(self.linear1(h_conc))
        h_conc_linear2 = F.relu(self.linear2(h_conc))

        # Skip connection: pooled features plus both dense transforms.
        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)

        return result

In [None]:
def preprocess(data):
    '''
    Replace punctuation and special symbols with spaces and collapse whitespace.

    Every element is converted to `str`, each character listed in `punct` is
    replaced by a space, and every run of whitespace becomes a single space.
    Leading or trailing spaces are not stripped.

    Args:
        data: pandas Series of texts.

    Returns:
        A Series of cleaned strings.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    # The backslash before '×' is written as '\\' so the literal no longer relies
    # on an invalid escape sequence; the resulting set of characters is unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # One translation table replaces all listed characters in a single pass
    # instead of one str.replace call per character.
    to_space = str.maketrans({symbol: ' ' for symbol in punct})
    whitespace_run = re.compile(r'\s+')

    def clean_special_chars(raw_text):
        return whitespace_run.sub(' ', raw_text.translate(to_space))

    data = data.astype(str).apply(clean_special_chars)
    return data

In [None]:
# Clean the text of each split; the label series are used as they are.
x_train, x_val, x_test = (
    preprocess(texts) for texts in (train_texts, val_texts, test_texts)
)
y_train, y_val, y_test = train_labels, val_labels, test_labels

In [None]:
# Vocabulary-size placeholder; a later cell replaces None with a value derived from the tokenizer.
max_features = None

In [None]:
# NOTE(review): the vocabulary is fitted on the train, test and validation texts
# together, so held-out text influences the word index — confirm this is intended.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts([*x_train, *x_test, *x_val])

# Convert each split to integer id sequences and bring them to a common length
# with pad_sequences(maxlen=MAX_LEN).
x_train, x_val, x_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(split), maxlen=MAX_LEN)
    for split in (x_train, x_val, x_test)
)

In [None]:
# Keep a preset value if there is one; otherwise use len(word_index) + 1, which
# matches the number of rows build_matrix allocates for the embedding matrix.
max_features = max_features or len(tokenizer.word_index) + 1
max_features

In [None]:
# Embedding matrix from the file at CRAWL_EMBEDDING_PATH, plus the vocabulary words it does not contain.
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))


In [None]:
# Embedding matrix from the file at GLOVE_EMBEDDING_PATH, plus the vocabulary words it does not contain.
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

In [None]:
# Stack the two embeddings side by side so every word gets both vectors in one row.
embedding_matrix = np.concatenate((crawl_matrix, glove_matrix), axis=-1)

# The separate matrices are no longer needed; release them before moving data to the GPU.
del crawl_matrix, glove_matrix
gc.collect()


In [None]:
# Check the dimensions of the combined embedding matrix.
embedding_matrix.shape

In [None]:
# Token ids become int64 tensors on the GPU.
x_train_torch, x_val_torch, x_test_torch = (
    torch.tensor(token_ids, dtype=torch.long).cuda()
    for token_ids in (x_train, x_val, x_test)
)

# Targets become float32 column tensors (trailing dimension of size 1) on the GPU.
y_train_torch, y_val_torch, y_test_torch = (
    torch.tensor(labels.to_list(), dtype=torch.float32).unsqueeze(-1).cuda()
    for labels in (y_train, y_val, y_test)
)

In [None]:
# Check the shape of the training target tensor.
y_train_torch.shape

In [None]:
# Wrap the tensors as datasets; the test dataset carries inputs only.
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
val_dataset = data.TensorDataset(x_val_torch, y_val_torch)
test_dataset = data.TensorDataset(x_test_torch)

# Re-seed right before the model is built so weight initialisation is repeatable.
seed_everything(1234)

model = NeuralNet(embedding_matrix)
model.cuda()

# L1 loss means the reported losses are mean absolute errors.
train_model(
    model,
    train_dataset,
    val_dataset,
    loss_fn=nn.L1Loss(),
    output_dim=1,
    batch_size=128,
    n_epochs=8,
)
print()

In [None]:
# Save the weights after the last epoch; train_model itself only checkpoints even epoch indices.
torch.save(model.state_dict(), "model_weights_7.pt")

In [None]:
# Batch size used for inference; it also determines where each batch lands in `test_preds`.
inference_batch_size = 128

test_dataset = data.TensorDataset(x_test_torch)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=inference_batch_size, shuffle=False)
model.cuda()
# Switch to evaluation mode explicitly so dropout is disabled even when this
# cell is run without the training cell having just finished.
model.eval()

test_preds = np.zeros((len(test_dataset), 1))

# Inference needs no gradients; skipping graph construction saves GPU memory.
with torch.no_grad():
    for i, x_batch in enumerate(test_loader):
        y_pred = model(*x_batch).cpu().numpy()

        start = i * inference_batch_size
        test_preds[start:start + len(y_pred), :] = y_pred

In [None]:
# Mean absolute error of the predictions on the test split.
np.abs(test_preds - y_test_torch.detach().cpu().numpy()).mean()

In [None]:
# Put each prediction next to its input text and true label, then write the table to disk.
df = pd.DataFrame({
    'predictions': pd.Series(test_preds.squeeze()),
    'input': test_texts.reset_index(drop=True),
    'Label': y_test_torch.detach().cpu().numpy().ravel(),
})

df.to_csv("final_predictions.csv")

In [None]:
# Show the predictions table as this cell's output.
df