In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertModel

In [None]:
# Load the fast tokenizer for the "distilbert-base-uncased" checkpoint; every
# tokenization cell in this notebook calls this object.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [None]:
# Encode two toy sentences as a padded, truncated PyTorch batch; the
# exploration cells below use it to inspect tokenizer and model outputs.
inputs = tokenizer(
    ["my name is dhruv", "hi yolo in pitt"],
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)


In [None]:
# Each split is stored as a pair of CSV files: one with the texts and one with
# the labels. Files are read in the order texts -> labels for every split.
train_df, train_labels = map(
    pd.read_csv,
    ("../input/jobsplits/train_texts.csv", "../input/jobsplits/train_labels.csv"),
)

val_df, val_labels = map(
    pd.read_csv,
    ("../input/jobsplits/val_texts.csv", "../input/jobsplits/val_labels.csv"),
)

test_df, test_labels = map(
    pd.read_csv,
    ("../input/jobsplits/test_texts.csv", "../input/jobsplits/test_labels.csv"),
)

In [None]:
# Extract each split's text column as a plain Python list for the tokenizer.
train_texts = train_df['train_texts'].tolist()
val_texts = val_df['val_texts'].tolist()
test_texts = test_df['test_texts'].tolist()

# tokenizer(train_texts[:256], padding="max_length", truncation=True)

In [None]:
# Standalone pretrained encoder used by the exploration cells in this notebook;
# BertRegression loads its own copy of the same checkpoint.
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Display the first transformer layer to see the sub-modules a single layer contains.
bert.transformer.layer[0]

In [None]:
# Display the current tokenizer output held in `inputs`.
inputs

In [None]:
# Run the tokenized batch through the standalone encoder and display the raw output.
outputs = bert(**inputs)
outputs

In [None]:
# Shape of the first output tensor sliced at index 0 of its second axis
# (presumably the first token of every sequence -- confirm). This is the same
# slice BertRegression.forward passes to its regression head.
outputs[0][:,0].shape

In [None]:
# List the fields present in the tokenizer output.
inputs.keys()

In [None]:
# Tokenize the training texts in fixed-size chunks and collect one tensor per
# example for the token ids and for the attention mask. token_type_ids are
# intentionally not collected.
TRAIN_TOKENIZE_CHUNK = 512  # number of texts handed to the tokenizer per call

train_input_ids = []
train_attention_mask = []

for chunk_start in range(0, len(train_texts), TRAIN_TOKENIZE_CHUNK):
    # A dedicated name keeps the notebook-level `inputs` toy batch (used by the
    # exploration cells) from being overwritten by the last chunk.
    train_encoding = tokenizer(
        train_texts[chunk_start:chunk_start + TRAIN_TOKENIZE_CHUNK],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # extend() iterates over the first dimension, appending one row per text.
    train_input_ids.extend(train_encoding.input_ids)
    train_attention_mask.extend(train_encoding.attention_mask)

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch import nn
import time
from tqdm.notebook import tqdm_notebook as tqdm


In [None]:
class SalaryDataset(Dataset):
    """Map-style dataset pairing per-example token ids and attention masks with a target.

    Args:
        input_ids: Indexable, sized collection with one token-id entry per example.
        attention_masks: Indexable, sized collection with one attention-mask
            entry per example, aligned with ``input_ids``.
        outputs: Indexable, sized collection with one target value per example,
            aligned with ``input_ids``.

    Raises:
        ValueError: If the three collections do not all have the same length.
    """

    def __init__(self, input_ids, attention_masks, outputs):
        # Fail early on misaligned splits instead of hitting an IndexError in
        # the middle of an epoch or silently ignoring trailing labels.
        sizes = (len(input_ids), len(attention_masks), len(outputs))
        if len(set(sizes)) != 1:
            raise ValueError(
                "input_ids, attention_masks and outputs must have the same length, "
                "got {}, {} and {}".format(*sizes)
            )
        self.inputs = input_ids
        self.attention_masks = attention_masks
        self.outputs = outputs

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, output)`` for the example at ``idx``."""
        return self.inputs[idx], self.attention_masks[idx], self.outputs[idx]

In [None]:
# Tokenize the validation texts in fixed-size chunks and collect one tensor per
# example for the token ids and for the attention mask. token_type_ids are
# intentionally not collected.
VAL_TOKENIZE_CHUNK = 512  # number of texts handed to the tokenizer per call

val_input_ids = []
val_attention_mask = []

for chunk_start in range(0, len(val_texts), VAL_TOKENIZE_CHUNK):
    # A dedicated name keeps the notebook-level `inputs` toy batch (used by the
    # exploration cells) from being overwritten by the last chunk.
    val_encoding = tokenizer(
        val_texts[chunk_start:chunk_start + VAL_TOKENIZE_CHUNK],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # extend() iterates over the first dimension, appending one row per text.
    val_input_ids.extend(val_encoding.input_ids)
    val_attention_mask.extend(val_encoding.attention_mask)
    


In [None]:
# Tokenize the test texts in fixed-size chunks and collect one tensor per
# example for the token ids and for the attention mask. token_type_ids are
# intentionally not collected.
TEST_TOKENIZE_CHUNK = 512  # number of texts handed to the tokenizer per call

test_input_ids = []
test_attention_mask = []

for chunk_start in range(0, len(test_texts), TEST_TOKENIZE_CHUNK):
    # A dedicated name keeps the notebook-level `inputs` toy batch (used by the
    # exploration cells) from being overwritten by the last chunk.
    test_encoding = tokenizer(
        test_texts[chunk_start:chunk_start + TEST_TOKENIZE_CHUNK],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # extend() iterates over the first dimension, appending one row per text.
    test_input_ids.extend(test_encoding.input_ids)
    test_attention_mask.extend(test_encoding.attention_mask)
    


In [None]:
# NOTE(review): modules() appears to return a lazy iterator, so this cell likely
# shows only a generator repr rather than the sub-modules themselves -- confirm.
bert.modules()

In [None]:
class BertRegression(nn.Module):
    """DistilBERT encoder followed by a two-layer head that outputs one value per input.

    All encoder parameters are frozen except those of the transformer layers
    from index 4 onward; the head layers are always trainable.
    """

    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Freeze the whole encoder first ...
        for param in self.bert.parameters():
            param.requires_grad = False

        # ... then re-enable training for transformer layers from index 4 onward.
        for param in self.bert.transformer.layer[4:].parameters():
            param.requires_grad = True

        # Take the head width from the encoder config rather than hard-coding it
        # (768 for this checkpoint).
        hidden_size = self.bert.config.dim
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        # Without a non-linearity the two linear layers collapse into a single
        # affine map; ReLU has no parameters, so state_dict keys are unchanged.
        self.activation = nn.ReLU()
        self.linear_out = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one prediction per row of ``input_ids``.

        Args:
            input_ids: Token-id tensor passed to the encoder as ``input_ids``.
            attention_mask: Mask tensor passed to the encoder as ``attention_mask``.

        Returns:
            Tensor whose last dimension has size 1 (output of ``linear_out``).
        """
        # First encoder output, sliced at index 0 of its second axis (presumably
        # the first token of each sequence -- confirm). The model exposes no
        # separate pooled output here.
        first_token_state = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0][:, 0]
        hidden = self.activation(self.linear1(first_token_state))
        return self.linear_out(hidden)

In [None]:
# Wrap every split's token tensors and its label column in a SalaryDataset.
train_dataset = SalaryDataset(
    input_ids=train_input_ids,
    attention_masks=train_attention_mask,
    outputs=train_labels['train_labels'].to_list(),
)
val_dataset = SalaryDataset(
    input_ids=val_input_ids,
    attention_masks=val_attention_mask,
    outputs=val_labels['val_labels'].to_list(),
)
test_dataset = SalaryDataset(
    input_ids=test_input_ids,
    attention_masks=test_attention_mask,
    outputs=test_labels['test_labels'].to_list(),
)

In [None]:
from torch.utils.data import DataLoader
# Loaders for ad-hoc inspection. Only the training split is shuffled: a fixed
# order for validation/test keeps batches aligned with the source rows and
# matches the shuffle=False loaders used in train_model and in the test
# prediction cell.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Pull a single batch from the training loader to sanity-check the collated structure.
outs = next(iter(train_dataloader))

In [None]:
# Display that batch; its elements follow the order returned by
# SalaryDataset.__getitem__ (input ids, attention mask, output).
outs

In [None]:
from transformers import AdamW
from transformers import get_scheduler

In [None]:
def train_model(model, train, valid, loss_fn, lr=0.001,
                batch_size=64, n_epochs=10,):
    """Fine-tune ``model`` on ``train`` and report the loss on ``valid`` after every epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It must already be on the CUDA device because every batch is moved
            there with ``.cuda()``.
        train: Dataset yielding ``(input_ids, attention_mask, target)`` items.
        valid: Dataset with the same item layout, used for validation.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        lr: Accepted for backward compatibility but currently unused; the
            optimizer keeps the fixed 5e-5 learning rate hard-coded below.
        batch_size: Batch size used for both loaders.
        n_epochs: Number of passes over ``train``.

    Returns:
        None. Prints the batch size and a per-epoch summary, and writes
        ``model_weights_{epoch}.pt`` whenever the 0-based epoch index is even.
    """
    # NOTE(review): `lr` is ignored on purpose to keep the current training
    # behaviour for existing callers that rely on the default.
    optimizer = AdamW(model.parameters(), lr=5e-5)

    print(batch_size)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

    # The schedule length is expressed in optimizer steps (batches), so the
    # scheduler must be advanced once per batch, after optimizer.step().
    num_training_steps = n_epochs * len(train_loader)
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.

        for data in tqdm(train_loader, disable=False):
            input_ids = data[0].cuda()
            attention_mask = data[1].cuda()
            # Cast targets to float32: collated Python numbers may arrive as
            # float64/int64, which not every loss accepts next to float32 output.
            y_batch = data[2].cuda().float().unsqueeze(-1)

            y_pred = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            scheduler.step()
            # Running mean of the per-batch losses for this epoch.
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        valid_loss = 0.
        # Validation needs no gradients; skipping the autograd graph saves memory.
        with torch.no_grad():
            for data in valid_loader:
                input_ids = data[0].cuda()
                attention_mask = data[1].cuda()
                y_batch = data[2].cuda().float().unsqueeze(-1)

                y_pred = model(input_ids=input_ids, attention_mask=attention_mask)

                loss = loss_fn(y_pred, y_batch)
                valid_loss += loss.item() / len(valid_loader)

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s \t validation loss={:.4f}'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time, valid_loss))

        # Checkpoint on even 0-based epochs (0, 2, 4, ...).
        if epoch % 2 == 0:
            torch.save(model.state_dict(), f"model_weights_{epoch}.pt")


In [None]:
# Build the regression model and move it to the GPU; train_model sends every
# batch to CUDA, so the model has to live there as well.
model = BertRegression()
model.cuda()

In [None]:
# Fine-tune for five epochs with an L1 (mean absolute error) objective.
train_model(
    model,
    train_dataset,
    val_dataset,
    loss_fn=nn.L1Loss(),
    batch_size=64,
    n_epochs=5,
)
print()

In [None]:
# Predict on the test split in its original order so that row k of test_preds
# corresponds to row k of the test labels.
TEST_BATCH_SIZE = 128
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)
model.cuda()
# Force eval mode explicitly (e.g. so dropout is inactive) instead of relying
# on whatever state earlier cells left the model in.
model.eval()

test_preds = np.zeros((len(test_dataset), 1))

with torch.no_grad():  # inference only: no autograd graph is needed
    for i, data in enumerate(test_loader):
        input_ids = data[0].cuda()
        attention_mask = data[1].cuda()
        y_pred = model(input_ids=input_ids, attention_mask=attention_mask).cpu().numpy()

        # Write this batch into its slot; the last batch may be shorter.
        start = i * TEST_BATCH_SIZE
        test_preds[start:start + len(y_pred), :] = y_pred

In [None]:
# Test-set mean absolute error. The label column is selected explicitly (as the
# dataset and export cells do) so an extra column in the labels CSV cannot be
# folded into the reshape and break the row alignment with test_preds.
np.mean(np.abs(test_preds - test_labels['test_labels'].to_numpy().reshape(-1, 1)))

In [None]:
# Mean absolute error on plain 1-D arrays. Working on NumPy arrays rather than
# the labels DataFrame guarantees a scalar result regardless of how the
# installed pandas version reduces a DataFrame passed to np.mean.
np.mean(np.abs(test_preds[:, 0] - test_labels['test_labels'].to_numpy()))

In [None]:
# Put predictions, source texts and true labels side by side, then write the
# table (including its index) to disk.
df = pd.DataFrame({
    'predictions': pd.Series(test_preds.squeeze()),
    'input': test_texts,
    'Label': test_labels['test_labels'].to_list(),
})

df.to_csv("final_predictions.csv")

In [None]:
# Display the exported predictions table for a final visual check.
df