In [None]:
#!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import random
import os
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification,get_constant_schedule_with_warmup)
from sklearn.metrics import mean_squared_error

In [4]:
# ../input/commonlitreadabilityprize/
#train_data = pd.read_csv('/content/train.csv')
#test_data = pd.read_csv('/content/test.csv')
#sample = pd.read_csv('/content/sample_submission.csv')

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

target = train_data['target'].to_numpy()

def rmse_score(y_true,y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))

In [5]:
print(train_data[train_data['target']==train_data['target'].max()])

             id                                          url_legal  \
2829  25ca8f498  https://sites.ehe.osu.edu/beyondpenguins/files...   

           license                                            excerpt  \
2829  CC BY-SA 3.0  When you think of dinosaurs and where they liv...   

       target  standard_error  
2829  1.71139          0.6469  


In [6]:
# The max target is the simplest text.
row_simple = 2829
row_hard = 1705

simple_text = train_data.iat[row_simple,3]
hard_text = train_data.iat[row_hard,3]

print(simple_text)

When you think of dinosaurs and where they lived, what do you picture? Do you see hot, steamy swamps, thick jungles, or sunny plains? Dinosaurs lived in those places, yes. But did you know that some dinosaurs lived in the cold and the darkness near the North and South Poles?
This surprised scientists, too. Paleontologists used to believe that dinosaurs lived only in the warmest parts of the world. They thought that dinosaurs could only have lived in places where turtles, crocodiles, and snakes live today. Later, these dinosaur scientists began finding bones in surprising places.
One of those surprising fossil beds is a place called Dinosaur Cove, Australia. One hundred million years ago, Australia was connected to Antarctica. Both continents were located near the South Pole. Today, paleontologists dig dinosaur fossils out of the ground. They think about what those ancient bones must mean.


In [7]:
config = {
    'batch_size':128,
    'max_len':256,
    'seed':42,
}

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=config['seed'])

In [8]:
from transformers import DistilBertTokenizer, DistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
class CLRPDataset(nn.Module):
    def __init__(self,df,tokenizer,max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)  
        return encode
    
    def __len__(self):
        return len(self.excerpt)

In [10]:
def get_embeddings(df,plot_losses=True, verbose=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model.to(device)
    model.eval()

    ds = CLRPDataset(df,tokenizer,config['max_len'])
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )
        
    embeddings = list()
    with torch.no_grad():
        for i, inputs in tqdm(enumerate(dl)):
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs[0][:,0].detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)

In [None]:
train_embeddings1 =  get_embeddings(train_data)

cuda is used


In [23]:
train_embeddings1[0].shape

(768,)

In [49]:
dataset = torch.tensor(np.column_stack((train_embeddings1, target)))
train_X = torch.tensor(train_embeddings1)
train_Y = torch.tensor(target)
print(dataset.shape)

torch.Size([2834, 769])


In [None]:
class NeuralNet(nn.Module):
    def __int__(self):
        super(NeuralNet, self).__init__()
        self.hidden1 = nn.Linear(768, 100)
        self.hidden2 = nn.Linear(100, 1)

    def forward(self, x):
        x = nn.ReLU(self.hidden1(x))
        return self.hidden2(x)

In [47]:
model = nn.Sequential(
    nn.Linear(768, 100),
    nn.ReLU(),
    nn.Linear(100, 1)
)


In [57]:
#model = NeuralNet()

output = model(train_X[0])
print(output.shape)

print(type(output))

torch.Size([1])
<class 'torch.Tensor'>


In [65]:
import torch.optim as optim

batch_size = 4

trainloader = torch.utils.data.DataLoader(train_X, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

#testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,                                       shuffle=False, num_workers=2)

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

MAX_EPOCHS = 20

for epoch in range(MAX_EPOCHS):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = train_X, train_Y

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs.float(), labels.float())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
    print('Epoch [%d/%d] loss: %.3f' %
            (epoch + 1, MAX_EPOCHS, running_loss / train_X.shape[0]))

print('Finished Training')

  return F.mse_loss(input, target, reduction=self.reduction)


Finished Training


In [66]:
print(rmse_score(train_Y,model.forward(train_X).detach().numpy()))

1.0332232314516974


In [None]:

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()