In [12]:
import numpy as np
import pandas as pd 
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm

# Read the labelled training set, then split it into the essay texts (a plain
# Python list, later fed to the tokenizer) and the scores (the regression targets).
train_data = pd.read_csv('train.csv')
text_column, score_column = train_data['text'], train_data['score']
train_texts = text_column.tolist()
train_scores = score_column.values

In [13]:
# Show the training DataFrame as the cell output for a quick visual check.
train_data

Unnamed: 0,text,score
0,"Dear Newspaper, I believe that computers have ...",4.2
1,"Dear local newspaper, @CAPS1 stand on this hol...",3.0
2,Hi! I am writing in which computers are a bad ...,3.4
3,I don't think they should remove anything that...,3.0
4,"Dear Local Newspaper, I think that using compu...",3.4
...,...,...
2995,Don't you hate when you come across a book tha...,3.0
2996,I believe that the children of today should be...,4.0
2997,"There a books, music, movies, magazines, and e...",3.0
2998,"In my opinion, having computers is avery nice ...",4.2


In [14]:
# Tokenise every training essay with the uncased BERT-large vocabulary.
# Truncation and padding are both enabled so the encodings come back as
# rectangular PyTorch tensors.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Convert the target scores to a float32 tensor so they can be packed into a
# TensorDataset next to the encodings.
train_scores = torch.tensor(train_scores, dtype=torch.float32)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [15]:
# Build the "validation" split from sample_submission.csv and encode it with
# the same tokenizer settings as the training texts.
# NOTE(review): the same file is read again later in this notebook as the
# submission input — confirm its 'score' column holds real labels rather than
# placeholders before trusting any metric computed on it.
validation_data = pd.read_csv('sample_submission.csv')
val_texts = validation_data['text'].tolist()
val_scores = validation_data['score'].values
val_encodings = tokenizer(
    val_texts,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [16]:
# Training loader: (input_ids, attention_mask, score) triples served in
# shuffled mini-batches of 4.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_scores,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)

# Validation loader: same triple layout, mini-batches of 4, no shuffling.
val_targets = torch.tensor(val_scores, dtype=torch.float32)
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    val_targets,
)
val_dataloader = DataLoader(val_dataset, batch_size=4)


In [17]:

# BERT-large with a single-output head (num_labels=1); the one logit is used
# as the predicted score.
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=1)
model.train()

# Mean-squared error between the predicted and the target score.
criterion = nn.MSELoss()

# AdamW with weight decay, driven by a linear schedule with 100 warm-up steps.
# The total step budget is hard-coded as 5 passes over the training loader.
# NOTE(review): that 5 duplicates `epochs` in the training cell — keep them in sync.
weight_decay = 0.1
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=weight_decay)
total_training_steps = len(train_dataloader) * 5
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_training_steps,
)


Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model for a fixed number of epochs, back-propagating through
# the per-batch RMSE (square root of the MSE criterion) and reporting the mean
# batch RMSE at the end of every epoch.
epochs = 5

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(epochs):
    model.train()
    total_rmse = 0.0
    progress = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}')
    for batch in progress:
        optimizer.zero_grad()

        # Move the whole (input_ids, attention_mask, targets) triple to the device.
        input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.view(-1)  # flatten logits to one value per example
        loss = criterion(predictions, targets)
        rmse = torch.sqrt(loss)

        rmse.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not once per epoch

        total_rmse += rmse.item()

    avg_rmse = total_rmse / len(train_dataloader)
    print(f'Epoch {epoch+1}/{epochs}, RMSE: {avg_rmse:.4f}')



In [None]:
# Save the fine-tuned model to the local directory 'bert_large_regression_model'
# so it can be reloaded later with from_pretrained.
model.save_pretrained('bert_large_regression_model')


In [None]:
# Score the submission essays with the fine-tuned model and store the
# predictions in the 'score' column of the submission frame.
submission_data = pd.read_csv('sample_submission.csv')

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

submission_encodings = tokenizer(submission_data['text'].tolist(), truncation=True, padding=True, return_tensors='pt')

submission_dataset = TensorDataset(submission_encodings['input_ids'], submission_encodings['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=4)

# Reload from the directory written by model.save_pretrained earlier in the
# notebook. The path previously carried a stray 'å' prefix, so it did not match
# the saved directory and the load failed.
model = BertForSequenceClassification.from_pretrained('bert_large_regression_model')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Switch to evaluation mode (disables dropout) for deterministic predictions.
model.eval()

predicted_scores = []

# Gradients are not needed for inference; disable them once for the whole loop.
with torch.no_grad():
    for batch in tqdm(submission_dataloader, desc="Inferencing"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Flatten the logits to one value per essay and collect them on the CPU.
        predicted_scores.extend(outputs.logits.view(-1).cpu().numpy())

submission_data['score'] = predicted_scores


In [None]:
# Write the predictions to disk without the index column. The file name
# previously read 'predicted_csv' (underscore, no extension), which looks like
# a typo; it now carries a proper '.csv' suffix.
submission_data.to_csv('predicted.csv', index=False)

In [None]:
# Show the submission DataFrame, including the predicted 'score' column, as the cell output.
submission_data

In [None]:
# Sanity check: the largest predicted score in the submission frame.
submission_data.score.max()