In [1]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("llm-aes/asappp-1-2-original")

# View the dataset structure
print(dataset)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score', 'rubrics', 'prompt', 'content', 'organization', 'word_choice', 'sentence_fluency', 'conventions', '__index_level_0__'],
        num_rows: 3583
    })
})


In [2]:
# Check column names in the training split
print(dataset['train'].column_names)

# View the first sample from the training split to get an idea of the columns
print(dataset['train'][0])



['essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score', 'rubrics', 'prompt', 'content', 'organization', 'word_choice', 'sentence_fluency', 'conventions', '__index_level_0__']
{'essay_set': 1, 'essay': "Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/s

In [3]:
import re

# Clean function to remove unwanted characters
def clean_text(text):
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters (you can customize this as needed)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase (optional for uncased models)
    text = text.lower()
    return text

# Apply cleaning function to the dataset's text column
dataset = dataset.map(lambda x: {'essay': clean_text(x['essay'])})


In [4]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer (uncased version)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['essay'], padding='max_length', truncation=True, max_length=512)

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Check tokenized data
print(tokenized_datasets['train'][0])


{'essay_set': 1, 'essay': 'dear local newspaper i think effects computers have on people are great learning skillsaffects because they give us time to chat with friendsnew people helps us learn about the globeastronomy and keeps us out of troble thing about dont you think so how would you feel if your teenager is always on the phone with friends do you ever time to chat with your friends or buisness partner about things well now  theres a new way to chat the computer theirs plenty of sites on the internet to do so organization1 organization2 caps1 facebook myspace ect just think now while your setting up meeting with your boss on the computer your teenager is having fun on the phone not rushing to get off cause you want to use it how did you learn about other countrysstates outside of yours well i have by computerinternet its a new way to learn about what going on in our time you might think your child spends a lot of time on the computer but ask them so question about the economy sea 

In [5]:
import torch
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Step 1: Extract the scores from the dataset
labels = torch.tensor(dataset['train']['domain1_score'], dtype=torch.float32)

# Step 2: Normalize using MinMaxScaler
scaler = MinMaxScaler()
normalized_array = scaler.fit_transform(labels.unsqueeze(1).numpy())  # Convert to NumPy array and reshape

# Step 3: Convert to flat list (1D)
normalized_labels = normalized_array.flatten().tolist()

# Step 4: Add normalized labels to dataset
dataset['train'] = dataset['train'].add_column('normalized_score', normalized_labels)


In [6]:
# Remove the column if it already exists
if 'normalized_score' in dataset['train'].column_names:
    dataset['train'] = dataset['train'].remove_columns(['normalized_score'])

# Now add it cleanly
dataset['train'] = dataset['train'].add_column('normalized_score', normalized_labels)

from sklearn.preprocessing import MinMaxScaler
import numpy as np





In [7]:
# Check the first few rows to ensure the column is added
print(dataset['train'][0])


{'essay_set': 1, 'essay': 'dear local newspaper i think effects computers have on people are great learning skillsaffects because they give us time to chat with friendsnew people helps us learn about the globeastronomy and keeps us out of troble thing about dont you think so how would you feel if your teenager is always on the phone with friends do you ever time to chat with your friends or buisness partner about things well now  theres a new way to chat the computer theirs plenty of sites on the internet to do so organization1 organization2 caps1 facebook myspace ect just think now while your setting up meeting with your boss on the computer your teenager is having fun on the phone not rushing to get off cause you want to use it how did you learn about other countrysstates outside of yours well i have by computerinternet its a new way to learn about what going on in our time you might think your child spends a lot of time on the computer but ask them so question about the economy sea 

In [9]:
# Use the .train_test_split method from Hugging Face Datasets
split_dataset = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=42)

# Access the train and test datasets
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# Step 1: Extract raw scores from both train and test splits
train_scores = np.array(train_dataset['domain1_score']).reshape(-1, 1)
test_scores = np.array(test_dataset['domain1_score']).reshape(-1, 1)

# Step 2: Fit scaler on train only (standard practice)
scaler = MinMaxScaler()
scaler.fit(train_scores)

# Step 3: Normalize both sets
train_normalized = scaler.transform(train_scores).flatten().tolist()
test_normalized = scaler.transform(test_scores).flatten().tolist()

# Step 4: Add them back to datasets
# Remove the column if it already exists
if "normalized_score" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns("normalized_score")
if "normalized_score" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns("normalized_score")

# Add the new normalized score column
train_dataset = train_dataset.add_column("normalized_score", train_normalized)
test_dataset = test_dataset.add_column("normalized_score", test_normalized)


# Check the sizes
print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))


Train size: 2866
Test size: 717


In [None]:
import torch
from torch.utils.data import TensorDataset

# Convert Hugging Face columns to tensors
train_data = TensorDataset(
    torch.tensor(train_dataset['input_ids']),
    torch.tensor(train_dataset['attention_mask']),
    torch.tensor(train_dataset['normalized_score'], dtype=torch.float32)
)

test_data = TensorDataset(
    torch.tensor(test_dataset['input_ids']),
    torch.tensor(test_dataset['attention_mask']),
    torch.tensor(test_dataset['normalized_score'], dtype=torch.float32)
)



In [11]:
# Convert to pandas DataFrame
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()

# Save as CSV
train_df.to_csv("train_dataset.csv", index=False)
test_df.to_csv("test_dataset.csv", index=False)

print("✅ Saved train_dataset.csv and test_dataset.csv with normalized scores.")


✅ Saved train_dataset.csv and test_dataset.csv with normalized scores.


In [None]:
# fine tuning the Bert + biLstm model to our data
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
class BertBiLSTMRegressor(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', hidden_dim=128, num_layers=1, dropout=0.3):
        super(BertBiLSTMRegressor, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )

        self.dropout = nn.Dropout(dropout)
        self.regressor = nn.Linear(hidden_dim * 2, 1)  # *2 because of BiLSTM

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            sequence_output = outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)

        lstm_output, _ = self.lstm(sequence_output)  # shape: (batch_size, seq_len, hidden_dim * 2)
        pooled_output = torch.mean(lstm_output, dim=1)  # mean pooling across sequence length
        pooled_output = self.dropout(pooled_output)

        score = self.regressor(pooled_output)  # shape: (batch_size, 1)
        return score.squeeze(1)  # shape: (batch_size,)


In [13]:
model = BertBiLSTMRegressor()




In [None]:
# making the test data and train data set into tesor type to give it to the model
from torch.utils.data import TensorDataset, DataLoader

def create_dataloader(dataset, batch_size=8):
    input_ids = torch.tensor(dataset['input_ids'])
    attention_mask = torch.tensor(dataset['attention_mask'])
    labels = torch.tensor(dataset['normalized_score'], dtype=torch.float)

    data = TensorDataset(input_ids, attention_mask, labels)
    dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
    return dataloader

train_loader = create_dataloader(train_dataset)
test_loader = create_dataloader(test_dataset, batch_size=8)


In [15]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()


In [None]:
# trainning the model
def train(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

for epoch in range(3):  # You can adjust the number of epochs
    avg_loss = train(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")


Epoch 1, Loss: 0.0473
Epoch 2, Loss: 0.0150
Epoch 3, Loss: 0.0125


In [17]:
torch.save(model.state_dict(), "bert_bilstm_essay_scorer.pt")


In [19]:
# Load the model from the saved file
model = BertBiLSTMRegressor()
model.load_state_dict(torch.load("model/bert_bilstm_essay_scorer.pt"))
model.eval()  # Set the model to evaluation mode


BertBiLSTMRegressor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [25]:
def preprocess_essay(essay):
    # Clean the essay
    cleaned_essay = clean_text(essay)
    
    # Tokenize
    inputs = tokenizer(cleaned_essay, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    return inputs

essay = """
In recent years, technology has dramatically transformed the way we communicate, learn, and work. The rapid development of the internet and digital devices has created new opportunities for people around the world to connect with one another. While this technological revolution has brought about many positive changes, it has also raised concerns about its potential negative effects on society.

One of the most significant benefits of modern technology is its ability to facilitate communication. Social media platforms, video conferencing tools, and messaging apps have made it easier for people to stay in touch with friends, family, and colleagues, regardless of distance. These tools have been particularly important during the COVID-19 pandemic, when many individuals and businesses had to rely on virtual communication to continue functioning.

Another major advantage of technology is its role in education. Online learning platforms and digital resources have made education more accessible to individuals who may not have had the opportunity to attend traditional schools. Students can now access courses and materials from top universities and educators, regardless of their location or financial situation. This has the potential to level the playing field and provide more people with the skills they need to succeed in the modern workforce.

However, despite these advantages, there are several concerns associated with the rapid growth of technology. One of the most pressing issues is the potential for increased social isolation. As more people spend time interacting with others online, they may neglect face-to-face relationships, leading to feelings of loneliness and disconnection. Additionally, the overuse of technology can lead to addiction, as individuals may become overly reliant on their devices for entertainment, socialization, and work.

Another concern is the impact of technology on privacy. With the rise of social media, online shopping, and data-driven advertising, personal information is being collected and shared more than ever before. This has raised concerns about the potential for misuse of personal data and the erosion of privacy rights. Many people are unaware of the extent to which their data is being collected, and there is a growing need for stronger regulations to protect individuals' privacy.

In conclusion, while technology has brought about numerous benefits, it is important to recognize the potential downsides. As we continue to embrace new digital tools, we must be mindful of their impact on our social lives, privacy, and overall well-being. By finding a balance between the advantages and disadvantages of technology, we can ensure that it continues to serve humanity in a positive and meaningful way.
"""


# Preprocess the essay
inputs = preprocess_essay(new_essay)


In [26]:
# Ensure model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Preprocess the essay
inputs = preprocess_essay(new_essay)

# Move inputs to the same device as the model
inputs = {k: v.to(device) for k, v in inputs.items()}

# Predict the score
with torch.no_grad():
    predicted_score = model(inputs['input_ids'], inputs['attention_mask'])
    predicted_score = predicted_score.item()  # Get the scalar value from tensor

# Denormalize the predicted score
predicted_score_denormalized = scaler.inverse_transform([[predicted_score]])[0][0]
print(f"Predicted Essay Score: {predicted_score_denormalized}")


Predicted Essay Score: 4.311216354370117
