In [2]:
import pandas as pd

# Read the seed (training) set and the test set from the working directory.
train_data, test_data = pd.read_csv('seed.csv'), pd.read_csv('test.csv')

# Preview the first rows of both frames; sep='\n' keeps each frame on its own
# lines exactly as two separate print() calls would.
print(train_data.head(), test_data.head(), sep='\n')

                                             example        label
0  Seller: Hello Buyer: Wow love the Couch. SO lo...    furniture
1  Seller: I am selling this for $28500. Buyer: I...          car
2  Buyer: I'd like to negotiate a lower price for...      housing
3  Seller: Hi!  Are you interested in my headphon...  electronics
4  Seller: Hi. Were you interested in the mirror?...    furniture
                                             example      label
0  Seller: Hi are you interested in buying my Pin...  furniture
1  Buyer: Hello I am interested in your property ...    housing
2  Buyer: Hello . How long have you owned the dre...  furniture
3  Buyer: I am very interested place you have for...    housing
4  Buyer: Hey, nice car you have here, how long h...        car


In [3]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(data):
    """Tokenize the ``example`` column of ``data`` with the module-level ``tokenizer``.

    Args:
        data: Frame-like object whose ``'example'`` column holds the raw texts.

    Returns:
        The tokenizer output as PyTorch tensors (``return_tensors='pt'``),
        padded (``padding=True``) and truncated (``truncation=True``).
        Because padding is computed per call, two different frames can come
        back with different sequence lengths.
    """
    texts = data['example'].tolist()
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')


# Encode both splits up front so later cells can reuse the tensors.
train_inputs, test_inputs = tokenize_data(train_data), tokenize_data(test_data)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Map the string class names in the 'label' column to integer ids.
label_encoder = LabelEncoder()

# Fit on the training labels only, then reuse the same mapping for the test
# labels so that a given id means the same class in both splits.
train_labels = label_encoder.fit_transform(train_data['label'])
test_labels = label_encoder.transform(test_data['label'])

In [5]:
import torch
from transformers import BertForSequenceClassification, AdamW

# ...

# Move the model to GPU if available
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Create Model: pretrained BERT encoder plus a classification head with one
# output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# Left as the cell's final bare expression so the notebook still echoes the
# model architecture after moving it to the selected device.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

# Convert the encoded labels to a LongTensor on the training device.
# as_tensor accepts both the numpy array produced by the label encoder and an
# already-converted tensor, so re-running this cell does not emit the
# copy-construct warning that torch.tensor(tensor) does.
train_labels = torch.as_tensor(train_labels, dtype=torch.long).to(device)

train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the PyTorch optimizer explicitly. The bare name `AdamW` is imported from
# `transformers` in an earlier cell (a deprecated implementation) and from
# `torch.optim` in a later one, so which optimizer this cell built used to
# depend on the order cells were executed in. eps / weight_decay are set to the
# defaults of the transformers implementation so that the effective
# hyperparameters of a fresh top-to-bottom run are kept.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Not used by the loop below: the model computes the loss itself when `labels`
# is passed. Kept so the name stays defined for any cell that refers to it.
loss_fn = CrossEntropyLoss()

num_epochs = 3  # You may need to adjust the number of epochs

for epoch in range(num_epochs):
    model.train()
    for inputs, attention_mask, labels in train_loader:
        # Batches are moved per step; the dataset's input tensors were never
        # moved to the device as a whole.
        inputs, attention_mask, labels = inputs.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()




In [7]:
model.eval()

# Move the tokenised test set to the model's device once.
test_inputs = {key: val.to(device) for key, val in test_inputs.items()}

# Score the test set in mini-batches. Pushing every example through the model
# in a single forward pass materialises the activations for the whole set at
# once and can exhaust device memory; slicing keeps the peak bounded while
# producing the same per-example predictions.
eval_batch_size = 32
num_test_examples = test_inputs['input_ids'].shape[0]
batch_predictions = []
with torch.no_grad():
    for start in range(0, num_test_examples, eval_batch_size):
        batch = {key: val[start:start + eval_batch_size] for key, val in test_inputs.items()}
        batch_predictions.append(torch.argmax(model(**batch).logits, dim=1))
predictions = torch.cat(batch_predictions).cpu().numpy()

# Evaluate accuracy or other metrics
accuracy = (predictions == test_labels).mean()
print(f"Accuracy: {accuracy}")


Accuracy: 0.827


In [8]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load and preprocess data: re-read the seed set and rebuild the tensors the
# training loop consumes, reusing the already-fitted label encoder.
train_data = pd.read_csv('seed.csv')
train_inputs = tokenize_data(train_data)
train_labels = torch.tensor(
    label_encoder.transform(train_data['label']),
    dtype=torch.long,
).to(device)
train_dataset = TensorDataset(
    train_inputs['input_ids'],
    train_inputs['attention_mask'],
    train_labels,
)

# Initialize the model: reload the pretrained checkpoint, replacing whatever
# `model` was bound to before this cell.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)
model.to(device)

# Training hyperparameters
loss_fn = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# Active learning loop
num_iterations = 5  # You can adjust the number of active learning iterations
uncertain_fraction = 0.1  # You can adjust the fraction of examples to retrain


def _train_one_pass(dataset, batch_size=8):
    """Run one shuffled optimisation pass over ``dataset``.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``.

    Args:
        dataset: Dataset yielding ``(input_ids, attention_mask, labels)`` tuples.
        batch_size: Number of examples per optimisation step.
    """
    # Always switch to training mode here: the caller puts the model in eval
    # mode for scoring between passes, and a pass run in eval mode would train
    # with dropout disabled.
    model.train()
    for inputs, attention_mask, labels in DataLoader(dataset, batch_size=batch_size, shuffle=True):
        inputs, attention_mask, labels = inputs.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        loss = model(inputs, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()


# Move the tokenised training set to the model's device once; this does not
# depend on the iteration, so it is hoisted out of the loop.
train_inputs = {key: val.to(device) for key, val in train_inputs.items()}

for iteration in range(num_iterations):
    # Train the model on the full training set
    _train_one_pass(train_dataset)

    # Score the entire training set once and derive both the accuracy and the
    # per-example predictive entropy from the same logits.
    model.eval()
    with torch.no_grad():
        logits = model(**train_inputs).logits
        predictions = torch.argmax(logits, dim=1)
        correct_predictions = (predictions == train_labels).sum().item()
        accuracy = correct_predictions / len(train_labels)

        probabilities = torch.nn.functional.softmax(logits, dim=1)
        # The small constant keeps log2 finite when a probability is 0.
        entropy = -torch.sum(probabilities * torch.log2(probabilities + 1e-10), dim=1)

    print(f"Iteration {iteration + 1}: Accuracy = {accuracy * 100:.2f}%")

    # Select the top k examples with highest entropy for retraining
    k = int(len(train_dataset) * uncertain_fraction)
    if k == 0:
        # Fewer than 1 / uncertain_fraction examples: nothing to revisit, and
        # an empty shuffled DataLoader should not be constructed.
        continue
    uncertain_indices = torch.argsort(entropy, descending=True)[:k]

    # Retrain the model on the selected uncertain examples. The helper puts
    # the model back into training mode first; previously this pass ran while
    # the model was still in eval mode from the scoring step above.
    uncertain_dataset = TensorDataset(
        train_inputs['input_ids'][uncertain_indices],
        train_inputs['attention_mask'][uncertain_indices],
        train_labels[uncertain_indices],
    )
    _train_one_pass(uncertain_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 1: Accuracy = 30.00%
Iteration 2: Accuracy = 57.00%
Iteration 3: Accuracy = 91.00%
Iteration 4: Accuracy = 98.00%
Iteration 5: Accuracy = 99.00%


In [29]:
from sklearn.metrics import precision_score

# Load the test dataset (assuming it's in a pandas DataFrame)
test_data = pd.read_csv('test.csv')

# Tokenize and encode the test dataset
# NOTE(review): max_length=256 differs from tokenize_data(), which the training
# inputs went through without a max_length — confirm the shorter cap is intended.
test_inputs = tokenizer(test_data['example'].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=256)
test_labels = label_encoder.transform(test_data['label'])

# Move the test dataset to the GPU if available
test_inputs = {key: val.to(device) for key, val in test_inputs.items()}
test_labels = torch.tensor(test_labels, dtype=torch.long).to(device)

# Run the model on the test dataset in mini-batches. A single forward pass over
# every test example materialises the activations for the whole set at once
# and can exhaust device memory; slicing keeps the peak bounded while yielding
# the same per-example logits.
eval_batch_size = 32
model.eval()
batch_logits = []
with torch.no_grad():
    for start in range(0, len(test_labels), eval_batch_size):
        batch = {key: val[start:start + eval_batch_size] for key, val in test_inputs.items()}
        batch_logits.append(model(**batch).logits)
logits = torch.cat(batch_logits)
predictions = torch.argmax(logits, dim=1)

# Decode the predictions using the label encoder
decoded_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())

# Calculate precision score (per-class precision averaged with class-support weights)
precision = precision_score(test_labels.cpu().numpy(), predictions.cpu().numpy(), average='weighted')

# Print the classification results
results = pd.DataFrame({'Example': test_data['example'], 'True Label': test_data['label'], 'Predicted Label': decoded_predictions})
print(results)

# Print precision score
print(f"\nPrecision on Test Set: {precision:.4f}")


                                               Example   True Label  \
0    Seller: Hi are you interested in buying my Pin...    furniture   
1    Buyer: Hello I am interested in your property ...      housing   
2    Buyer: Hello . How long have you owned the dre...    furniture   
3    Buyer: I am very interested place you have for...      housing   
4    Buyer: Hey, nice car you have here, how long h...          car   
..                                                 ...          ...   
995  Buyer: Hi! Seller: Hello.  How are you? Buyer:...    furniture   
996  Seller: Hi how are you? Buyer: I'm wonderful! ...  electronics   
997  Buyer: hello I am interested in the yukon you ...          car   
998  Seller: Hi there, are you interested in my pro...      housing   
999  Seller: Hello. Buyer: Hi. Would you do 10$ on ...         bike   

    Predicted Label  
0         furniture  
1           housing  
2         furniture  
3           housing  
4               car  
..             