In [None]:
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import tqdm
tqdm.tqdm.pandas()
import torch
from sklearn.metrics import classification_report

In [None]:
# Hugging Face Hub identifier of the pretrained checkpoint used throughout the notebook.
model_name = 'epfl-dhlab/CatastroBERT'

# Fetch the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Put the model on the GPU when CUDA is available, otherwise keep it on the CPU.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:

def predict(text):
    """Score a single text with the globally loaded ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the model without gradient tracking, and the first output logit is
    squashed with a sigmoid.

    Args:
        text: The string to classify.

    Returns:
        The sigmoid of the first logit for ``text`` as a NumPy scalar.
    """
    # Prepare the text data as a batch of one sequence.
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        return_token_type_ids=True,
        padding=True,
        max_length=512,
        truncation=True,
        return_tensors='pt'
    )

    # Send the inputs to the device the model actually lives on, instead of
    # re-deriving it from CUDA availability: the two differ whenever the model
    # was left on the CPU of a CUDA-capable machine.
    device = model.device
    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    # Get predictions; keyword arguments avoid relying on the positional
    # order of the model's forward() parameters.
    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask).logits

    # Apply the sigmoid function to turn logits into probabilities.
    probs = torch.sigmoid(logits).cpu().numpy()

    # Single input, first output unit.
    return probs[0][0]




If you just want to run individual tests to play around with the model, you can modify the text in the following cell and run it.

In [None]:
# Example usage: edit the sentence below to try the model on your own text.
text = "Un violent ouragan est passé cette nuit sur Lausanne."
probability = predict(text)
print(f"Prediction: {probability}")

The following definitions implement a simple PyTorch dataset and a function to run inference on batches instead of one example at a time. You can use them to run inference on a larger dataset and store the results in a file for later use.


In [None]:

class TextInferenceDataset(Dataset):
    """Map-style dataset that tokenizes raw texts on access, for inference.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors,
    padded/truncated to ``max_length`` so that items can be stacked into batches.

    Args:
        texts: Iterable of texts (list, pandas Series, ...). Items are
            converted with ``str()`` before tokenization.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # Materialize as a list so that ``self.texts[idx]`` is always
        # positional. Indexing a pandas Series with an integer is label-based,
        # which breaks (KeyError or wrong row) as soon as the Series does not
        # carry a default 0..n-1 index, e.g. after filtering a dataframe.
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return its tensors."""
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # flatten them so the DataLoader can add the real batch dimension.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten()
        }
def create_data_loader(texts, tokenizer, batch_size=32, max_length=512):
    """Build a ``DataLoader`` that yields tokenized batches of ``texts``.

    Args:
        texts: Collection of texts to run inference on.
        tokenizer: Tokenizer handed to ``TextInferenceDataset``.
        batch_size: Number of texts per batch.
        max_length: Length every tokenized sequence is padded/truncated to.

    Returns:
        A ``DataLoader`` over a ``TextInferenceDataset`` wrapping ``texts``.
    """
    inference_dataset = TextInferenceDataset(
        texts=texts,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    loader = DataLoader(inference_dataset, batch_size=batch_size)
    return loader

# Function to run inference
def run_inference(model, data_loader):
    """Run ``model`` over every batch of ``data_loader`` and collect scores.

    The model is switched to evaluation mode and gradients are disabled.
    For each batch the first output logit of every example is passed through
    a sigmoid.

    Args:
        model: Model whose output exposes a ``logits`` attribute and whose
            forward pass accepts ``input_ids`` and ``attention_mask``.
        data_loader: Sized iterable of dicts holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.

    Returns:
        A list with one sigmoid score per example, in iteration order.
    """
    model.eval()

    # Resolve the target device once, from the model itself, so the inputs
    # always end up next to the weights (the model may sit on the CPU even
    # when CUDA is available).
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # Parameter-less model: fall back to the availability-based choice.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    predictions = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, total=len(data_loader), desc='Inference'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Keyword arguments avoid relying on the positional order of the
            # model's forward() parameters.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()
            predictions.extend(probs[:, 0])  # Assuming binary classification

    return predictions

You can modify the batch size to fit your hardware constraints. If your dataset is large, I recommend using GPU acceleration. If you don't have a GPU, you can use [Google Colab](https://colab.research.google.com/) to run the notebook on a GPU for free.

In [None]:
# Step 1: Load your data into a Pandas dataframe (a 'text' column is required)
data = pd.read_csv('ressources/data.csv')

# Step 2: Set your batch size and create a DataLoader for your text data
batch_size = 512
data_loader = create_data_loader(data['text'], tokenizer, batch_size, max_length=512)

# Step 3: Run inference on your data and threshold the scores at 0.5
probabilities = run_inference(model, data_loader)
labels = [1 if x > 0.5 else 0 for x in probabilities]

# Pair every label with the text it was computed from. The texts must come
# from `data`, not from a leftover `text` variable of an earlier cell: a
# single string would be broadcast to every row of the output.
predictions = pd.DataFrame({'summary': data['text'].tolist(), 'pred': labels})

# Step 4: Save your predictions
predictions.to_csv('predictions.csv')