# Text Annotation.

### Define the JSON file to train a model. Run the code below if you want to train the model with your own data.

- Set your own paths for the text dataset and the labels (.txt files) within the JSON file.

In [1]:
import active_learning_text
import json

# Define the JSON structure that describes the text training run.
annotation_config_text = {
    "data_type": "text",  # Type "image" for images or "text" for text dataset.
    "data": "./unlabeled_text_data.txt",  # Type the exact path to the data.
    "labels": "./true_text_labels.txt",  # Type the exact path to the labels.
    "active_learning_method": "uncertainty",  # Type "uncertainty" for uncertainty sampling or "entropy" for entropy sampling.
}

# Create the JSON file.
path_to_save_the_json_file = "./annotation_config_text.json"
with open(path_to_save_the_json_file, "w") as json_file:
    json.dump(annotation_config_text, json_file, indent=4)

# Read the configuration back from disk so the branch below is driven by
# what was actually written, not by the in-memory dictionary.
with open(path_to_save_the_json_file, "r") as f:  # Open the JSON file in read mode.
    loaded_config = json.load(f)
data_type = loaded_config["data_type"]

if data_type == "text":
    # Pass the same path the config was written to. A separate hard-coded
    # file name here would silently go stale if the path above is edited.
    active_learning_text.active_learning(path_to_save_the_json_file)  # Train a model with the dataset.

Load the configuration from the JSON file.
Read data from the specified text files and create a DataFrame.
Prepare the data for training and testing.
Selected indices for uncertainty: [2 0 1 5 4 3]
No more samples left in X_test to annotate.
Model saved successfully.


### Use the model to annotate your text data.

- Set your own paths for the text dataset and the labels (.txt files) within the JSON file.

In [2]:
# Configuration describing the text data to annotate with the trained model.
annotation_config_text_annotations = dict(
    data_type="text",  # Type "image" for images or "text" for text dataset.
    classification_type="multi-class",
    data="./unlabeled_data_2.txt",  # Type the exact path to the data.
    labels="./true_labels_2.txt",  # Type the exact path to the labels.
    active_learning_method="uncertainty",  # Type "uncertainty" for uncertainty sampling or "entropy" for entropy sampling.
)

# Serialise the configuration and store it as a JSON file.
path_to_save_the_json_file = "./annotation_config_text_annotations.json"
with open(path_to_save_the_json_file, "w") as config_fh:
    config_fh.write(json.dumps(annotation_config_text_annotations, indent=4))

# Inputs and outputs of the prediction step.
model_path = "./trained_model_for_text_annotation.joblib"  # Path to the model.
output_file = "./predicted_text_labels.txt"  # Path to the output file that will contain the predicted labels.

# The function receives the path of the JSON file, not the dictionary itself.
active_learning_text.predict_and_evaluate(
    model_path=model_path,
    json_file=path_to_save_the_json_file,
    output_file=output_file,
)

Loading data and labels from the JSON file.
Predicted labels saved to ./predicted_text_labels.txt.
JSON configuration saved to ./results.json.


# Image Annotation.

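- Example for image annotation using the CIFAR-10 dataset. The cell below exports the CIFAR-10 training images and their labels to disk so that a model can be trained on them. Set your own paths for the data (images) and the labels.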

In [3]:
import os
from PIL import Image
import numpy as np
from torchvision import datasets, transforms
import active_learning_images

# Upload your images and the labels into your folder. This is an example for CIFAR-10 dataset.

# Tensor -> PIL converter. Built once here and reused for every sample in the
# export loop below, instead of constructing a new converter per image.
transform = transforms.Compose([transforms.ToPILImage()])  # We will save as PIL images.

# Download CIFAR-10 dataset (samples are yielded as tensors via ToTensor).
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
# NOTE(review): the test split is downloaded but not used anywhere in this cell.
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())

# Directory to save images.
save_dir = './cifar_images'
os.makedirs(save_dir, exist_ok=True)

# File to save image paths and labels
image_paths_file = os.path.join(save_dir, 'image_paths.txt')
labels_file = os.path.join(save_dir, 'labels.txt')

# Save images and labels from the CIFAR-10 training split.
# Both index files are opened in write mode, so each run rewrites them from scratch.
# Line i of the paths file corresponds to line i of the labels file.
with open(image_paths_file, 'w') as img_file, open(labels_file, 'w') as label_file:
    for idx, (image_tensor, label) in enumerate(train_dataset):
        # Convert tensor to PIL Image using the shared converter.
        image = transform(image_tensor)

        # Save the image.
        image_path = os.path.join(save_dir, f"image_{idx}.png")
        image.save(image_path)

        # Write the image path and label to the respective files.
        img_file.write(f"{image_path}\n")
        label_file.write(f"{label}\n")

print(f"Images saved to {save_dir}.")
print(f"Image paths saved to {image_paths_file}.")
print(f"Labels saved to {labels_file}.")

Files already downloaded and verified
Files already downloaded and verified
Images saved to ./cifar_images.
Image paths saved to ./cifar_images/image_paths.txt.
Labels saved to ./cifar_images/labels.txt.


### Train a model.

In [4]:
import active_learning_images
import json

# Build the training configuration. Point "data" and "labels" at your own files.
annotation_config_image = dict(
    data_type="image",
    classification_type="multi-class",
    data="./cifar_images/image_paths.txt",
    labels="./cifar_images/labels.txt",
    active_learning_method="uncertainty",  # You can also try "entropy".
)

# Serialise the configuration and store it next to the notebook.
with open('annotation_config_image.json', 'w') as config_fh:
    config_fh.write(json.dumps(annotation_config_image, indent=4))

print("JSON config file created.")

# Run the image active-learning routine on the configuration file just written.
active_learning_images.active_learning('./annotation_config_image.json')

JSON config file created.
No more samples left to annotate.
Model saved successfully.


### Use the model to annotate your image data.

In [13]:
import active_learning_images
import json

# Define the JSON structure.
annotation_config_image = {
    "data_type": "image",
    "classification_type": "multi-class",
    "data": "./cifar_images/image_paths.txt",
    "labels": "./cifar_images/labels.txt",
    "active_learning_method": "uncertainty",  # You can also try "entropy".
}

# Paths used by the prediction step. The config path is defined before the
# file is written so the write and the call below share one value.
json_file = "./annotation_config_image.json"
model_path = "./trained_model_for_image_annotation.joblib"  # Path to the model.
output_file = "./predicted_image_labels.txt"  # Path to the output file that will contain the predicted labels.

# Write the configuration to the exact path handed to predict_and_evaluate.
with open(json_file, 'w') as f:
    json.dump(annotation_config_image, f, indent=4)

# Pass the file path, not the loaded dictionary.
active_learning_images.predict_and_evaluate(model_path=model_path, json_file=json_file, output_file=output_file)

Predicted labels saved to ./predicted_image_labels.txt.
JSON configuration saved to ./results.json.
