## Basics

### Imports

In [39]:
import pandas as pd
import numpy as np
import os

### Working Directory Path

In [40]:
# Google Drive Path for Running on Colab
# from google.colab import drive
# drive.mount('/content/drive')
# path = "drive/MyDrive/MSc Thesis/"

In [55]:
# Local Path for Running Locally
path = "./data/"
path_images = path + "images/"
path_export = path + "features/"
os.makedirs(path_export, exist_ok=True)

### Read Pre-Processed Data

In [42]:
data = pd.read_csv(path + 'agora12_data_pp.csv')
data

Unnamed: 0,Id,FullText,ImageFilename,StartYear,EndYear
0,Agora:Object:Agora XII:1,foot missing. offset neck and echinoid mouth; ...,Agora_Image_2012.54.1450.jpg,-575,-550
1,Agora:Object:Agora XII:2,spreading ring foot with torus outer and conve...,,-525,-500
2,Agora:Object:Agora XII:3,flaring ring foot. torus mouth. ridge at junct...,,-500,-500
3,Agora:Object:Agora XII:4,ring foot. echinoid mouth inset from neck and ...,,-500,-500
4,Agora:Object:Agora XII:5,echinus ring foot. torus mouth; strap handles....,Agora_Image_2012.27.0009.jpg,-525,-500
...,...,...,...,...,...
1985,Agora:Object:Agora XII:2036,fragment of rim and floor with handle. deep ba...,,-350,-320
1986,Agora:Object:Agora XII:2037,handles missing. basin and lid; handles probab...,Agora_Image_2012.55.1261.jpg,-350,-301
1987,Agora:Object:Agora XII:2038,small series. plain flat-topped rim; flaring b...,Agora_Image_2012.25.0184.jpg,-435,-425
1988,Agora:Object:Agora XII:2039,"small series. rim flat on top, roughly moulded...",Agora_Image_2012.55.1268.jpg,-375,-325


## Feature Extraction from Text Data

### TF-IDF

✔ **Best for:** Traditional ML models (Random Forest, XGBoost, SVM).

✔ **Why use it?**
- Captures word importance across the dataset.
- Works well with structured text like archaeological descriptions.
- Computationally efficient, doesn’t require a large dataset.

🔹 **Pros:** Fast, interpretable, low resource usage.

🔹 **Cons:** Doesn’t capture word relationships (e.g., "small bowl" and "bowl small" are treated differently).

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=300,  # Use top 300 important words
    stop_words='english'
)

In [44]:
 # Vectorize the text using TF-IDF
tfidf_vectors = vectorizer.fit_transform(data["FullText"]).toarray()

# Convert to DataFrame
tfidf_vectors = pd.DataFrame(tfidf_vectors, columns=[f"F{i}" for i in range(300)])
tfidf_vectors

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F290,F291,F292,F293,F294,F295,F296,F297,F298,F299
0,0.000000,0.0,0.0,0.112297,0.0,0.0,0.0,0.136783,0.0,0.0,...,0.057870,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1,0.198781,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.091611,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.150587,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.066499,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.226684,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1986,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.165994,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1987,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.150721,0.000000,0.0,0.0,0.0,0.000000,0.0,0.221368,0.0
1988,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.195492,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


In [47]:
tfidf_vectors.to_csv(path_export + 'text_tfidf_vectors.csv', index=False, encoding='utf-8', sep=',', header=True)

### BERT

 Pretrained BERT models convert text (pottery descriptions, classifications, and dimensions) into dense numerical embeddings.

✅ Pros:
- Fast with low compute cost.
- Good for small datasets.
- Understands context better than TF-IDF
- Combines perfectly with images, dimensions, deposits for mixed models.

❌ Cons:
- Doesn’t adapt BERT to archaeology-specific vocabulary.

In [11]:
import torch
from transformers import AutoTokenizer, AutoModel
import os

os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [12]:
print(torch.__version__)
print(torch.cuda.is_available())  # True if CUDA is installed
print(torch.cuda.get_device_name(0))  # Should show NVIDIA GeForce RTX 4080

2.5.1
True
NVIDIA GeForce RTX 4080


**Load BERT**
- "bert-base-uncased" is the standard English BERT — lowercased, pretrained on Wikipedia and BooksCorpus.
- Tokenizer will convert your text into token IDs and attention masks.
- Model will turn those token IDs into a vector representation (embedding).

In [13]:
# Load the tokenizer and BERT model
model_name = "bert-base-uncased"  # "sentence-transformers/all-MiniLM-L6-v2" for optimized embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

**Text to BERT Embedding Function**
- The function takes raw text and returns a 768-dimensional vector (BERT’s hidden size).
- `tokenizer` converts text to token IDs and attention masks.
- `inputs = {key: val.to("cuda")}` moves all input tensors to GPU.
- ``model.to("cuda")`` ensures the model uses the GPU too.
- ``with torch.no_grad()`` tells PyTorch: no gradients needed (inference mode, not training).
- ``outputs.last_hidden_state[:, 0, :]``: grabs the first token’s embedding ([CLS]) which BERT uses to summarize the whole sequence.

In [33]:
def get_bert_embedding_tensor(text):
    if pd.isna(text) or text.strip() == "":
        return torch.zeros(768).to("cuda")  # Fallback for empty text

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move each tensor in the dict to CUDA
    inputs = {key: val.to("cuda") for key, val in inputs.items()}
    model.to("cuda")  # Make sure the model is on CUDA too!

    with torch.no_grad():
        outputs = model(**inputs)

    bert_embedding_tensor = outputs.last_hidden_state[:, 0, :].squeeze()
    return bert_embedding_tensor

**BERT Embedding Tensors**

Apply text to BERT embedding tensor transformation to all records

In [34]:
bert_embedding_tensor_list = data["FullText"].apply(get_bert_embedding_tensor).tolist()

Stack tensors into single tensor for neural network pipelines

In [49]:
bert_embedding_tensors = torch.stack(bert_embedding_tensor_list)
bert_embedding_tensors.shape

torch.Size([1990, 768])

In [53]:
torch.save(bert_embedding_tensors, path_export + "text_bert_embedding_tensors.pt")
# bert_embedding_tensors = torch.load(path + "text_bert_embedding_tensors.pt")

**BERT Embedding Vectors**

Convert tensors to 2D NumPy Matrix for classic ML methods

In [35]:
bert_embedding_vectors = [tensor.cpu().numpy() for tensor in bert_embedding_tensor_list]
bert_embedding_vectors = np.vstack(bert_embedding_vectors)
bert_embedding_vectors = pd.DataFrame(
    bert_embedding_vectors,
    columns=[f"F{i}" for i in range(bert_embedding_vectors.shape[1])]
)
bert_embedding_vectors.shape

(1990, 768)

In [48]:
bert_embedding_vectors.to_csv(path_export + 'text_bert_embedding_vectors.csv', index=False, encoding='utf-8', sep=',',header=True)

### Fine-Tuning BERT

✅ Pros:
- BERT learns from the specific archaeology vocabulary of the data.
- Usually higher performance for text-heavy tasks.

❌ Cons:
- Takes longer to train.
- Harder to combine with images.

In [54]:
#### TO DO: TRY FINE TUNING ######

## Feature Extraction from Images

### Pretrained CNNs (Transfer Learning)

Pretrained Convolutional Neural Networks (CNNs) trained on ImageNet (millions of images) to extract feature vectors.

✅ Pros:
- Good for small dataset, no training needed.
- Despite being trained on everyday objects, they can recognize visual patterns of pottery like:
  - shape contours,
  - textures,
  - edges and symmetries

❌ Cons:
- Not specialized to archaeological images.


#### ResNet

In [60]:
from torchvision import transforms
from torchvision.models import resnet50
from PIL import Image

In [57]:
resnet_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [59]:
resnet = resnet50(weights='DEFAULT')
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove classification layer
resnet.eval().cuda()

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to C:\Users\dimit/.cache\torch\hub\checkpoints\resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:12<00:00, 8.40MB/s]


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [61]:
def extract_resnet_features(image_path):
    image = Image.open(image_path).convert("RGB")
    image_tensor = resnet_transform(image).unsqueeze(0).cuda()
    with torch.no_grad():
        features = resnet(image_tensor).squeeze().cpu().numpy()
    return features  # Shape: 2048

### Vision Transformers (ViT)

Transformer-based model from Hugging Face that treats images like sequences (patches).

- advanced alternative
- modern / cutting-edge method



### Fine-Tuning Pretrained CNNs (End-to-End Training)

✅ Pros:
- Learns specifically from images with pottery
- Potentially higher accuracy

❌ Cons:
- Requires large dataset and more computing power
- Risk of overfitting with only ~500 images.

