# Notebook setup

Install the torch and torchvision packages

In [4]:
!pip install torch torchvision

Collecting torch
  Downloading torch-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.23.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting filelock (from torch)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (

In [33]:
import os, random, json
from pathlib import Path
import numpy as np
import pandas as pd
import torch, torch.nn as nn
from torch.utils.data import DataLoader, Subset, WeightedRandomSampler
from torchvision import datasets
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
from PIL import Image
import os
from sklearn.model_selection import train_test_split
import torch


In [None]:
# import timm
# from timm.data import resolve_model_data_config, create_transform
# from tqdm import tqdm

In [6]:
# --- Reproducibility: one seed shared by the Python, NumPy and PyTorch RNGs ---
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# --- Folders (relative paths) ---
DATA_DIR = Path("../raw_data/sample1000BALANCED")
OUT_DIR = Path("../models")

# The data folder is only checked; the output folder is created when missing.
status = {"DATA_DIR": "OK" if DATA_DIR.exists() else "NON"}
if not OUT_DIR.exists():
    OUT_DIR.mkdir(parents=True, exist_ok=True)
status["OUT_DIR"] = "OK" if OUT_DIR.exists() else "NON"

# Report the result of the folder check, one line per folder
print("Vérification des dossiers :")
for key, val in status.items():
    print(f"  {key}: {val}")

Vérification des dossiers :
  DATA_DIR: OK
  OUT_DIR: OK


## Dataset loading (1000 imgs / 10 classes)

In [10]:
# Metadata table of the sampled paintings (artist, style, file name, ...)
csv_path = '../raw_data/data_sampling1000_topstyles10.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,artist,style,movement,tags,url,img,file_name,genre_list,Artwork,Date
0,Panayiotis Tetsis,Impressionism,Post-Impressionism,,https://www.wikiart.org/en/panayiotis-tetsis/s...,https://uploads3.wikiart.org/images/panayiotis...,127519-sunset.jpg,['cloudscape'],,
1,Henry Scott Tuke,Impressionism,Impressionism,"['Fun', 'Vacation', 'summer']",https://www.wikiart.org/en/henry-scott-tuke/th...,https://uploads5.wikiart.org/00261/images/henr...,100529-henry-scott-tuke-the-bathers-1922.jpg,['genre painting'],,
2,Alfred Sisley,Impressionism,Impressionism,"['fires-and-floods', 'forests-and-trees', 'Ban...",https://www.wikiart.org/en/alfred-sisley/the-f...,https://uploads1.wikiart.org/images/alfred-sis...,92312-the-flood-on-the-road-to-saint-germain-1...,['landscape'],,
3,Camille Pissarro,Impressionism,Impressionism,"['Grassland', 'Meadow', 'Pasture', 'Plain']",https://www.wikiart.org/en/camille-pissarro/ro...,https://uploads2.wikiart.org/images/camille-pi...,90106-rolling-landscape-in-winter-1875.jpg,['landscape'],,
4,Guy Rose,Impressionism,Impressionism,"['seas-and-oceans', 'cliffs-and-rocks', 'Bank'...",https://www.wikiart.org/en/guy-rose/grey-after...,https://uploads3.wikiart.org/images/guy-rose/g...,104411-grey-afternoon.jpg,['landscape'],,


In [13]:
# Distinct image file names, in order of first appearance in the table
unique_names = df['file_name'].unique()
list_images = unique_names.tolist()

# Train test split

In [21]:
# Features: file names only (kept as a one-column DataFrame); target: style
X = df.loc[:, ['file_name']]
y = df.loc[:, 'style']

# 70/30 split, stratified on the style so both parts keep the class balance
split_options = {"test_size": 0.3, "stratify": y, "random_state": SEED}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [22]:
# Preview the first rows of the training features (file names only)
X_train.head()

Unnamed: 0,file_name
523,188044-untitled-man-with-hat-and-horn-1961.jpg
341,133894-dois-nus-1930.jpg
981,123606-el-paseo-de-colon-1917.jpg
903,158255-soldier-of-the-first-division-1914.jpg
856,34241-the-tambourine-player-1909.jpg


# Create dataframes with 2 columns: file_name and style (one for train, one for test)

In [None]:
# Training dataframe
df_train = X_train.copy()
df_train['style'] = y_train
df_train = df_train.reset_index(drop=True)

In [None]:
# Testing dataframe
df_test = X_test.copy()
df_test['style'] = y_test
df_test = df_test.reset_index(drop=True)

In [25]:
# Preview the first rows of the training dataframe
df_train.head()

Unnamed: 0,file_name,style
0,188044-untitled-man-with-hat-and-horn-1961.jpg,Surrealism
1,133894-dois-nus-1930.jpg,Expressionism
2,123606-el-paseo-de-colon-1917.jpg,Cubism
3,158255-soldier-of-the-first-division-1914.jpg,Cubism
4,34241-the-tambourine-player-1909.jpg,Neoclassicism


In [26]:
# Preview the first rows of the testing dataframe
df_test.head()

Unnamed: 0,file_name,style
0,185097-leda-atomica.jpeg!Large.jpeg,Surrealism
1,111637-seated-man-study-for-bathers-at-asniere...,Post-Impressionism
2,38857-charlet-n-t-lithography-a-moi-les-ancien...,Romanticism
3,122700-forest-path-1911.jpg,Post-Impressionism
4,188227-suite-instruments-de-tortura-1956-1.jpg,Surrealism


# Encode labels

In [34]:
# Map each style name (str) to an integer class id in [0, n_classes - 1].
# Sorting the names makes the ids independent of the row order produced by the
# shuffled train/test split, so the mapping stays stable across runs and seeds.
label2idx = {
    label: i
    for i, label in enumerate(sorted(df_train['style'].dropna().unique()))
}

# Add the label_idx column to both the train and the test dataframes
df_train['label_idx'] = df_train['style'].map(label2idx)
df_test['label_idx']  = df_test['style'].map(label2idx)

# A style missing from the mapping becomes NaN after .map(); fail loudly here
# instead of feeding NaN labels to the model later on.
for split_name, frame in (("train", df_train), ("test", df_test)):
    assert frame['label_idx'].notna().all(), (
        f"{split_name} split contains styles without a class id"
    )

# Load the model 

In [28]:
!pip install open-clip-torch

Collecting open-clip-torch
  Downloading open_clip_torch-3.1.0-py3-none-any.whl.metadata (32 kB)
Collecting regex (from open-clip-torch)
  Downloading regex-2025.9.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting huggingface-hub (from open-clip-torch)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting safetensors (from open-clip-torch)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting timm>=1.0.17 (from open-clip-torch)
  Downloading timm-1.0.19-py3-none-any.whl.metadata (60 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub->open-clip-torch)
  Downloading hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Downloading open_clip_torch-3.1.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━

In [29]:
import open_clip

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the 'ViT-B-32' OpenCLIP model with the 'laion2b_s34b_b79k' pretrained
# weights tag. The call returns three values: the model, a value that is
# discarded here, and the image preprocessing callable kept as `preprocess`.
# NOTE(review): presumably the discarded value is the train-time transform and
# `preprocess` the inference-time one — confirm against the open_clip docs.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='laion2b_s34b_b79k'
)

Check whether the computer has a usable GPU and choose the device accordingly:

In [31]:
def _cuda_is_usable() -> bool:
    """Return True only when a CUDA device is present and can run a kernel.

    ``torch.cuda.is_available()`` can be True for a GPU whose compute
    capability is not supported by the installed PyTorch build (PyTorch then
    only emits a warning). Running a tiny computation on the device detects
    that situation, so the notebook can fall back to the CPU instead of
    failing at the first forward pass.
    """
    if not torch.cuda.is_available():
        return False
    try:
        probe = torch.zeros(1, device="cuda") + 1
        # Make sure any asynchronous launch error surfaces inside this try block
        torch.cuda.synchronize()
        return bool(probe.item() == 1)
    except RuntimeError:
        return False


device = "cuda" if _cuda_is_usable() else "cpu"
model = model.to(device)

    Found GPU0 Quadro P520 which is of cuda capability 6.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 following instructions at
    https://pytorch.org/get-started/locally/
    
Quadro P520 with CUDA capability sm_61 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_70 sm_75 sm_80 sm_86 sm_90 sm_100 sm_120.
If you want to use the Quadro P520 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



# Freeze all model parameters (they are not updated during training)

In [35]:
# Turn off gradient tracking for every parameter of the loaded model
for weight in model.parameters():
    weight.requires_grad = False

# Create a dataset readable by pytorch

In [40]:
from torch.utils.data import Dataset
from PIL import Image
import os
import torch

class PaintingsDataset(Dataset):
    """Map-style dataset yielding ``(image_tensor, label)`` pairs.

    Args:
        df: Table with a ``file_name`` column (image file name inside
            ``image_dir``) and a ``label_idx`` column (class id).
        image_dir: Directory that contains the image files.
        preprocess: Callable applied to the RGB PIL image; its result must
            support ``.to(device)``.
        device: Device on which the returned tensors are placed.
    """

    def __init__(self, df, image_dir, preprocess, device):
        # Re-index to 0..n-1 so the integer positions requested by a
        # DataLoader can be used directly as row labels.
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.preprocess = preprocess
        self.device = device

    def __len__(self):
        """Number of rows (images) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample stored at row ``idx``.

        Returns:
            Tuple ``(image_tensor, label)``, both placed on ``self.device``.
        """
        # Resolve the image path from its file name
        img_name = self.df.loc[idx, 'file_name']
        img_path = os.path.join(self.image_dir, img_name)

        # Open the image inside a context manager so the underlying file handle
        # is released immediately; convert() returns an independent, fully
        # loaded image that remains valid after the file is closed.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        image_tensor = self.preprocess(image).to(self.device)

        # Integer class id of the sample
        label = torch.tensor(self.df.loc[idx, 'label_idx'], device=self.device)

        return image_tensor, label

In [41]:
# Both splits share the same image folder, preprocessing callable and device
train_dataset, test_dataset = (
    PaintingsDataset(split_df, DATA_DIR, preprocess, device)
    for split_df in (df_train, df_test)
)

In [42]:
from torch.utils.data import DataLoader

# Same batch size for both loaders; only the training loader shuffles
loader_kwargs = {"batch_size": 16}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)