https://github.com/huggingface/transformers/issues/17971

In [1]:

from PIL import Image
import pandas as pd

def file_check(x):
    try:
        img = Image.open(f'images/{x}.jpg')
        return img.mode
    except:
        return False
    
df = pd.read_csv('dataset_curso.csv')
df['has_image'] = df['id'].apply(file_check)


In [2]:
df.groupby('has_image').count()

Unnamed: 0_level_0,clean_title,created_utc,id,image_url,linked_submission_id,num_comments,score,upvote_ratio,2_way_label
has_image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,16586,16586,16586,16451,14564,2022,16586,2022,16586
CMYK,1,1,1,1,1,0,1,0,1
L,32,32,32,32,10,22,32,22,32
P,2699,2699,2699,2699,96,2603,2699,2603,2699
RGB,37073,37073,37073,37073,2099,34974,37073,34974,37073
RGBA,9,9,9,9,9,0,9,0,9


In [3]:

image_dataset = df[df['has_image'] == 'RGB']

image_dataset['image'] = image_dataset['id'].apply(lambda x: f'images/{x}.jpg')
image_dataset['labels'] = image_dataset['2_way_label']
df = image_dataset[['image', 'labels']]

df.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_dataset['image'] = image_dataset['id'].apply(lambda x: f'images/{x}.jpg')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_dataset['labels'] = image_dataset['2_way_label']


Unnamed: 0,labels
count,37073.0
mean,0.552531
std,0.497239
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [4]:
df['image'].describe()

count                 37073
unique                37073
top       images/239vnh.jpg
freq                      1
Name: image, dtype: object

In [5]:
from datasets import Dataset, Image

image_dataset = Dataset.from_pandas(df) \
    .cast_column('image', Image()) \
    .train_test_split(test_size=0.2)
image_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['image', 'labels', '__index_level_0__'],
        num_rows: 29658
    })
    test: Dataset({
        features: ['image', 'labels', '__index_level_0__'],
        num_rows: 7415
    })
})

In [6]:
from transformers import AutoImageProcessor

model_id = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(model_id)

In [7]:
def transform(example_batch):
  inputs = image_processor([x for x in example_batch["image"]], return_tensors="pt")
  inputs["labels"] = example_batch["labels"]
  return inputs

In [8]:
prepared_ds = image_dataset.with_transform(transform)

In [9]:
import torch

def collate_fn(batch):
  return {
      "pixel_values": torch.stack([x["pixel_values"] for x in batch]),
      "labels": torch.tensor([x["labels"] for x in batch])
  }

In [10]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(prediction):
  return metric.compute(
    predictions=np.argmax(prediction.predictions, axis=1), 
    references=prediction.label_ids
)

  metric = load_metric("accuracy")


In [11]:
from transformers import ViTForImageClassification


model = ViTForImageClassification.from_pretrained(
    model_id,
    num_labels=2,
    # id2label={str(i): c for i, c in enumerate(labels)},
    # label2id={c: str(i) for i,c in enumerate(labels)}
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./image_model",
    evaluation_strategy="steps",
    num_train_epochs=4,
    learning_rate=2e-4,
    load_best_model_at_end=True,
    remove_unused_columns=False,
)

In [13]:
image_dataset["train"].features

{'image': Image(decode=True, id=None),
 'labels': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    tokenizer=image_processor
)

In [15]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)

  3%|▎         | 500/14832 [16:18<6:40:03,  1.67s/it] 

{'loss': 0.6897, 'learning_rate': 0.00019325782092772384, 'epoch': 0.13}


                                                     
  3%|▎         | 500/14832 [26:58<6:40:03,  1.67s/it]

{'eval_loss': 0.6864238977432251, 'eval_accuracy': 0.5611598111935266, 'eval_runtime': 639.9837, 'eval_samples_per_second': 11.586, 'eval_steps_per_second': 1.448, 'epoch': 0.13}


  7%|▋         | 1000/14832 [43:24<7:05:12,  1.84s/it]  

{'loss': 0.6896, 'learning_rate': 0.0001865156418554477, 'epoch': 0.27}


                                                      
  7%|▋         | 1000/14832 [51:56<7:05:12,  1.84s/it]

{'eval_loss': 0.679474413394928, 'eval_accuracy': 0.5832771409305462, 'eval_runtime': 511.8828, 'eval_samples_per_second': 14.486, 'eval_steps_per_second': 1.811, 'epoch': 0.27}


 10%|█         | 1500/14832 [1:07:56<8:19:58,  2.25s/it] 

{'loss': 0.6926, 'learning_rate': 0.00017977346278317153, 'epoch': 0.4}


                                                        
 10%|█         | 1500/14832 [1:16:27<8:19:58,  2.25s/it]

{'eval_loss': 0.6855337619781494, 'eval_accuracy': 0.5595414699932569, 'eval_runtime': 510.8248, 'eval_samples_per_second': 14.516, 'eval_steps_per_second': 1.815, 'epoch': 0.4}


 13%|█▎        | 2000/14832 [1:32:21<6:34:51,  1.85s/it]   

{'loss': 0.6886, 'learning_rate': 0.00017303128371089536, 'epoch': 0.54}


                                                        
 13%|█▎        | 2000/14832 [1:41:13<6:34:51,  1.85s/it]

{'eval_loss': 0.6853688955307007, 'eval_accuracy': 0.5611598111935266, 'eval_runtime': 531.867, 'eval_samples_per_second': 13.941, 'eval_steps_per_second': 1.743, 'epoch': 0.54}


 17%|█▋        | 2500/14832 [1:57:03<6:23:24,  1.87s/it]   

{'loss': 0.6876, 'learning_rate': 0.00016628910463861921, 'epoch': 0.67}


                                                        
 17%|█▋        | 2500/14832 [2:05:35<6:23:24,  1.87s/it]

{'eval_loss': 0.6869639754295349, 'eval_accuracy': 0.5611598111935266, 'eval_runtime': 512.657, 'eval_samples_per_second': 14.464, 'eval_steps_per_second': 1.808, 'epoch': 0.67}


 20%|██        | 3000/14832 [2:21:42<5:54:53,  1.80s/it]   

{'loss': 0.6872, 'learning_rate': 0.00015954692556634304, 'epoch': 0.81}


                                                        
 20%|██        | 3000/14832 [2:30:19<5:54:53,  1.80s/it]

{'eval_loss': 0.6846593618392944, 'eval_accuracy': 0.5614295347269049, 'eval_runtime': 516.518, 'eval_samples_per_second': 14.356, 'eval_steps_per_second': 1.795, 'epoch': 0.81}


 24%|██▎       | 3500/14832 [2:46:23<6:06:12,  1.94s/it]   

{'loss': 0.6871, 'learning_rate': 0.00015280474649406687, 'epoch': 0.94}


                                                        
 24%|██▎       | 3500/14832 [2:54:55<6:06:12,  1.94s/it]

{'eval_loss': 0.6835988163948059, 'eval_accuracy': 0.5611598111935266, 'eval_runtime': 512.2217, 'eval_samples_per_second': 14.476, 'eval_steps_per_second': 1.81, 'epoch': 0.94}


 27%|██▋       | 4000/14832 [3:10:56<5:51:32,  1.95s/it]   

{'loss': 0.6868, 'learning_rate': 0.00014606256742179073, 'epoch': 1.08}


                                                        
 27%|██▋       | 4000/14832 [3:19:22<5:51:32,  1.95s/it]

{'eval_loss': 0.6839420795440674, 'eval_accuracy': 0.5616992582602832, 'eval_runtime': 506.1689, 'eval_samples_per_second': 14.649, 'eval_steps_per_second': 1.831, 'epoch': 1.08}


 30%|███       | 4500/14832 [3:35:03<5:08:33,  1.79s/it]   

{'loss': 0.6842, 'learning_rate': 0.00013932038834951456, 'epoch': 1.21}


                                                        
 30%|███       | 4500/14832 [3:43:29<5:08:33,  1.79s/it]

{'eval_loss': 0.7286498546600342, 'eval_accuracy': 0.4426163182737694, 'eval_runtime': 506.5578, 'eval_samples_per_second': 14.638, 'eval_steps_per_second': 1.83, 'epoch': 1.21}


 34%|███▎      | 5000/14832 [3:59:17<4:47:41,  1.76s/it]   

{'loss': 0.6885, 'learning_rate': 0.00013257820927723842, 'epoch': 1.35}


                                                        
 34%|███▎      | 5000/14832 [4:07:43<4:47:41,  1.76s/it]

{'eval_loss': 0.6806064248085022, 'eval_accuracy': 0.584086311530681, 'eval_runtime': 505.9861, 'eval_samples_per_second': 14.655, 'eval_steps_per_second': 1.832, 'epoch': 1.35}


 37%|███▋      | 5500/14832 [4:23:11<4:51:35,  1.87s/it]   

{'loss': 0.6858, 'learning_rate': 0.00012583603020496225, 'epoch': 1.48}


                                                        
 37%|███▋      | 5500/14832 [4:31:35<4:51:35,  1.87s/it]

{'eval_loss': 0.6828432679176331, 'eval_accuracy': 0.5616992582602832, 'eval_runtime': 503.706, 'eval_samples_per_second': 14.721, 'eval_steps_per_second': 1.84, 'epoch': 1.48}


 40%|████      | 6000/14832 [4:47:21<4:31:40,  1.85s/it]   

{'loss': 0.6843, 'learning_rate': 0.00011909385113268609, 'epoch': 1.62}


                                                        
 40%|████      | 6000/14832 [4:55:51<4:31:40,  1.85s/it]

{'eval_loss': 0.6802411675453186, 'eval_accuracy': 0.5811193526635199, 'eval_runtime': 509.8615, 'eval_samples_per_second': 14.543, 'eval_steps_per_second': 1.818, 'epoch': 1.62}


 44%|████▍     | 6500/14832 [5:11:41<4:14:45,  1.83s/it]   

{'loss': 0.6805, 'learning_rate': 0.00011235167206040992, 'epoch': 1.75}


                                                        
 44%|████▍     | 6500/14832 [5:20:03<4:14:45,  1.83s/it]

{'eval_loss': 0.6830241680145264, 'eval_accuracy': 0.5700606877950101, 'eval_runtime': 501.5634, 'eval_samples_per_second': 14.784, 'eval_steps_per_second': 1.848, 'epoch': 1.75}


 47%|████▋     | 7000/14832 [5:35:37<3:55:19,  1.80s/it]   

{'loss': 0.6782, 'learning_rate': 0.00010560949298813377, 'epoch': 1.89}


                                                        
 47%|████▋     | 7000/14832 [5:44:06<3:55:19,  1.80s/it]

{'eval_loss': 0.6787985563278198, 'eval_accuracy': 0.5807147673634525, 'eval_runtime': 509.1635, 'eval_samples_per_second': 14.563, 'eval_steps_per_second': 1.821, 'epoch': 1.89}


 51%|█████     | 7500/14832 [5:59:57<3:46:34,  1.85s/it]   

{'loss': 0.6783, 'learning_rate': 9.886731391585761e-05, 'epoch': 2.02}


                                                        
 51%|█████     | 7500/14832 [6:08:34<3:46:34,  1.85s/it]

{'eval_loss': 0.6831994652748108, 'eval_accuracy': 0.5761294672960215, 'eval_runtime': 516.7385, 'eval_samples_per_second': 14.35, 'eval_steps_per_second': 1.794, 'epoch': 2.02}


 54%|█████▍    | 8000/14832 [6:23:49<3:25:21,  1.80s/it]   

{'loss': 0.6862, 'learning_rate': 9.212513484358145e-05, 'epoch': 2.16}


                                                        
 54%|█████▍    | 8000/14832 [6:32:11<3:25:21,  1.80s/it]

{'eval_loss': 0.6789869666099548, 'eval_accuracy': 0.5838165879973027, 'eval_runtime': 501.5732, 'eval_samples_per_second': 14.783, 'eval_steps_per_second': 1.848, 'epoch': 2.16}


 57%|█████▋    | 8500/14832 [6:47:30<3:08:23,  1.79s/it]   

{'loss': 0.6841, 'learning_rate': 8.53829557713053e-05, 'epoch': 2.29}


                                                        
 57%|█████▋    | 8500/14832 [6:55:52<3:08:23,  1.79s/it]

{'eval_loss': 0.6784464716911316, 'eval_accuracy': 0.5890761968981794, 'eval_runtime': 502.2036, 'eval_samples_per_second': 14.765, 'eval_steps_per_second': 1.846, 'epoch': 2.29}


 61%|██████    | 9000/14832 [7:13:15<3:17:31,  2.03s/it]   

{'loss': 0.6761, 'learning_rate': 7.864077669902913e-05, 'epoch': 2.43}


                                                        
 61%|██████    | 9000/14832 [7:22:00<3:17:31,  2.03s/it]

{'eval_loss': 0.6748365759849548, 'eval_accuracy': 0.596223870532704, 'eval_runtime': 524.6276, 'eval_samples_per_second': 14.134, 'eval_steps_per_second': 1.767, 'epoch': 2.43}


 64%|██████▍   | 9500/14832 [7:37:47<3:00:24,  2.03s/it]   

{'loss': 0.6757, 'learning_rate': 7.189859762675297e-05, 'epoch': 2.56}


                                                        
 64%|██████▍   | 9500/14832 [7:46:06<3:00:24,  2.03s/it]

{'eval_loss': 0.6769204139709473, 'eval_accuracy': 0.5885367498314228, 'eval_runtime': 499.3299, 'eval_samples_per_second': 14.85, 'eval_steps_per_second': 1.856, 'epoch': 2.56}


 67%|██████▋   | 10000/14832 [8:01:35<2:32:00,  1.89s/it]  

{'loss': 0.6751, 'learning_rate': 6.51564185544768e-05, 'epoch': 2.7}


                                                         
 67%|██████▋   | 10000/14832 [8:10:00<2:32:00,  1.89s/it]

{'eval_loss': 0.6756672859191895, 'eval_accuracy': 0.5913688469318948, 'eval_runtime': 504.9822, 'eval_samples_per_second': 14.684, 'eval_steps_per_second': 1.836, 'epoch': 2.7}


 71%|███████   | 10500/14832 [8:25:37<2:10:26,  1.81s/it]   

{'loss': 0.6777, 'learning_rate': 5.841423948220065e-05, 'epoch': 2.83}


                                                         
 71%|███████   | 10500/14832 [8:33:55<2:10:26,  1.81s/it]

{'eval_loss': 0.6729500889778137, 'eval_accuracy': 0.5939312204989885, 'eval_runtime': 497.8827, 'eval_samples_per_second': 14.893, 'eval_steps_per_second': 1.862, 'epoch': 2.83}


 74%|███████▍  | 11000/14832 [8:49:17<1:57:49,  1.84s/it]   

{'loss': 0.6712, 'learning_rate': 5.1672060409924495e-05, 'epoch': 2.97}


                                                         
 74%|███████▍  | 11000/14832 [8:57:48<1:57:49,  1.84s/it]

{'eval_loss': 0.6710619330406189, 'eval_accuracy': 0.5975724881995954, 'eval_runtime': 511.6107, 'eval_samples_per_second': 14.493, 'eval_steps_per_second': 1.812, 'epoch': 2.97}


 78%|███████▊  | 11500/14832 [9:13:27<1:36:50,  1.74s/it]   

{'loss': 0.6762, 'learning_rate': 4.4929881337648325e-05, 'epoch': 3.1}


                                                         
 78%|███████▊  | 11500/14832 [9:21:50<1:36:50,  1.74s/it]

{'eval_loss': 0.6702340245246887, 'eval_accuracy': 0.6006743088334457, 'eval_runtime': 502.8142, 'eval_samples_per_second': 14.747, 'eval_steps_per_second': 1.844, 'epoch': 3.1}


 81%|████████  | 12000/14832 [9:37:26<1:56:01,  2.46s/it]   

{'loss': 0.6664, 'learning_rate': 3.818770226537217e-05, 'epoch': 3.24}


                                                         
 81%|████████  | 12000/14832 [9:45:49<1:56:01,  2.46s/it]

{'eval_loss': 0.6701256036758423, 'eval_accuracy': 0.5968981793661498, 'eval_runtime': 502.9959, 'eval_samples_per_second': 14.742, 'eval_steps_per_second': 1.843, 'epoch': 3.24}


 84%|████████▍ | 12500/14832 [10:01:26<1:09:04,  1.78s/it]  

{'loss': 0.672, 'learning_rate': 3.144552319309601e-05, 'epoch': 3.37}


                                                          
 84%|████████▍ | 12500/14832 [10:10:03<1:09:04,  1.78s/it]

{'eval_loss': 0.6699047684669495, 'eval_accuracy': 0.5978422117329737, 'eval_runtime': 516.6017, 'eval_samples_per_second': 14.353, 'eval_steps_per_second': 1.794, 'epoch': 3.37}


 88%|████████▊ | 13000/14832 [10:26:04<54:43,  1.79s/it]     

{'loss': 0.6693, 'learning_rate': 2.470334412081985e-05, 'epoch': 3.51}


                                                        
 88%|████████▊ | 13000/14832 [10:34:33<54:43,  1.79s/it]

{'eval_loss': 0.6685929894447327, 'eval_accuracy': 0.5995954146999326, 'eval_runtime': 508.9621, 'eval_samples_per_second': 14.569, 'eval_steps_per_second': 1.821, 'epoch': 3.51}


 91%|█████████ | 13500/14832 [10:50:27<40:22,  1.82s/it]    

{'loss': 0.6628, 'learning_rate': 1.796116504854369e-05, 'epoch': 3.64}


                                                        
 91%|█████████ | 13500/14832 [10:59:00<40:22,  1.82s/it]

{'eval_loss': 0.6754112839698792, 'eval_accuracy': 0.5937963587322994, 'eval_runtime': 513.4522, 'eval_samples_per_second': 14.441, 'eval_steps_per_second': 1.805, 'epoch': 3.64}


 94%|█████████▍| 14000/14832 [11:14:52<24:38,  1.78s/it]    

{'loss': 0.6677, 'learning_rate': 1.121898597626753e-05, 'epoch': 3.78}


                                                        
 94%|█████████▍| 14000/14832 [11:23:19<24:38,  1.78s/it]

{'eval_loss': 0.6673158407211304, 'eval_accuracy': 0.597167902899528, 'eval_runtime': 507.1392, 'eval_samples_per_second': 14.621, 'eval_steps_per_second': 1.828, 'epoch': 3.78}


 98%|█████████▊| 14500/14832 [11:38:51<09:57,  1.80s/it]    

{'loss': 0.6639, 'learning_rate': 4.47680690399137e-06, 'epoch': 3.91}


                                                        
 98%|█████████▊| 14500/14832 [11:47:10<09:57,  1.80s/it]

{'eval_loss': 0.6658162474632263, 'eval_accuracy': 0.6006743088334457, 'eval_runtime': 499.1411, 'eval_samples_per_second': 14.856, 'eval_steps_per_second': 1.857, 'epoch': 3.91}


100%|██████████| 14832/14832 [11:57:30<00:00,  2.90s/it]    


{'train_runtime': 43050.8307, 'train_samples_per_second': 2.756, 'train_steps_per_second': 0.345, 'train_loss': 0.6794167398249061, 'epoch': 4.0}
***** train metrics *****
  epoch                    =         4.0
  train_loss               =      0.6794
  train_runtime            = 11:57:30.83
  train_samples_per_second =       2.756
  train_steps_per_second   =       0.345


In [17]:
metrics = trainer.evaluate(prepared_ds["test"])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

100%|██████████| 927/927 [09:26<00:00,  1.64it/s]

***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.6007
  eval_loss               =     0.6658
  eval_runtime            = 0:09:33.07
  eval_samples_per_second =     12.939
  eval_steps_per_second   =      1.618



