In [16]:
import json
import numpy as np
import re
from urllib.request import urlopen
from tqdm import tqdm
import os

from PIL import Image
import wikipedia
import random
from icrawler.builtin import UrlListCrawler, GreedyImageCrawler

from wikidata.client import Client



# DO NOT RUN NEXT TWO CELLS UNLESS YOU ARE SURE

In [6]:
urls = []
c = 0

TOTAL = 2_500
with tqdm(total=TOTAL) as pbar:
    for _ in range(100_000):
        if c <= TOTAL:
            try:
                try:
                    page = wikipedia.WikipediaPage(wikipedia.random())
                except wikipedia.DisambiguationError as e:
                    page = wikipedia.WikipediaPage(e.options[0])
                splt = page.summary[:50].split(' ')
                if splt[0][0].isupper() and splt[1][0].isupper(): # probably a person
                    continue
                imgs = page.images
                imgs = [t for t in imgs if '.jpg' == t[-4:].lower() or '.jpeg' == t[-4:].lower()] # we do not want svg
                if imgs:
                    urls.append(imgs[random.randint(0, len(imgs) - 1)])
                    c += 1
                    pbar.update(1)
            except Exception as e:
                print(e)
        else:
            break


  0%|          | 0/2500 [00:00<?, ?it/s]2022-11-28 14:22:19,602 - INFO - downloader - downloader-002 is waiting for new download tasks
2022-11-28 14:22:19,602 - INFO - downloader - downloader-005 is waiting for new download tasks
2022-11-28 14:22:19,602 - INFO - downloader - downloader-003 is waiting for new download tasks
2022-11-28 14:22:19,602 - INFO - downloader - downloader-008 is waiting for new download tasks
2022-11-28 14:22:19,602 - INFO - downloader - downloader-009 is waiting for new download tasks
2022-11-28 14:22:19,603 - INFO - downloader - downloader-010 is waiting for new download tasks
2022-11-28 14:22:19,603 - INFO - downloader - downloader-007 is waiting for new download tasks
2022-11-28 14:22:19,603 - INFO - downloader - downloader-006 is waiting for new download tasks
2022-11-28 14:22:19,740 - INFO - downloader - downloader-001 is waiting for new download tasks
2022-11-28 14:22:19,755 - INFO - downloader - downloader-004 is waiting for new download tasks
  0%|     

KeyboardInterrupt: 

In [None]:
with open("../data/test2_v1/urls.txt", 'a') as f:
    f.writelines([t + '\n' for t in urls])

## Load images from wikipedia and filter

In [17]:
urllist_crawler = UrlListCrawler(downloader_threads=10,
                                 storage={'root_dir': r'../data/test2_v1/test2_images_v1/'})
urllist_crawler.crawl(r'../data/test2_v1/urls.txt', max_num=2500, overwrite=True)

2022-11-28 16:05:12,365 - INFO - icrawler.crawler - start crawling...
2022-11-28 16:05:12,366 - INFO - icrawler.crawler - starting 1 feeder threads...
2022-11-28 16:05:12,368 - INFO - icrawler.crawler - starting 1 parser threads...
2022-11-28 16:05:12,369 - INFO - icrawler.crawler - starting 10 downloader threads...
2022-11-28 16:05:12,610 - INFO - downloader - image #1	https://upload.wikimedia.org/wikipedia/commons/3/36/Capt._Antonio_de_los_Reyes_Correa_%28bust%29.jpg
2022-11-28 16:05:12,634 - INFO - downloader - image #2	https://upload.wikimedia.org/wikipedia/commons/6/65/Bundesarchiv_Bild_183-12190-0007%2C_Berlin%2C_Au%C3%9Ferordentliche_Volkskammersitzung.jpg
2022-11-28 16:05:12,642 - INFO - downloader - image #3	https://upload.wikimedia.org/wikipedia/commons/b/b8/Brazil-CopaAmerica-1919.jpg
2022-11-28 16:05:12,692 - INFO - downloader - image #4	https://upload.wikimedia.org/wikipedia/commons/e/e4/Bill-Lockyer.jpg
2022-11-28 16:05:12,863 - INFO - downloader - image #5	https://upload

In [18]:

pth = r"../data/test2_v1/test2_images_v1/"
c = 0
for filename in os.listdir(pth):
    img = Image.open(pth + filename)
    if not (img.size[0] > 450 and img.size[1] > 450 and 0.25 < img.size[0] / img.size[1] < 4):
        img.close()
        os.remove(pth + filename)
    c += 1
c



3907

## Creating dataset

In [19]:
from pathlib import Path
import logging
import json
from typing import *
import time

import pandas as pd
import numpy as np
import torch
from PIL import Image, ImageFile
import torch.nn as nn
from lavis.models import load_model_and_preprocess, BlipBase
from lavis.processors import load_processor
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup
# from torchmetrics.functional import retrieval_reciprocal_rank, retrieval_hit_rate
from transformers import BatchEncoding
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


## Config

Paths resolution:

In [20]:
DATASET_VERSION = "v1"
PART = "train"
PATH = Path("../data").resolve() / f"{PART}_{DATASET_VERSION}"
DATA_PATH = PATH / f"{PART}.data.{DATASET_VERSION}.txt"
LABELS_PATH = PATH / f"{PART}.gold.{DATASET_VERSION}.txt"
IMAGES_PATH = PATH / f"{PART}_images_{DATASET_VERSION}"
TRAIN_SPLIT_PATH = PATH / "split_train.txt"
VALIDATION_SPLIT_PATH = PATH / "split_valid.txt"
TEST_SPLIT_PATH = PATH / "split_test.txt"
SAVE_CHECKPOINT_PATH = Path("checkpoints").resolve() / "BLIP-ITM-2"
SAVE_CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

pth = r"../data/test2_v1/test2_images_v1/"

Environment settings:

In [21]:
import sys
import logging

sys.path.append('..')
from src.utils import evaluate

import torch
from PIL import Image
import imp
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.model_selection import train_test_split

from shutil import copy

from PIL import Image
from pathlib import Path

In [22]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# some images from train might not load without the following settings or warnings would be thrown
Image.MAX_IMAGE_PIXELS = None
ImageFile.LOAD_TRUNCATED_IMAGES = True

writer = SummaryWriter()

In [23]:
RANDOM_STATE = 42
torch.manual_seed(RANDOM_STATE)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {DEVICE}")

Running on cuda


Model & training settings

In [24]:
df = pd.read_csv(DATA_PATH, sep='\t', header=None)
df.columns = ["word", "context"] + [f"image{i}" for i in range(10)]
df["label"] = pd.read_csv(LABELS_PATH, sep='\t', header=None)

train_df = df.loc[pd.read_csv(TRAIN_SPLIT_PATH, sep='\t', header=None).T.values[0]]
validation_df = df.loc[pd.read_csv(VALIDATION_SPLIT_PATH, sep='\t', header=None).T.values[0]]
test_df = df.loc[pd.read_csv(TEST_SPLIT_PATH, sep='\t', header=None).T.values[0]]

In [25]:
for idx, s in test_df.iterrows():
    if s['label'] not in train_df.iloc[:, 2:-1].values.flatten():
        copy(IMAGES_PATH / s['label'], pth + s['label'])

In [27]:
np.random.seed(RANDOM_STATE)
keep_rows = []
for idx, s in test_df.iterrows():
    if s['label'] not in train_df.iloc[:, 2:-1].values.flatten():
        new_row = [s['word'], s['context']]
        neg_samples = [t for t in os.listdir(pth) if t != s['label']]
        neg_samples = list(np.random.choice(neg_samples, 9, replace=False))
        neg_samples += [s['label']]
        np.random.shuffle(neg_samples)
        new_row += neg_samples + [s['label']]
        keep_rows.append(new_row)

test2_df = pd.DataFrame(keep_rows, columns=test_df.columns)
test2_df

Unnamed: 0,word,context,image0,image1,image2,image3,image4,image5,image6,image7,image8,image9,label
0,leucaena,leucaena genus,image.14125.jpg,000130.jpg,001803.JPG,image.105.jpg,002346.jpg,000631.jpg,image.9129.jpg,000417.jpg,image.8695.jpg,001468.jpg,image.105.jpg
1,mahonia,mahonia genus,image.7133.jpg,image.120.jpg,002013.jpg,image.2958.jpg,image.7509.jpg,image.2497.jpg,image.1442.jpg,000776.jpg,image.10379.jpg,001810.jpg,image.120.jpg
2,breakdown,breakdown failure,002214.jpg,001948.jpg,image.1087.jpg,image.12589.jpg,image.9637.jpg,image.6025.jpg,image.3418.jpg,image.239.jpg,001782.jpg,000056.JPG,image.239.jpg
3,boletellus,boletellus genus,001792.jpg,000130.jpg,image.13972.jpg,image.7442.jpg,image.5989.jpg,image.10686.jpg,image.324.jpg,001009.jpg,001750.jpg,image.6057.jpg,image.324.jpg
4,capparis,capparis genus,image.7365.jpg,001513.jpg,000011.jpg,002333.JPG,image.2614.jpg,image.450.jpg,001522.jpg,image.6465.jpg,image.11316.jpg,image.359.jpg,image.359.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1554,aspergillus,aspergillus genus,001132.jpg,image.10614.jpg,image.9686.jpg,image.8255.jpg,001720.jpg,image.4465.jpg,002249.jpg,image.7093.jpg,000269.jpg,001882.jpg,image.10614.jpg
1555,mantophasmatodea,mantophasmatodea order,000914.JPG,001148.jpg,image.2321.jpg,image.10271.jpg,001243.JPG,image.9187.jpg,image.8826.jpg,image.3305.jpg,000716.JPG,image.3355.jpg,image.3355.jpg
1556,make,make persuade,000488.jpg,image.14404.jpg,image.2970.jpg,image.6992.jpg,001701.jpg,000914.JPG,image.16257.jpg,001030.jpg,image.9126.jpg,001998.jpg,image.9126.jpg
1557,lookout,lookout watcher,image.10322.jpg,000455.jpg,image.1334.jpg,001937.jpg,image.12314.jpg,000525.JPG,image.5665.jpg,001979.jpg,image.4060.jpg,000521.jpg,image.4060.jpg


In [28]:
keep_imgs = np.unique(test2_df.iloc[:, 2:-1].values.flatten())
for filename in os.listdir(pth):
    if filename not in keep_imgs:
        os.remove(pth + filename)

In [29]:
np.intersect1d(test2_df.iloc[:, 2:].values.flatten(), train_df.iloc[:, 2:].values.flatten())

array([], dtype=object)

In [30]:
test2_df.iloc[:, :-1].to_csv(r"../data/test2_v1/" + "test2.data.v1.txt", sep='\t', header=False, index=False)
test2_df.iloc[:, [-1]].to_csv(r"../data/test2_v1/" + "test2.gold.v1.txt", sep='\t', header=False, index=False)