From 63d3090120cda4c998dff55bef658f5368402433 Mon Sep 17 00:00:00 2001
From: Hu Xu
Date: Thu, 18 Jan 2024 21:16:39 +0000
Subject: [PATCH] add code to build metadata

---
 README.md                   |   9 +-
 metaclip/README_metadata.md |  32 ++++++
 metaclip/build_metadata.py  | 188 ++++++++++++++++++++++++++++++++++++
 3 files changed, 227 insertions(+), 2 deletions(-)
 create mode 100644 metaclip/README_metadata.md
 create mode 100644 metaclip/build_metadata.py

diff --git a/README.md b/README.md
index 74d93dc..f44ad44 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ MetaCLIP is trained w/ face blurred images.
 ```
 
 ## Updates
+* 01/18/2024: add [code](metaclip/README_metadata.md) for building metadata.
 * 12/25/2023: [Huggingface Space](https://huggingface.co/spaces/activebus/MetaCLIP) demo and [Colab](https://colab.research.google.com/drive/1V0Rv1QQJkcolTjiwJuRsqWycROvYjOwg?usp=sharing) released.
 * 12/21/2023: ViT-G/14 released.
 * 09/28/2023: initial release.
@@ -167,6 +168,10 @@ python submitit_openclip.py b32_400m
 
 Please config the corresponding `training_data` in `run_configs_400m.py`.
 
+### Build Your Own Metadata
+Consider starting from our [code](metaclip/README_metadata.md) for building CLIP's 500k metadata.
+
+
 ## Bugs or questions?
 
 If you have any questions related to the code or the paper, feel free to email Hu Xu (`huxu@meta.com`).
@@ -190,8 +195,8 @@ Please cite our paper if MetaCLIP helps your work:
 The training code is developed based on [OpenCLIP](https://github.com/mlfoundations/open_clip), modified to the vanilla CLIP training setup.
 
 ## TODO
-- code for building metadata;
-- numpy implementation for matching and balancing;
+- v0.1 code release;
+- refactor openclip as v0.2;
 - (welcome your use cases or suggestions to update this codebase regularly)
 
 
diff --git a/metaclip/README_metadata.md b/metaclip/README_metadata.md
new file mode 100644
index 0000000..c38b0b5
--- /dev/null
+++ b/metaclip/README_metadata.md
@@ -0,0 +1,32 @@
+# Building Metadata Entries for MetaCLIP
+
+tl;dr:
+```bash
+python metaclip/build_metadata.py
+```
+
+
+## Part 1: WordNet Synsets
+`metaclip/build_metadata.py:wordnet_synsets`
+
+```bash
+pip install nltk
+python -m nltk.downloader wordnet
+python -m nltk.downloader omw-1.4
+```
+
+## Part 2: Wiki Unigrams
+`metaclip/build_metadata.py:wiki_unigram`
+
+Keep unigrams with at least `100` occurrences.
+
+## Part 3: Wiki Bigrams
+`metaclip/build_metadata.py:wiki_bigrams`
+
+Keep bigrams with a pointwise mutual information (PMI) score of at least `30`.
+
+## Part 4: Wiki Article Titles
+`metaclip/build_metadata.py:wiki_title`
+
+Keep titles with a view frequency of at least `70`.
+We randomly sample 25 days of [pageviews](https://dumps.wikimedia.org/other/pageviews) from the past 5 years.
diff --git a/metaclip/build_metadata.py b/metaclip/build_metadata.py
new file mode 100644
index 0000000..a71300f
--- /dev/null
+++ b/metaclip/build_metadata.py
@@ -0,0 +1,188 @@
+import json
+import string
+
+from datetime import datetime
+
+
+def wordnet_synsets():
+    from nltk.corpus import wordnet as wn
+    entries = []
+    for ss in wn.all_synsets():
+        name = ss.name()
+        dot_idx = name.find(".")
+        name = name[:dot_idx].replace("_", " ")
+        entries.append(name)
+    return entries
+
+
+def wiki_unigram(thres=100):
+    entries = []
+    with open("data/wiki/enwiki-unigram.txt") as fr:
+        for line in fr:
+            name, count = line.strip().split()
+            count = int(count)
+            if count >= thres:  # at least
+                entries.append(name)
+    return entries
+
+
+def wiki_bigrams():
+    import os
+    import gzip
+
+    if not os.path.exists("data/wiki/bigram_pmi_cache.txt.gz"):
+        from nltk.probability import FreqDist
+
+        word_fd = FreqDist()
+
+        with gzip.open("data/wiki/1gram.txt.gz") as fr:
+            for line in fr:
+                count, name = line.decode().split("\t")
+                count = int(count)
+                name = name.strip()
+                if len(name) > 0 and count > 0:
+                    word_fd[name] = count
+
+        bigram_fd = FreqDist()
+        missing_word_count, total_count = 0, 0
+        with gzip.open("data/wiki/2gram.txt.gz") as fr:
+            for line in fr:
+                count, word1, word2 = line.decode().split("\t")
+                count = int(count)
+                word1 = word1.strip()
+                word2 = word2.strip()
+                if len(word1) > 0 and len(word2) > 0 and count > 0:
+                    total_count += 1
+                    if word1 not in word_fd or word2 not in word_fd:
+                        missing_word_count += 1
+                        if missing_word_count % 500000 == 0:
+                            print("missing words in unigram", line.decode())
+                        continue
+                    bigram_fd[(word1, word2)] = count
+
+                    if len(bigram_fd) % 500000 == 0:
+                        print("sample bi-gram", word1, word2)
+
+        print(f"bigram stats: {missing_word_count} / {total_count}")
+
+        from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
+
+        bigram_measures = BigramAssocMeasures()
+        finder = BigramCollocationFinder(word_fd, bigram_fd)
+        with gzip.open("data/wiki/bigram_pmi_cache.txt.gz", "wb") as fw:
+            for query in finder.score_ngrams(bigram_measures.pmi):
+                fw.write(f"{query[1]}\t{query[0][0]}\t{query[0][1]}\n".encode())
+
+
+    pmi_thres = 30.
+ print(f"use pmi_thres={pmi_thres}") + entries = [] + with gzip.open("data/wiki/bigram_pmi_cache.txt.gz") as fr: + for line in fr: + pmi, word1, word2 = line.decode().strip().split("\t") + pmi = float(pmi) + if pmi >= pmi_thres: + entries.append(f"{word1} {word2}") + return entries + + +def wiki_title(budget=100000): + import urllib.request + import os + import gzip + + from collections import defaultdict + + if not os.path.exists("data/wiki/title_counts.json"): + dates = [ + "20180419", + "20180510", + "20180530", + "20180914", + "20181119", + "20190406", + "20190928", + "20191026", + "20191208", + "20200417", + "20200610", + "20200813", + "20200824", + "20201018", + "20210112", + "20210123", + "20210305", + "20210920", + "20211127", + "20211211", + "20220116", + "20220322", + "20220430", + "20220529", + "20220618", + "20220829" + ] + + title_counts = defaultdict(int) + + for date in dates: + print("date", date) + for idx in range(0, 240000, 10000): + fn = f"pageviews-{date}-{idx:06}.gz" + local_path = f"data/wiki/pageviews/{fn}" + if not os.path.exists(local_path): + urllib.request.urlretrieve(f"https://dumps.wikimedia.org/other/pageviews/{date[:4]}/{date[:4]}-{date[4:6]}/{fn}", local_path) + with gzip.open(local_path) as fr: + for idx, line in enumerate(fr): + line = line.decode().strip() + if line.startswith("en "): + orgin, title, count, hour = line.split(" ") + assert orgin == "en", orgin + title = title.strip().replace("_", " ") + count = int(count) + count_filter = int(count / 50) + if count_filter > 0 and ":" not in title: + if title not in title_counts and (len(title_counts)+1) % 10000 == 0: + print("len(title_counts)", len(title_counts)) + title_counts[title] += count + + with open("data/wiki/title_counts.json", "w") as fw: + json.dump(title_counts, fw) + + + with open("data/wiki/title_counts.json") as fr: + title_counts = json.load(fr) + + view_thres = 70 + print(f"use view_thres={view_thres}") + + entries = [] + for title, count in title_counts.items(): + if count >= view_thres: + entries.append(title) + return entries + + +def main(): + num_entries = 500000 + + forbidden = set(string.punctuation) + sources = {"wordnet": wordnet_synsets, "wiki_unigram": wiki_unigram, "wiki_bigrams": wiki_bigrams, "wiki_title": wiki_title} + + entries = set([str(ix) for ix in range(100)]) + + for source_name in sources: + source_entries = set(sources[source_name]()) + for entry in source_entries: + if len(entry) > 0 and entry not in forbidden: + entries.add(entry) + + if len(entries) >= num_entries: + today = datetime.today().strftime('%Y-%m-%d') + with open(f"metadata_{today}.json", "w") as fw: + json.dump(list(entries), fw) + print(f"after adding {source_name}: len(entries)={len(entries)}.") + + +if __name__ == "__main__": + main()