# Extracting the English Dataset
## Extract the English stories from the Bloom Library dataset and keep only the albums whose text is detected as English with at least 99.99% confidence

In [None]:
from google.colab import drive
import json
import os

# Make the Drive folder that holds the dataset visible to this Colab runtime.
drive.mount('/content/drive')

# Location of the dataset JSON on Drive; edit this to match your own layout.
json_path = '/content/drive/MyDrive/Project VIST with LLM/data.json'

# Parse the whole file into memory; the later cells read it through `data`.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

Mounted at /content/drive


In [None]:
# Free-text summary of the dataset variant this notebook works with.
_DESCRIPTION = """\
This version of the Bloom Library data is developed specifically for the Visual Story Telling (VIST) task.
It includes data from 363 languages across 36 language families, with many of the languages represented
being extremely low resourced languages.
"""

# Dataset home page and the license string recorded for it.
_HOMEPAGE = "https://bloomlibrary.org/"
_LICENSE = "Attribution 4.0 International"
# Split name -> data file name.
# NOTE(review): none of these four constants is referenced by the cells visible
# in this notebook — confirm whether they are still needed.
_URLS = {"train": "data.json"}

# Languages included in Bloom VIST
languages = ['afr', 'aaa', 'abc', 'ada', 'adq', 'aeu', 'agq', 'ags', 'ahk', 'aia', 'ajz', 'aka', 'ame', 'amh',
'amp', 'amu', 'ann', 'aph', 'awa', 'awb', 'azn', 'azo', 'bag', 'bam', 'baw', 'bax', 'bbk', 'bcc', 'bce', 'bec',
'bef', 'ben', 'bfd', 'bfm', 'bfn', 'bgf', 'bho', 'bhs', 'bis', 'bjn', 'bjr', 'bkc', 'bkh', 'bkm', 'bkx', 'bob',
'bod', 'boz', 'bqm', 'bra', 'brb', 'bri', 'brv', 'bss', 'bud', 'buo', 'bwt', 'bwx', 'bxa', 'bya', 'bze', 'bzi',
'cak', 'cbr', 'ceb', 'cgc', 'chd', 'chp', 'cim', 'clo', 'cmn', 'cmo', 'csw', 'cuh', 'cuv', 'dag', 'ddg', 'ded',
'deu', 'dig', 'dje', 'dmg', 'dnw', 'dtp', 'dtr', 'dty', 'dug', 'eee', 'ekm', 'enb', 'enc', 'eng', 'ewo', 'fas',
'fil', 'fli', 'fon', 'fra', 'fub', 'fuh', 'gal', 'gbj', 'gou', 'gsw', 'guc', 'guj', 'guz', 'gwc', 'hao', 'hat',
'hau', 'hbb', 'hig', 'hil', 'hin', 'hla', 'hna', 'hre', 'hro', 'idt', 'ilo', 'ind', 'ino', 'isu', 'ita', 'jgo',
'jmx', 'jpn', 'jra', 'kak', 'kam', 'kan', 'kau', 'kbq', 'kbx', 'kby', 'kek', 'ken', 'khb', 'khm', 'kik', 'kin',
'kir', 'kjb', 'kmg', 'kmr', 'kms', 'kmu', 'kor', 'kqr', 'krr', 'ksw', 'kur', 'kvt', 'kwd', 'kwu', 'kwx', 'kxp',
'kyq', 'laj', 'lan', 'lao', 'lbr', 'lfa', 'lgg', 'lgr', 'lhm', 'lhu', 'lkb', 'llg', 'lmp', 'lns', 'loh', 'lsi',
'lts', 'lug', 'luy', 'lwl', 'mai', 'mal', 'mam', 'mar', 'mdr', 'mfh', 'mfj', 'mgg', 'mgm', 'mgo', 'mgq', 'mhx',
'miy', 'mkz', 'mle', 'mlk', 'mlw', 'mmu', 'mne', 'mnf', 'mnw', 'mot', 'mqj', 'mrn', 'mry', 'msb', 'muv', 'mve',
'mxu', 'mya', 'myk', 'myx', 'mzm', 'nas', 'nco', 'nep', 'new', 'nge', 'ngn', 'nhx', 'njy', 'nla', 'nld', 'nlv',
'nod', 'nsk', 'nsn', 'nso', 'nst', 'nuj', 'nwe', 'nwi', 'nxa', 'nxl', 'nya', 'nyo', 'nyu', 'nza', 'odk', 'oji',
'oki', 'omw', 'ori', 'ozm', 'pae', 'pag', 'pan', 'pbt', 'pce', 'pcg', 'pdu', 'pea', 'pex', 'pis', 'pkb', 'pmf',
'pnz', 'por', 'psp', 'pwg', 'qaa', 'qub', 'quc', 'quf', 'quz', 'qve', 'qvh', 'qvm', 'qvo', 'qxh', 'rel', 'rnl',
'ron', 'roo', 'rue', 'rug', 'rus', 'san', 'saq', 'sat', 'sdk', 'sea', 'sgd', 'shn', 'sml', 'snk', 'snl', 'som',
'sot', 'sox', 'spa', 'sps', 'ssn', 'stk', 'swa', 'swh', 'sxb', 'syw', 'taj', 'tam', 'tbj', 'tdb', 'tdg', 'tdt',
'teo', 'tet', 'tgk', 'tha', 'the', 'thk', 'thl', 'thy', 'tio', 'tkd', 'tnl', 'tnn', 'tnp', 'tnt', 'tod', 'tom',
'tpi', 'tpl', 'tpu', 'tsb', 'tsn', 'tso', 'tuv', 'tuz', 'tvs', 'udg', 'unr', 'urd', 'uzb', 'ven', 'vie', 'vif',
'war', 'wbm', 'wbr', 'wms', 'wni', 'wnk', 'wtk', 'xho', 'xkg', 'xmd', 'xmg', 'xmm', 'xog', 'xty', 'yas', 'yav',
'ybb', 'ybh', 'ybi', 'ydd', 'yea', 'yet', 'yid', 'yin', 'ymp', 'zaw', 'zho', 'zlm', 'zuh', 'zul']

# Map from the language tags found in the raw annotations (keys) to three-letter language codes (values)
lang_map = {'kwd': 'kwd', 'lo': 'lao', 'sps': 'sps', 'cry': 'cry', 'wms': 'wms', 'prs': 'fas',
'gwc': 'gwc', 'bfn': 'bfn', 'kms': 'kms', 'oki': 'oki', 'quf': 'quf', 'wni': 'wni', 'ceb-x-boholano': 'ceb',
'laj': 'laj', 'kyq': 'kyq', 'or': 'ori', 'rue': 'rue', 'mve': 'mve', 'gsw': 'gsw', 'ru-KG': 'rus',
'ken': 'ken', 'ekm': 'ekm', 'tn': 'tsn', 'sw': 'swa', 'swh': 'swh', 'tdt': 'tdt', 'my': 'mya', 'kmr': 'kmr',
'syw': 'syw', 'xog': 'xog', 'ksw': 'ksw', 'pcg': 'pcg', 'guz': 'guz', 'khb': 'khb', 'clo': 'clo', 'bob': 'bob',
'pbt': 'pbt', 'teo': 'teo', 'kxp': 'kxp', 'tet': 'tet', 'ts': 'tso', 'ha': 'hau', 've': 'ven', 'mxu': 'mxu',
'es-PE': 'spa', 'de': 'deu', 'cmo': 'cmo', 'am': 'amh', 'bef': 'bef', 'bn': 'ben', 'ro': 'ron', 'bzi': 'bzi',
'ml': 'mal', 'af': 'afr', 'mzm': 'mzm', 'yas': 'yas', 'bec': 'bec', 'awa': 'awa', 'bkm': 'bkm', 'so': 'som',
'tnn': 'tnn', 'the': 'the', 'ann': 'ann', 'myx': 'myx', 'ddg': 'ddg', 'yet': 'yet', 'hbb': 'hbb', 'adq': 'adq',
'sok': 'sok', 'bfm': 'bfm', 'bra': 'bra', 'csw-Cans-CA': 'csw', 'ilo': 'ilo', 'lhm': 'lhm', 'rug': 'rug',
'lmp': 'lmp', 'mnf': 'mnf', 'tdb': 'tdb', 'ada': 'ada', 'tvs': 'tvs', 'th': 'tha', 'wbr': 'wbr', 'dtp': 'dtp',
'pnz': 'pnz', 'sea': 'sea', 'brv': 'brv', 'xmg': 'xmg', 'saq-x-Ilchamus': 'saq', 'sa': 'san', 'mne': 'mne',
'lwl': 'lwl', 'dty': 'dty', 'chd': 'chd', 'lg': 'lug', 'vif': 'vif', 'lkb': 'lkb', 'fuv-Arab': 'fuv', 'ded': 'ded',
'nsk': 'nsk', 'fub': 'fub', 'mhx': 'mhx', 'pa': 'pan', 'isu': 'isu', 'bjn': 'bjn', 'tl': 'fil', 'ht': 'hat',
'boz': 'boz', 'zgh': 'zgh', 'enb-x-Sengwer': 'enb', 'bxa': 'bxa', 'qub': 'qub', 'hla': 'hla', 'jgo': 'jgo',
'taj': 'taj', 'ajz': 'ajz', 'kmu': 'kmu', 'psp': 'psp', 'xmm': 'xmm', 'shn': 'shn', 'zuh': 'zuh', 'cim': 'cim',
'lan': 'lan', 'st': 'sot', 'lfa': 'lfa', 'es': 'spa', 'tio': 'tio', 'mdr': 'mdr', 'ydd': 'ydd', 'fil': 'fil',
'nod': 'nod', 'id': 'ind', 'csw-Latn': 'csw', 'bhs': 'bhs', 'udg': 'udg', 'gou': 'gou', 'hre': 'hre', 'ahk': 'ahk',
'bi': 'bis', 'qvh': 'qvh', 'mfj': 'mfj', 'miy': 'miy', 'mgm': 'mgm', 'ybh': 'ybh', 'ymp': 'ymp', 'es-GT': 'spa',
'fal': 'fal', 'pex': 'pex', 'bfd': 'bfd', 'zh': 'zho', 'pwg': 'pwg', 'bgf': 'bgf', 'omw': 'omw', 'dtr': 'dtr',
'ku': 'kur', 'cak': 'cak', 'ky': 'kir', 'sgd': 'sgd', 'quc': 'quc', 'cuh': 'cuh', 'eee': 'eee', 'bze': 'bze',
'bo': 'bod', 'jmx-x-smp': 'jmx', 'new': 'new', 'dig': 'dig', 'gu': 'guj', 'dug': 'dug', 'pag': 'pag', 'aaa': 'aaa',
'bag': 'bag', 'oj': 'oji', 'ngn': 'ngn', 'ny': 'nya', 'zlm': 'zlm', 'kbq': 'kbq', 'baw': 'baw', 'ak': 'aka',
'mot': 'mot', 'jmx': 'jmx', 'amu': 'amu', 'ta': 'tam', 'tg': 'tgk', 'nlv': 'nlv', 'aph': 'aph', 'mlk': 'mlk',
'en-Brai-IN-x-Chetana-Trust': 'eng', 'dje': 'dje', 'nyu': 'nyu', 'tnl': 'tnl', 'xkg': 'xkg', 'gbj': 'gbj',
'snk-Arab': 'snk', 'ino': 'ino', 'sox': 'sox', 'tpu': 'tpu', 'bya': 'bya', 'pmf': 'pmf', 'krr': 'krr', 'nuj': 'nuj',
'kn': 'kan', 'ne': 'nep', 'mgg': 'mgg', 'pkb': 'pkb', 'snl': 'snl', 'mgo': 'mgo', 'km': 'khm', 'kmg': 'kmg',
'mkz': 'mkz', 'bkc': 'bkc', 'pea': 'pea', 'mnw': 'mnw', 'msb': 'msb', 'lhu': 'lhu', 'tpl': 'tpl', 'ybi': 'ybi',
'bqm': 'bqm', 'uz': 'uzb', 'zu': 'zul', 'hi': 'hin', 'kwx': 'kwx', 'ike': 'ike', 'njy': 'njy', 'yav': 'yav',
'nge': 'nge', 'nxa': 'nxa', 'nxl': 'nxl', 'ybb': 'ybb', 'en-Dupl': 'eng', 'tdc': 'tdc', 'bcc': 'bcc',
'jmx-x-EnAm': 'jmx', 'kbx': 'kbx', 'chp': 'chp', 'rme': 'rme', 'pis': 'pis', 'snk': 'snk', 'sdk': 'sdk', 'abc': 'abc',
'zaw': 'zaw', 'mle': 'mle', 'kby': 'kby', 'nsk-Latn': 'nsk', 'ceb': 'ceb', 'bbk': 'bbk', 'hro': 'hro', 'kvt': 'kvt',
'tnp': 'tnp', 'kak': 'kak', 'nwi': 'nwi', 'th-TH': 'tha', 'xh': 'xho', 'lbr': 'lbr', 'fon': 'fon', 'dag': 'dag',
'bim': 'bim', 'qve': 'qve', 'vi': 'vie', 'wnk': 'wnk', 'mqj': 'mqj', 'ags': 'ags', 'nsp': 'nsp', 'ur': 'urd',
'lgr': 'lgr', 'war': 'war', 'pt': 'por', 'lgg': 'lgg', 'loh': 'loh', 'tnt': 'tnt', 'cgc': 'cgc', 'thy': 'thy',
'luy': 'luy', 'aeu': 'aeu', 'kek': 'kek', 'tdg': 'tdg', 'sml': 'sml', 'tkd': 'tkd', 'sat': 'sat', 'nso': 'nso',
'bce': 'bce', 'dmg': 'dmg', 'bjr': 'bjr', 'hil': 'hil', 'gal': 'gal', 'fr': 'fra', 'ssn': 'ssn', 'saq': 'saq',
'mr': 'mar', 'gwc-Arab': 'gwc', 'ame': 'ame', 'ki': 'kik', 'hig': 'hig', 'myk': 'myk', 'qvm': 'qvm', 'pdu': 'pdu',
'sor': 'sor', 'enc': 'enc', 'nsn': 'nsn', 'mlw': 'mlw', 'ja': 'jpn', 'en-IN': 'eng', 'mry': 'mry', 'nco': 'nco',
'bss': 'bss', 'fli': 'fli', 'nza': 'nza', 'stk': 'stk', 'bwt': 'bwt', 'yi': 'yid', 'roo': 'roo', 'csw': 'csw',
'kr': 'kau', 'rel': 'rel', 'en': 'eng', 'mrn': 'mrn', 'amp': 'amp', 'nwe': 'nwe', 'kqr': 'kqr', 'kjb': 'kjb',
'hna': 'hna', 'xmd': 'xmd', 'mai': 'mai', 'lts': 'lts', 'bho': 'bho', 'jra': 'jra', 'jmx-x-coi': 'jmx',
'zh-CN': 'zho', 'qvo': 'qvo', 'cuv': 'cuv', 'bkh': 'bkh', 'mmu': 'mmu', 'rw': 'kin', 'agq': 'agq', 'wbm': 'wbm',
'kam': 'kam', 'buo': 'buo', 'bud': 'bud', 'azn': 'azn', 'yea': 'yea', 'mgq': 'mgq', 'cmn': 'cmn', 'pae': 'pae',
'bri': 'bri', 'bkx': 'bkx', 'idt': 'idt', 'mfh': 'mfh', 'lsi': 'lsi', 'xty': 'xty', 'cbr': 'cbr', 'tsb': 'tsb',
'brb': 'brb', 'guc': 'guc', 'qxh': 'qxh', 'fuh': 'fuh', 'pce': 'pce', 'tuv': 'tuv', 'awb': 'awb', 'mam': 'mam',
'nst': 'nst', 'bm': 'bam', 'hao': 'hao', 'nla': 'nla', 'wtk': 'wtk', 'odk': 'odk', 'tom': 'tom', 'thl': 'thl',
'tuz': 'tuz', 'ewo': 'ewo', 'azo': 'azo', 'aia': 'aia', 'dnw': 'dnw', 'tpi': 'tpi', 'nyo': 'nyo', 'nas': 'nas',
'llg': 'llg', 'mxl': 'mxl', 'tbj': 'tbj', 'muv': 'muv', 'lns': 'lns', 'qaa': 'qaa', 'bwx': 'bwx', 'ko': 'kor',
'yin': 'yin', 'nhx': 'nhx', 'sxb': 'sxb', 'kwu': 'kwu', 'ru': 'rus', 'it': 'ita', 'rnl': 'rnl', 'tod': 'tod',
'thk': 'thk', 'unr': 'unr', 'nl': 'nld', 'ozm': 'ozm', 'bax': 'bax', 'quz': 'quz'}

In [None]:
def get_annotations(data, lang, code_map=None):
    """Group the annotations of one language by album, skipping quarantined stories.

    Args:
        data: Parsed dataset dictionary. Reads ``data['stories']`` (mapping of
            story id to a record with a ``'quarantine'`` flag) and
            ``data['annotations']`` (an iterable of sequences; only the first
            element of each sequence is used).
        lang: Normalized language code to keep, e.g. ``"eng"``.
        code_map: Optional mapping from raw language tags to normalized codes.
            Defaults to the notebook-level ``lang_map``.

    Returns:
        dict: album id -> list of page dicts with the keys ``'image_id'``,
        ``'story_index'``, ``'story_id'`` and ``'text'``, in the order the
        annotations were encountered.
    """
    if code_map is None:
        code_map = lang_map

    # A set keeps the per-annotation quarantine check O(1); the previous list
    # was scanned once for every annotation.
    quarantined = {
        story_id
        for story_id, story in data['stories'].items()
        if story['quarantine']
    }

    annotations = {}
    for a in data['annotations']:
        entry = a[0]

        # A tag missing from the map is compared as-is instead of raising
        # KeyError, so one unexpected tag cannot abort the whole extraction.
        raw_code = entry['lang']
        if code_map.get(raw_code, raw_code) != lang:
            continue
        if entry['story_id'] in quarantined:
            continue

        annotations.setdefault(entry['album_id'], []).append({
            'image_id': entry['photo_flickr_id'],
            'story_index': entry['worker_arranged_photo_order'],
            'story_id': entry['story_id'],
            'text': entry['text']
        })

    return annotations

In [None]:

# Count the albums that contain at least one non-quarantined "eng" annotation.
len(get_annotations(data, "eng"))

2632

In [None]:
# get_image_urls matches image URLs with parsed annotations
def get_image_urls(data, annotations):
    """Attach the image URL to every page of every album.

    Args:
        data: Parsed dataset dictionary. Reads ``data['images']``, an iterable
            of records with the keys ``'id'`` and ``'url_o'``.
        annotations: Mapping of album id -> list of page dicts, each carrying
            an ``'image_id'`` key (the shape returned by ``get_annotations``).

    Returns:
        list: one ``{'album_id': ..., 'story': [...]}`` dict per album, in the
        iteration order of ``annotations``. Each page in ``'story'`` is a copy
        of the input page with an added ``'image_url'`` key.

    Raises:
        KeyError: if a page references an image id absent from ``data['images']``.
    """
    # id -> URL lookup built once, so each page is resolved in O(1).
    url_by_id = {image['id']: image['url_o'] for image in data['images']}

    # Pages are copied rather than updated in place so the caller's
    # `annotations` structure is left untouched.
    return [
        {
            'album_id': album_id,
            'story': [
                {**page, 'image_url': url_by_id[page['image_id']]}
                for page in pages
            ],
        }
        for album_id, pages in annotations.items()
    ]

In [None]:
# Build the English subset: group the non-quarantined "eng" annotations by
# album, then attach each page's image URL.
stories_with_images_eng = get_image_urls(data, get_annotations(data, "eng"))

In [None]:
# Preview only the first album: echoing the whole list prints every page of
# every album and bloats the saved notebook.
stories_with_images_eng[:1]

[{'album_id': 'ef8d5b77-dcc9-4600-be6b-c7ee2c5c0e46',
  'story': [{'image_id': '29d65a9d-6882-4158-a018-cec63ec1d8e8',
    'story_index': 0,
    'story_id': '670fd2d9-8e96-4f08-8744-f23cc39b15bc',
    'text': 'My face can be worried.',
    'image_url': 'https://bloom-vist.s3.amazonaws.com/My%20Face%20Tells%20a%20Story/2%20Thoughtful.jpg'},
   {'image_id': '6add527b-cf88-4303-b4cb-4d959ef8df83',
    'story_index': 1,
    'story_id': '670fd2d9-8e96-4f08-8744-f23cc39b15bc',
    'text': 'My face can be shy.',
    'image_url': 'https://bloom-vist.s3.amazonaws.com/My%20Face%20Tells%20a%20Story/3%20Shy.jpg'},
   {'image_id': 'c0d6cc2f-9eb8-4015-a943-fae4f5566d0f',
    'story_index': 2,
    'story_id': '670fd2d9-8e96-4f08-8744-f23cc39b15bc',
    'text': 'My face can be happy.',
    'image_url': 'https://bloom-vist.s3.amazonaws.com/My%20Face%20Tells%20a%20Story/4%20Happy2.jpg'},
   {'image_id': '77f64cdd-1a2e-4155-b904-639e60717fde',
    'story_index': 3,
    'story_id': '670fd2d9-8e96-4f08-874

In [None]:
# Install the language detector
!pip -q install langdetect

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/981.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:
from langdetect import detect_langs

def is_english(text: str, p: float = 0.9999) -> bool:  # default: 99.99% confidence
    """
    Return True if `text` is very likely English.

    `p` is the minimum confidence (0–1) the detector's top candidate must
    reach; the top candidate must also be ``"en"``.

    Returns False when `text` is not a string, is empty or whitespace-only,
    or when the detector cannot extract any features from it.
    """
    # Nothing to detect: answer without touching the detector.
    if not isinstance(text, str) or not text.strip():
        return False

    from langdetect import DetectorFactory, detect_langs
    from langdetect.lang_detect_exception import LangDetectException

    # langdetect samples randomly, so an unseeded detector can return a
    # different probability for the same text on every run. With a 0.9999
    # threshold that changes which albums are kept. Seed it once, but respect
    # a seed the caller has already chosen.
    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    try:
        candidates = detect_langs(text)  # e.g. [en:0.9999], best candidate first
    except LangDetectException:
        return False  # detector failed / text too short

    if not candidates:
        return False

    top = candidates[0]
    return top.lang == "en" and top.prob >= p


In [None]:
# Keep only the albums whose combined page text passes the English check.
english_albums = []
for album in stories_with_images_eng:
    # Judge the album as a whole: join the text of all its pages into one string.
    full_text = " ".join([page["text"] for page in album["story"]])

    if not is_english(full_text):
        continue
    english_albums.append(album)

print(f"Kept {len(english_albums)} of {len(stories_with_images_eng)} albums")

Kept 2531 of 2632 albums


In [None]:
import json
from pathlib import Path

# Destination of the filtered English albums on Drive.
out_path = Path('/content/drive/MyDrive/Project VIST with LLM/dataset_english.json')

# Write the albums as UTF-8 JSON, keeping non-ASCII characters literal and
# using a 2-space indent.
with out_path.open("w", encoding="utf-8") as f:
    json.dump(english_albums, f, indent=2, ensure_ascii=False)

print(f"JSON saved → {out_path.resolve()}")

JSON saved → /content/drive/MyDrive/Project VIST with LLM/dataset_english.json


In [None]:
# setup
from pathlib import Path
from urllib.parse import urlparse
import requests, json, os, tqdm

# Mount Google Drive so the DRIVE_ROOT folder defined below is reachable.
from google.colab import drive
drive.mount("/content/drive")

# Paths
DRIVE_ROOT = Path("/content/drive/MyDrive/Project VIST with LLM/english_img_albums")   # target root for the downloaded images

# Create the target root together with any missing parents; does nothing if it already exists.
DRIVE_ROOT.mkdir(parents=True, exist_ok=True)


# Helper
def download(url: str, dest: Path, chunk: int = 1 << 14) -> None:
    """Stream *url* to *dest* in chunks, skipping files that are already present.

    A non-empty *dest* is treated as already downloaded and left untouched; a
    missing or zero-byte *dest* is (re)fetched.

    The body is first written to a sibling ``<name>.part`` file and renamed
    onto *dest* only once the transfer has finished. An interrupted transfer
    therefore never leaves a truncated, non-empty *dest* that a later run
    would skip as "already done".

    Args:
        url: Address of the file to fetch.
        dest: Path the file is stored at.
        chunk: Number of bytes requested per read from the response stream.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: the connection failed or timed out.
        OSError: the file could not be written or renamed.
    """
    if dest.exists() and dest.stat().st_size > 0:
        return                                                     # already done

    partial = dest.with_name(dest.name + ".part")
    try:
        # The context manager releases the connection even if the transfer
        # is aborted halfway through the stream.
        with requests.get(url, stream=True, timeout=10) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as fh:
                for blk in resp.iter_content(chunk):
                    if blk:
                        fh.write(blk)
        partial.replace(dest)
    except BaseException:
        # Drop the incomplete file, then let the caller see the original error.
        if partial.exists():
            partial.unlink()
        raise

# Iterate stories
# Fetch every page image of every kept album, one folder per album.
for album_idx, album in enumerate(tqdm.tqdm(english_albums, desc="Albums")):
    # Folder name is "<4-digit position in english_albums>_<album_id>".
    album_id = album["album_id"]
    folder = DRIVE_ROOT.joinpath(f"{album_idx:04d}_{album_id}")
    folder.mkdir(parents=True, exist_ok=True)

    for part in album["story"]:
        idx = part["story_index"]          # page position, used as the file name
        img_url = part["image_url"]
        ext = Path(urlparse(img_url).path).suffix
        if not ext:
            ext = ".jpg"                   # URL path carries no extension
        filename = folder.joinpath(f"{idx:02d}{ext}")

        # One failed image is reported and skipped; the remaining ones still run.
        try:
            download(img_url, filename)
        except Exception as e:
            print(f"[WARN] {img_url} → {filename.name}: {e}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Albums: 100%|██████████| 2531/2531 [2:26:04<00:00,  3.46s/it]


In [None]:
import json, random
from pathlib import Path
from PIL import Image

# Project folder on Drive and the artifacts read from it.
BASE = Path("/content/drive/MyDrive/Project VIST with LLM")
JSON_FILE_eng = BASE.joinpath("dataset_english.json")
JSON_FILE_spa = BASE.joinpath("dataset_spanish.json")
IMG_ROOT = BASE.joinpath("english_img_albums")  # one sub-folder per album

# Read the English albums back from disk.
with JSON_FILE_eng.open(encoding="utf-8") as f:
    albums_english = json.load(f)  # English albums

# NOTE(review): dataset_spanish.json is not written by any cell visible in this
# notebook — presumably it comes from a separate run; confirm.
with JSON_FILE_spa.open(encoding="utf-8") as f:
    albums_spanish = json.load(f)  # Spanish albums

In [None]:
# Number of English albums loaded from dataset_english.json.
len(albums_english)

2531

In [None]:
# Number of Spanish albums loaded from dataset_spanish.json.
len(albums_spanish)

510