In [1]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os 
import pandas as pd 
import numpy as np
from tqdm import tqdm
from google.colab import files

In [3]:
!wget https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip

--2023-01-16 15:20:17--  https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36745453 (35M) [application/zip]
Saving to: ‘caption_datasets.zip’


2023-01-16 15:20:19 (39.2 MB/s) - ‘caption_datasets.zip’ saved [36745453/36745453]



In [4]:
!unzip caption_datasets.zip -d caption_datasets

Archive:  caption_datasets.zip
  inflating: caption_datasets/dataset_coco.json  
  inflating: caption_datasets/dataset_flickr30k.json  
  inflating: caption_datasets/dataset_flickr8k.json  


In [5]:
dfjson = pd.read_json('./caption_datasets/dataset_flickr30k.json')

list_df = [('image_ID', 'caption_ID', 'caption', 'split')]

for k_img in range(len(dfjson['images'])):
    dict_img = dfjson['images'][k_img]
    imgid = dict_img['filename'].split('.')[0]
    split = dict_img['split']
    text = dict_img['sentences']
    # for each caption
    for capt in text:
        # IID, CID, caption, split
        list_df.append((imgid, capt['sentid'], capt['raw'], split))
        
df = pd.DataFrame(list_df[1:], columns=list_df[0])#.set_index(['IID', 'CID'])
df

Unnamed: 0,image_ID,caption_ID,caption,split
0,1000092795,0,Two young guys with shaggy hair look at their ...,train
1,1000092795,1,"Two young, White males are outside near many b...",train
2,1000092795,2,Two men in green shirts are standing in a yard.,train
3,1000092795,3,A man in a blue shirt standing in a garden.,train
4,1000092795,4,Two friends enjoy time spent together.,train
...,...,...,...,...
155065,998845445,155065,A man in shorts and a Hawaiian shirt leans ove...,train
155066,998845445,155066,"A young man hanging over the side of a boat, w...",train
155067,998845445,155067,A man is leaning off of the side of a blue and...,train
155068,998845445,155068,"A man riding a small boat in a harbor, with fo...",train


In [None]:
df.to_csv('Caption_all.tsv', sep='\t', index=False)
files.download('Caption_all.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Skills

In [6]:
from skillschecker import skillsChecker



In [7]:
checker = skillsChecker()
list_captions = df.caption.values
dft = df[df.split == 'train']
skill = 'gender'

In [None]:
list_tuple_new_captions, wds = checker.change_captions_skill(list_captions=dft.caption, skill=skill, verbose=True)

100%|██████████| 36/36 [00:00<00:00, 3308.46it/s]
30it [00:00, 978.84it/s]


[('A man holds a large stuffed lion toy.',
  ['A woman holds a large stuffed lion toy.']),
 ('A man is smiling at a stuffed lion',
  ['A woman is smiling at a stuffed lion']),
 ('A girl is on rollerskates talking on her cellphone standing in a parking lot.',
  ['A boy is on rollerskates talking on his cellphone standing in a parking lot.'])]

In [None]:
list_tuple_new_captions, wds = checker.change_captions_skill(list_captions=dft.caption, skill=skill, verbose=True)

capt_selected = [k[0] for k in list_tuple_new_captions]
dftskill = dft[dft.caption.isin(capt_selected)]

100%|██████████| 145000/145000 [00:20<00:00, 7101.07it/s]
83164it [00:58, 1416.94it/s]


In [None]:
dftskill['word_detected'] = wds
dftskill['augmented_captions'] = [k[1] for k in list_tuple_new_captions]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dftskill['word_detected'] = wds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dftskill['augmented_captions'] = [k[1] for k in list_tuple_new_captions]


In [None]:
import spacy 
nlp = spacy.load("en_core_web_sm")

def find_prompt(x):
    """
    """
    prompt_segmentation = []
    doc = nlp(x['caption'])
    for wd in x['word_detected']:
        list_candidates = [k.text for k in doc.noun_chunks if wd in k.text.lower()]
        if len(list_candidates) > 1:
            prompt_segmentation.append('')
        elif len(list_candidates) == 0:
            prompt_segmentation.append(wd)
        else:
            prompt_segmentation.append(list_candidates[0])
    return prompt_segmentation

In [None]:
dftskill['prompt_segmentation'] = dftskill.apply(find_prompt, axis=1)

In [21]:
# Only keep when the noun phrase has been detected
dftskill = dftskill[dftskill['prompt_segmentation'].map(lambda x: np.mean([len(k) > 0 for k in x]) == 1)]

In [22]:
dftskill

Unnamed: 0,image_ID,caption_ID,caption,split,word_detected,augmented_captions,prompt_segmentation
0,1000092795,0,Two young guys with shaggy hair look at their ...,train,['guys'],['Two young girls with shaggy hair look at the...,[Two young guys]
1,1000092795,2,Two men in green shirts are standing in a yard.,train,['men'],['Two women in green shirts are standing in a ...,[Two men]
2,1000092795,3,A man in a blue shirt standing in a garden.,train,['man'],['A woman in a blue shirt standing in a garden.'],[A man]
3,10002456,5,Several men in hard hats are operating a giant...,train,['men'],['Several women in hard hats are operating a g...,[Several men]
4,10002456,7,Two men working on a machine wearing hard hats.,train,['men'],['Two women working on a machine wearing hard ...,[Two men]
...,...,...,...,...,...,...,...
83159,998845445,155065,A man in shorts and a Hawaiian shirt leans ove...,train,['man'],['A woman in shorts and a Hawaiian shirt leans...,[A man]
83160,998845445,155066,"A young man hanging over the side of a boat, w...",train,['man'],['A young woman hanging over the side of a boa...,[A young man]
83161,998845445,155067,A man is leaning off of the side of a blue and...,train,['man'],['A woman is leaning off of the side of a blue...,[A man]
83162,998845445,155068,"A man riding a small boat in a harbor, with fo...",train,['man'],"['A woman riding a small boat in a harbor, wit...",[A man]


In [None]:
dftskill.to_csv('Caption_training_%s.tsv'%skill, sep='\t', index=False)
files.download('Caption_training_%s.tsv'%skill)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
dftest = df[df.split == 'test']

for sk in ['gender', 'color', 'emotion', 'counting']:
  capts, _ = checker.find_captions_skill(list_captions=dftest.caption, skill=sk)
  dftestskill = dftest[dftest.caption.isin(capts)]
  dftestskill.to_csv('Caption_testing_%s.tsv'%sk, sep='\t', index=False)
  files.download('Caption_testing_%s.tsv'%sk)