In [140]:
import cv2
from pycocotools.coco import COCO
import urllib
import numpy as np
import matplotlib.pyplot as plt
import csv
import json
import pandas as pd

In [17]:
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')

In [18]:
# sees whether a face can be detected in the image
def detect_faces(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    if len(faces) > 0:
        return True
    else:
        return False

In [5]:
instancesFile = '../annotations/instances_val2017.json'
annFile = '../annotations/captions_val2017.json'
cocoInstances = COCO(instancesFile)
cocoAnn = COCO(annFile)

loading annotations into memory...
Done (t=0.99s)
creating index...
index created!
loading annotations into memory...
Done (t=0.30s)
creating index...
index created!


In [8]:
# get annotation id of those within the people subset
catIds = cocoInstances.getCatIds(catNms=['person']) # category Id of people subset
pplIds = cocoInstances.getImgIds(catIds=catIds) # image Ids containing people category
annIds = cocoAnn.getAnnIds(imgIds=pplIds)
imgs = cocoInstances.loadImgs(ids=pplIds)

In [96]:
femaleIndicators = set(['woman', 'women', 'lady', 'girl', 'girls', 'female', 'her'])
maleIndicators = set(['male', 'man', 'boy', 'boys', 'men', 'guy', 'dude', 'his'])

In [41]:
imgId = []
count = 0
with open('../annotations/faceblockNeutral.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['image_id', 'caption'])
    for img in imgs:
        resp = urllib.request.urlopen(img['coco_url'])
        image = np.asarray(bytearray(resp.read()), dtype="uint8")
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        if detect_faces(image):
            continue
        else:
            imgId.append(img['id'])
#             annIds = cocoAnn.getAnnIds(imgIds=[img['id']])
#             anns = cocoAnn.loadAnns(annIds)
#             for ann in anns:
#                 if len(set.intersection(femaleIndicators, ann)) > 0 \
#                     or len(set.intersection(maleIndicators, ann)) > 0:
#                     writer.writerow([img['id'], ann])

In [169]:
count = 0
with open('../annotations/faceblockNeutral.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['image_id', 'caption'])
    for img in imgId:
        annIds = cocoAnn.getAnnIds(imgIds=[img])
        anns = cocoAnn.loadAnns(annIds)
        for ann in anns:
            caption = ann['caption'].split()
            if len(set.intersection(femaleIndicators, set(caption))) > 0 \
            or len(set.intersection(maleIndicators, set(caption))) > 0:
                writer.writerow([img, ann['caption']])
                count += 1
print('Gender Assumed Images: {0}'.format(count))

Gender Assumed Images: 4416


In [157]:
df = pd.read_csv('../annotations/faceblockNeutral.csv')
captions = df['caption']
new_captions = []
for caption in captions:
    caption = caption.lower()
    if len(set.intersection(femaleIndicators, set(caption.split()))) > 0:
        female_words = set.intersection(femaleIndicators, set(caption.split()))
        while len(female_words) > 0:
            female_word = female_words.pop()
            if female_word is 'women':
                caption = caption.replace(female_word, "people") 
            elif female_word is 'her':
                caption = caption.replace(female_word, "their") 
            elif female_word is 'she':
                caption = caption.replace(female_word, "they") 
            else:
                caption = caption.replace(female_word, "person")
    if len(set.intersection(maleIndicators, set(caption.split()))) > 0:
        male_words = set.intersection(maleIndicators, set(caption.split()))
        while len(male_words) > 0:
            male_word = male_words.pop()
            if male_word is 'men':
                caption = caption.replace(male_word, "people") 
            elif male_word is 'his':
                caption = caption.replace(male_word, "their") 
            elif female_word is 'he':
                caption = caption.replace(female_word, "they") 
            else:
                caption = caption.replace(male_word, "person")
    new_captions.append(caption)
df['new_caption'] = new_captions
df.to_csv('../annotations/faceblockNeutral.csv', index=False)

In [154]:
count = 0
with open('../annotations/faceblockMale.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['image_id', 'caption'])
    for img in imgId:
        annIds = cocoAnn.getAnnIds(imgIds=[img])
        anns = cocoAnn.loadAnns(annIds)
        for ann in anns:
            caption = ann['caption'].split()
            if len(set.intersection(maleIndicators, set(caption))) > 0:
                writer.writerow([img, ann['caption']])
                count += 1
print('Male Assumed Images: {0}'.format(count))

Male Assumed Images: 3137


In [175]:
df = pd.read_csv('../annotations/faceblockMale.csv')
captions = df['caption']
new_captions = []
for caption in captions:
    caption = caption.lower()
    if len(set.intersection(maleIndicators, set(caption.split()))) > 0 and len(set.intersection(femaleIndicators, set(caption.split()))) == 0:
        male_words = set.intersection(maleIndicators, set(caption.split()))
        while len(male_words) > 0:
            male_word = male_words.pop()
            if male_word is 'men':
                caption = caption.replace(male_word, "women") 
            elif male_word is 'his':
                caption = caption.replace(male_word, "her") 
            elif female_word is 'he':
                caption = caption.replace(female_word, "she")                 
            else:
                caption = caption.replace(male_word, "woman")
    caption.replace("wowoman", "woman")
    new_captions.append(caption)
df['new_caption'] = new_captions
df.to_csv('../annotations/faceblockMale.csv', index=False)

In [176]:
def createAnnFile(df, filename):
    captions = list(df['new_caption'])
    image_id = list(df['image_id'])
    visited = {}
    output = {"annotations": [], "images": []}
    for index, i in enumerate(image_id):
        if i not in visited:
            visited[i] = True
            output['images'].append({"id": int(i),"file_name": 'COCO_val2014_' + str(i).zfill(12) + ".jpg"})
        output["annotations"].append({"image_id": int(i), "caption": captions[index].strip()})
    print(len(visited.keys()))
    with open(filename, 'w') as f:
        json.dump(output, f)

In [177]:
output = createAnnFile(df, '../annotations/blockMale.json')

1128


In [161]:
print(output)

{'annotations': [{'image_id': 532481, 'caption': 'a person is flying up in the air and having fun.'}, {'image_id': 532481, 'caption': 'a person is waterboarding in the ocean on a windy day.'}, {'image_id': 458755, 'caption': 'young person with sheep on straw covered floor.'}, {'image_id': 458755, 'caption': 'a child places their hands on the head and neck of a sheep while another sheep looks at their face.'}, {'image_id': 458755, 'caption': 'a person kneeling to pet animals while others wait.'}, {'image_id': 385029, 'caption': 'a person cuts a pizza to serve with salad and wine.'}, {'image_id': 393226, 'caption': 'a person who is walking across the street.'}, {'image_id': 393226, 'caption': 'a person walks behind an ice cream truck'}, {'image_id': 393226, 'caption': 'a person is crossing a street near an icecream truck.'}, {'image_id': 393226, 'caption': 'the person is walking behind the concession bus.'}, {'image_id': 532493, 'caption': 'a person surfer is riding a wave on a sunny day