In [1]:
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import csv
import pandas as pd
import numpy as np

In [2]:
instancesFile = 'annotations/instances_val2017.json'
annFile = 'annotations/captions_val2017.json'
cocoInstances = COCO(instancesFile)
cocoAnn = COCO(annFile)

loading annotations into memory...
Done (t=0.59s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


In [3]:
# get annotation id of those within the people subset
catIds = cocoInstances.getCatIds(catNms=['person']) # category Id of people subset
pplIds = cocoInstances.getImgIds(catIds=catIds) # image Ids containing people category
annIds = cocoAnn.getAnnIds(imgIds=pplIds)

In [4]:
# get corresponding annotations
anns = cocoAnn.loadAnns(annIds)
print(anns[2])

{'image_id': 532481, 'id': 547713, 'caption': 'A man is flying up in the air and having fun. '}


In [5]:
# get the annotations for each image and do cursory check for any gender specific language
femaleIndicators = set(['woman', 'women', 'lady', 'girl', 'girls', 'female'])
maleIndicators = set(['male', 'man', 'boy', 'boys', 'men', 'guy', 'dude'])
with open('peopleAnnotations.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['image_id', 'id', 'caption', 'female_indicator', 'male_indicator'])
    for ann in anns:
        caption = set(ann['caption'].split(' '))
        female = male = 0
        if len(set.intersection(femaleIndicators, caption)) > 0:
            female = 1
        if len(set.intersection(maleIndicators, caption)) > 0:
            male = 1
        writer.writerow([ann['image_id'], ann['id'], ann['caption'], female, male])

In [6]:
# code images as containing "gendered" or "non-gendered" captions
df = pd.read_csv("peopleAnnotations.csv")
df = df.groupby("image_id").sum()

In [7]:
img_df = pd.DataFrame()
img_df['image_id'] = df.index
img_df['female_indicator'] = np.where(df['female_indicator'] >= 3, 1, 0)
img_df['male_indicator'] = np.where(df['male_indicator'] >= 3, 1, 0)
img_df = img_df.sort_values('image_id')

In [8]:
img_df

Unnamed: 0,image_id,female_indicator,male_indicator
0,139,1,0
1,785,1,0
2,872,0,0
3,885,0,1
4,1000,0,0
...,...,...,...
2688,580418,0,0
2689,581062,0,1
2690,581206,0,0
2691,581317,0,0


In [12]:
categories = []
for index, pplId in enumerate(pplIds):
    ann = cocoInstances.getAnnIds(imgIds=pplId)
    annotations = []
    for a in ann:
        ann_j = cocoInstances.loadAnns(ids=a)
        annotations.append(ann_j[0]['category_id'])
    cat_occurences = np.zeros(91)
    for index, cat in enumerate(annotations):
        cat_occurences[annotations[index]] += 1
    categories.append(cat_occurences)
dictionary = {}
categories = np.array(categories)
for i in range(1,91,1):
    dictionary[i] = categories[:, i]

In [14]:
df = pd.DataFrame(dictionary)
df.insert(0, 'image_id', pplIds)
df = df.sort_values('image_id')
fem = img_df['female_indicator'].tolist()
mal = img_df['male_indicator'].tolist()
df.insert(1, 'female_indicator', fem)
df.insert(2, 'male_indicator', mal)

In [148]:
df.to_csv('genderCorr.csv', index = False)

In [15]:
# find which categories co-occur most with certain genders / which are most gender neutral
total = np.array(df.sum()[3:93])
fem_df = df[df['female_indicator']==1]
fem_count = np.array(fem_df.sum()[3:93])
fem_cooccur = np.divide(fem_count, total, out=np.zeros_like(fem_count), where=total!=0)
# count for men
male_df = df[df['male_indicator']==1]
male_count = np.array(male_df.sum()[3:93])
male_cooccur = np.divide(male_count, total, out=np.zeros_like(male_count), where=total!=0)

In [16]:
fem_cooccur = np.where(fem_cooccur > male_cooccur, fem_cooccur, 0)
male_cooccur = np.where(male_cooccur > fem_cooccur, male_cooccur, 0)
fem_cats = np.argsort(fem_cooccur)[::-1] + 1
male_cats = np.argsort(male_cooccur)[::-1] + 1

In [25]:
print(fem_cats)
cats = []
for i in range(1, 91):
    cats.append(i)
cocoInstances.loadCats(ids=cats[13:25])

[82 90 89 81 23 65 64 77 31 17 28 53 50 48 49 57 59 51 86 85 76 88 13  7
  5 26  4 27 24 29 30 32 33 34 25 21  2 14  6  8  9 10 11 12 15 22 16 36
 18 19 20  3 35 45 37 38 67 68 69 70 71 72 73 74 75 78 79 80 83 84 87 66
 63 62 46 39 40 41 42 43 44 47 61 52 54 55 56 58 60  1]


[{'supercategory': 'outdoor', 'id': 14, 'name': 'parking meter'},
 {'supercategory': 'outdoor', 'id': 15, 'name': 'bench'},
 {'supercategory': 'animal', 'id': 16, 'name': 'bird'},
 {'supercategory': 'animal', 'id': 17, 'name': 'cat'},
 {'supercategory': 'animal', 'id': 18, 'name': 'dog'},
 {'supercategory': 'animal', 'id': 19, 'name': 'horse'},
 {'supercategory': 'animal', 'id': 20, 'name': 'sheep'},
 {'supercategory': 'animal', 'id': 21, 'name': 'cow'},
 {'supercategory': 'animal', 'id': 22, 'name': 'elephant'},
 {'supercategory': 'animal', 'id': 23, 'name': 'bear'},
 {'supercategory': 'animal', 'id': 24, 'name': 'zebra'},
 {'supercategory': 'animal', 'id': 25, 'name': 'giraffe'}]

In [18]:
cocoInstances.loadCats(ids=male_cats[:25])

[{'supercategory': 'appliance', 'id': 80, 'name': 'toaster'},
 {'supercategory': 'accessory', 'id': 32, 'name': 'tie'},
 {'supercategory': 'sports', 'id': 41, 'name': 'skateboard'},
 {'supercategory': 'sports', 'id': 34, 'name': 'frisbee'},
 {'supercategory': 'animal', 'id': 16, 'name': 'bird'},
 {'supercategory': 'food', 'id': 56, 'name': 'broccoli'},
 {'supercategory': 'sports', 'id': 37, 'name': 'sports ball'},
 {'supercategory': 'sports', 'id': 43, 'name': 'tennis racket'},
 {'supercategory': 'electronic', 'id': 75, 'name': 'remote'},
 {'supercategory': 'furniture', 'id': 63, 'name': 'couch'},
 {'supercategory': 'sports', 'id': 42, 'name': 'surfboard'},
 {'supercategory': 'food', 'id': 58, 'name': 'hot dog'},
 {'supercategory': 'animal', 'id': 25, 'name': 'giraffe'},
 {'supercategory': 'appliance', 'id': 78, 'name': 'microwave'},
 {'supercategory': 'food', 'id': 55, 'name': 'orange'},
 {'supercategory': 'appliance', 'id': 79, 'name': 'oven'},
 {'supercategory': 'food', 'id': 54, 'n