# Gender Categories

In [1]:
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import csv
import pandas as pd
import numpy as np

In [5]:
instancesFile = '../annotations/instances_val2017.json'
annFile = '../annotations/captions_val2017.json'
cocoInstances = COCO(instancesFile)
cocoAnn = COCO(annFile)

loading annotations into memory...
Done (t=0.83s)
creating index...
index created!
loading annotations into memory...
Done (t=0.21s)
creating index...
index created!


In [8]:
# get annotation id of those within the people subset
catIds = cocoInstances.getCatIds(catNms=['person']) # category Id of people subset
pplIds = cocoInstances.getImgIds(catIds=catIds) # image Ids containing people category
annIds = cocoAnn.getAnnIds(imgIds=pplIds)
anns = cocoAnn.loadAnns(annIds)

In [9]:
# get the annotations for each image and do cursory check for any gender specific language
femaleIndicators = set(['woman', 'women', 'lady', 'girl', 'girls', 'female'])
maleIndicators = set(['male', 'man', 'boy', 'boys', 'men', 'guy', 'dude'])
with open('peopleAnnotations.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['image_id', 'id', 'caption', 'female_indicator', 'male_indicator'])
    for ann in anns:
        caption = set(ann['caption'].split(' '))
        female = male = 0
        if len(set.intersection(femaleIndicators, caption)) > 0:
            female = 1
        if len(set.intersection(maleIndicators, caption)) > 0:
            male = 1
        writer.writerow([ann['image_id'], ann['id'], ann['caption'], female, male])

In [10]:
# code images as containing "gendered" or "non-gendered" captions
df = pd.read_csv("../annotations/peopleAnnotations.csv")
df = df.groupby("image_id").sum()

In [11]:
img_df = pd.DataFrame()
img_df['image_id'] = df.index
img_df['female_indicator'] = np.where(df['female_indicator'] >= 3, 1, 0)
img_df['male_indicator'] = np.where(df['male_indicator'] >= 3, 1, 0)
img_df = img_df.sort_values('image_id')

In [12]:
categories = []
for index, pplId in enumerate(pplIds):
    ann = cocoInstances.getAnnIds(imgIds=pplId)
    annotations = []
    for a in ann:
        ann_j = cocoInstances.loadAnns(ids=a)
        annotations.append(ann_j[0]['category_id'])
    cat_occurences = np.zeros(91)
    for index, cat in enumerate(annotations):
        cat_occurences[annotations[index]] = 1
    categories.append(cat_occurences)
dictionary = {}
categories = np.array(categories)
for i in range(1,91,1):
    dictionary[i] = categories[:, i]

In [13]:
df = pd.DataFrame(dictionary)
df.insert(0, 'image_id', pplIds)
df = df.sort_values('image_id')
fem = img_df['female_indicator'].tolist()
mal = img_df['male_indicator'].tolist()
df.insert(1, 'female_indicator', fem)
df.insert(2, 'male_indicator', mal)

In [23]:
# save the df to a .csv file
df.to_csv('../annotations/genderCorr.csv', index = False)

In [14]:
# find which categories co-occur most with certain genders / which are most gender neutral
total = np.array(df.sum()[3:93])
fem_df = df[df['female_indicator']==1]
fem_count = np.array(fem_df.sum()[3:93])
fem_cooccur = np.divide(fem_count, total, out=np.zeros_like(fem_count), where=total!=0)
# count for men
male_df = df[df['male_indicator']==1]
male_count = np.array(male_df.sum()[3:93])
male_cooccur = np.divide(male_count, total, out=np.zeros_like(male_count), where=total!=0)

In [27]:
fem_cooccur = np.where(fem_cooccur > male_cooccur, fem_cooccur, 0)
male_cooccur = np.where(male_cooccur > fem_cooccur, male_cooccur, 0)
fem_cats = np.argsort(fem_cooccur)[::-1] + 1
male_cats = np.argsort(male_cooccur)[::-1] + 1

In [28]:
cocoInstances.loadCats(ids=fem_cats[:18])

[{'supercategory': 'indoor', 'id': 90, 'name': 'toothbrush'},
 {'supercategory': 'appliance', 'id': 82, 'name': 'refrigerator'},
 {'supercategory': 'indoor', 'id': 89, 'name': 'hair drier'},
 {'supercategory': 'appliance', 'id': 81, 'name': 'sink'},
 {'supercategory': 'animal', 'id': 23, 'name': 'bear'},
 {'supercategory': 'furniture', 'id': 64, 'name': 'potted plant'},
 {'supercategory': 'indoor', 'id': 88, 'name': 'teddy bear'},
 {'supercategory': 'electronic', 'id': 77, 'name': 'cell phone'},
 {'supercategory': 'food', 'id': 52, 'name': 'banana'},
 {'supercategory': 'accessory', 'id': 28, 'name': 'umbrella'},
 {'supercategory': 'accessory', 'id': 31, 'name': 'handbag'},
 {'supercategory': 'indoor', 'id': 86, 'name': 'vase'},
 {'supercategory': 'kitchen', 'id': 50, 'name': 'spoon'},
 {'supercategory': 'food', 'id': 57, 'name': 'carrot'},
 {'supercategory': 'indoor', 'id': 85, 'name': 'clock'},
 {'supercategory': 'animal', 'id': 17, 'name': 'cat'},
 {'supercategory': 'outdoor', 'id': 

In [29]:
cocoInstances.loadCats(ids=male_cats[:20])

[{'supercategory': 'appliance', 'id': 80, 'name': 'toaster'},
 {'supercategory': 'accessory', 'id': 32, 'name': 'tie'},
 {'supercategory': 'sports', 'id': 41, 'name': 'skateboard'},
 {'supercategory': 'sports', 'id': 34, 'name': 'frisbee'},
 {'supercategory': 'animal', 'id': 24, 'name': 'zebra'},
 {'supercategory': 'sports', 'id': 42, 'name': 'surfboard'},
 {'supercategory': 'sports', 'id': 43, 'name': 'tennis racket'},
 {'supercategory': 'electronic', 'id': 73, 'name': 'laptop'},
 {'supercategory': 'sports', 'id': 37, 'name': 'sports ball'},
 {'supercategory': 'furniture', 'id': 63, 'name': 'couch'},
 {'supercategory': 'food', 'id': 54, 'name': 'sandwich'},
 {'supercategory': 'electronic', 'id': 75, 'name': 'remote'},
 {'supercategory': 'appliance', 'id': 79, 'name': 'oven'},
 {'supercategory': 'appliance', 'id': 78, 'name': 'microwave'},
 {'supercategory': 'furniture', 'id': 65, 'name': 'bed'},
 {'supercategory': 'indoor', 'id': 84, 'name': 'book'},
 {'supercategory': 'kitchen', 'id'