In [None]:
# #############################################################################
# Importing necessary libraries
import numpy as np
import pandas as pd
import json
from pathlib import Path
from m3inference import M3Inference
from collections import OrderedDict
#import pprint
#import urllib.request
#import random
import re
import os
import csv 
import sys
sys.path.append('../utils/')
import string 
import operator
import sklearn
from sklearn import metrics
from io import BytesIO
from scipy.io import loadmat
from m3preprocess import extract_files, preprocess_images, extract_zip, get_scholar_data
from PIL import Image
import matplotlib as plt

This notebook runs M3 on image datasets and reports the performance statistics. It uses <a href="https://github.com/euagendas/m3inference">this M3 implementation</a>.

You should specify the following variables when running the notebook:
- **dataset**: which dataset to use. Must be exactly one of the following values (lowercase, as compared in the code): `wiki`, `imdb`, `twitter`, `scholar`, `oui`, `gender_shade`. For new datasets, please modify the code that extracts the images and reads the annotations file
- **path_to_data**: path to the data (original data is saved here: 175.238.89:/bigdisk/gender_inference/Unpruned_data/) in .zip or .tar format
- **path_to_output**: path to where the data will be extracted, as well as where data.jsonl and result.csv will be saved
- **with_name**: boolean parameter specifying whether M3 should run with names (default: False)
<br>

In [None]:
# #############################################################################
# Necessary variables to change
# dataset: key that selects the dataset-specific branches in the cells below
dataset = 'twitter' #twitter, wiki, imdb, gender_shade, scholar or oui
# path_to_data: archive with the data; later cells strip its last 4 characters
# (path_to_data[:-4]) to build folder and file names
path_to_data = 'Twitter_full.zip'
# path_to_output: prefix that later cells concatenate directly with file names,
# so it needs its trailing '/'
path_to_output = 'twitter/'
# with_name: when True, the looked-up name is written into each json entry for M3;
# when False, the name field is left empty
with_name = True

In [None]:
# #############################################################################
# Unpack the archive and collect the path of every jpg/png image
path_to_images = extract_files(dataset, path_to_data, path_to_output)
#path_to_images = path_to_output + 'wiki/'

image_suffixes = ('jpg', 'png')
subfolders = next(os.walk(path_to_images))[1]  # immediate sub-directories only

if len(subfolders) == 0:
    # Flat layout: every image sits directly inside path_to_images
    images = [path_to_images + entry
              for entry in os.listdir(path_to_images)
              if entry.endswith(image_suffixes)]
elif dataset == 'scholar':
    # get_scholar_data returns the scholar annotations; the images are then
    # read from the local 'scholar/' folder
    scholar = get_scholar_data(path_to_images, path_to_output)
    images = ['scholar/' + entry
              for entry in os.listdir('scholar/')
              if entry.endswith('.png')]
else:
    # Nested layout: images are spread over the sub-directories
    images = []
    for subfolder in subfolders:
        folder = path_to_images + subfolder
        for entry in os.listdir(folder + '/'):
            if entry.endswith(image_suffixes):
                images.append(folder + '/' + entry)

print('Total number of images:', len(images))

In [None]:
# #############################################################################
# extract annotations file

### for .mat file for IMDB ###
def get_names(x):
    """Return the first element of ``x``, or an empty string when ``x`` is empty.

    Used below to unwrap the per-record name containers read from the .mat file.
    """
    return x[0] if len(x) > 0 else ''
    
### for .mat file for imdb and wiki ###
if dataset == 'imdb' or dataset == 'wiki':
    if dataset == 'imdb':
        path_to_meta = path_to_output + 'imdb_crop/' + dataset + ".mat"
    else:
        path_to_meta = path_to_output + dataset + '/' + dataset + ".mat"
    mat = loadmat(path_to_meta)  # load mat-file
    mdata = mat[dataset]  # variable in mat file
    mdtype = mdata.dtype
    ndata = {n: mdata[n][0, 0] for n in mdtype.names}
    columns = list(ndata)

    # Every field is stored as a nested array; [0, 0][0] unwraps it to one entry per image
    dob = mdata['dob'][0, 0][0]
    photo_taken = mdata['photo_taken'][0, 0][0]
    full_path = [entry[0] for entry in mdata['full_path'][0, 0][0]]
    gender = mdata['gender'][0, 0][0]
    name = np.array(list(map(get_names, mdata['name'][0, 0][0])))
    face_location = mdata['face_location'][0, 0][0]
    face_score = mdata['face_score'][0, 0][0]
    second_face_score = mdata['second_face_score'][0, 0][0]
    #celeb_id = mdata['celeb_id'][0,0][0]

    metadf = pd.DataFrame({"dob": dob, "photo_taken": photo_taken, "full_path": full_path,
                           "gender": gender, "name": name, "face_location": face_location,
                           "face_score": face_score, "second_face_score": second_face_score})
                  #index=celeb_id)
    # Keep only the file name (second path component) so it can be matched against image names
    metadf['full_path'] = metadf['full_path'].apply(lambda x: x.split('/')[1])


elif dataset == 'oui':
    extract_zip('OUI_annotations.zip', 'OUI')
    # Read the five fold files and concatenate them once (instead of growing a frame per file)
    fold_frames = []
    for fold in range(5):
        fold_df = pd.read_csv(f'OUI/fold_frontal_{fold}_data.txt', sep="\t", header=None)
        fold_df.columns = fold_df.iloc[0]  # first row holds the column names
        fold_frames.append(fold_df.drop(0))
    metadf = pd.concat(fold_frames)

elif dataset == 'twitter':
    meta_path = path_to_output + path_to_data[:-4] + '/_a_results32langs.zip'
    meta_dir = meta_path[:-4] + '/'
    extract_zip(meta_path, meta_dir)
    # DataFrame.append was removed in pandas 2.0: collect the frames and concatenate once
    csv_frames = [pd.read_csv(meta_dir + file)
                  for file in os.listdir(meta_dir)
                  if file.endswith('.csv')]
    metadf = pd.concat(csv_frames) if csv_frames else pd.DataFrame()
    metadf = metadf.reset_index()
    tw_names = pd.read_csv(path_to_output + path_to_data[:-4] + '/' + 'Twitter_names.csv')

else:
    # NOTE(review): `scholar` is only assigned by the image-listing cell when
    # dataset == 'scholar' and the images are in sub-folders; any other dataset
    # reaching this branch (e.g. gender_shade) fails here with a NameError.
    metadf = scholar.copy()


display(metadf.head())

In [None]:
# Sanity check (twitter only): print the name matched to every image hash.
# `tw_names` is loaded only for the twitter dataset, and a hash may have no row,
# in which case an empty string is printed instead of raising an IndexError.
if dataset == 'twitter':
    for image in images:
        tw_hash = image.split('/')[-1][:-4]  # file name without its 4-character extension
        matched_names = tw_names.loc[tw_names['hash'] == tw_hash, 'Name'].values
        print(matched_names[0] if len(matched_names) > 0 else '')

In [None]:
# Inspect the hash/name table read from Twitter_names.csv (assigned only when dataset == 'twitter')
tw_names

In [None]:
# #############################################################################
# preprocess images and create a json file for M3

_json = path_to_output + path_to_data[:-4] + '/data.jsonl'

data = {'images': []}
for image in images:
    # specify skip = False if size condition should be ignored (height+width>=400)
    valid = preprocess_images(image, 224, 224, skip = False)
    if valid == False:
        continue  # image rejected by the preprocessing step

    image_file = image.split('/')[-1]

    # Default for datasets without name information (e.g. oui, gender_shade),
    # so the entry never picks up an undefined or stale `name`.
    name = ''
    if with_name:
        if dataset == 'scholar':
            # drops the last 11 characters of the file name and turns '+' into spaces
            # NOTE(review): presumably strips a fixed-length suffix + extension — confirm naming scheme
            name = image.split('/')[1][:-11].replace('+', ' ')
        elif dataset == 'wiki' or dataset == 'imdb':
            matches = metadf.loc[metadf['full_path'] == image_file, 'name'].values
            name = matches[0] if len(matches) > 0 else ''
        elif dataset == 'twitter':
            tw_hash = image_file[:-4]
            matches = tw_names.loc[tw_names['hash'] == tw_hash, 'Name'].values
            name = matches[0] if len(matches) > 0 else ''
        # Missing values read from the annotation files (NaN) become an empty name;
        # any other non-string value is converted to text.
        if not isinstance(name, str):
            name = '' if pd.isna(name) else str(name)

    data['images'].append({
        "description": "",
        "id": image_file,
        "img_path": image,
        "lang": "en",
        "name": name,
        "screen_name": ""
    })

with open(_json, 'w') as json_file:  # json file for m3 is created
    json.dump(data, json_file)

print('Json saved at ', _json)

In [None]:
# #############################################################################
# run M3 and infer gender

def M3_inference(path_to_output, _json):
    """Run M3 on the entries of a json file and store the predictions.

    Reads the ``images`` list from ``_json``, passes it to the notebook-level
    ``m3`` model and writes two files whose paths start with
    ``path_to_output + path_to_data[:-4]`` (``path_to_data`` is a notebook-level
    variable): ``...predictions.json`` with the raw predictions and
    ``...result.csv`` with one row per image.

    Args:
        path_to_output: prefix of the output file paths.
        _json: path of the json file holding the ``images`` entries.

    A ``urllib.error.HTTPError`` raised while running is printed, not propagated.
    """
    # Imported locally: the notebook's import cell does not import urllib, so the
    # except clause below would otherwise turn every error into a NameError.
    import urllib.error

    # NOTE: no '/' is inserted before the file names; the results cell reads
    # result.csv from this same path, so the prefix is kept as is.
    output_prefix = path_to_output + path_to_data[:-4]
    try:
        with open(_json) as json_file:
            data = json.load(json_file)
        pred = m3.infer(data['images'])  # get the predictions from json file
        with open(output_prefix + 'predictions.json', 'w') as pred_file:
            json.dump(pred, pred_file, indent=3)
        print(f'Predictions are finished for {len(pred)} images')

        with open(output_prefix + 'result.csv', 'w', newline='') as output:  # output file is created
            wr = csv.writer(output, quoting=csv.QUOTE_ALL)
            wr.writerow(['Imagename', 'Predicted_Gender', 'Is_Org'])  # header row
            for image_id, scores in pred.items():
                gender_conf = scores['gender']  # confidence per gender label
                # choose the gender label with the highest confidence score
                gender_label, _ = max(gender_conf.items(), key=operator.itemgetter(1))
                # flag the image as an organisation when its score exceeds 0.5
                is_org = bool(scores['org']['is-org'] > 0.5)
                wr.writerow([image_id, gender_label, is_org])
    except urllib.error.HTTPError as e:
        print(e)

# Instantiate the M3 model; M3_inference reads this notebook-level `m3`
m3 = M3Inference()
# Run inference on the json file and write the predictions.json / result.csv outputs
M3_inference(path_to_output, _json)

In [None]:
# #############################################################################
# extract the result of predictions

def org(row):
    """Return the evaluation label for one result row.

    Rows whose ``Is_Org`` value equals ``True`` are labelled ``'orga'``;
    every other row keeps its ``Predicted_Gender`` value.
    """
    return 'orga' if row['Is_Org'] == True else row['Predicted_Gender']

# Predictions written by M3_inference (note: no '/' between folder prefix and file name)
results = pd.read_csv(path_to_output+ path_to_data[:-4]+'result.csv')
if dataset == 'oui':
    # keep only the last two dot-separated parts of the image name to match 'original_image'
    results['Imagename'] = results['Imagename'].apply(lambda x: '.'.join(x.split('.')[-2:]))
    results_merged = pd.merge(results, metadf, left_on='Imagename', right_on='original_image')
    # reduce the predicted label to its first letter
    results_merged['Predicted_Gender'] = results_merged['Predicted_Gender'].apply(lambda x: x[0])

elif dataset == 'twitter':
    results['Imagename'] = results['Imagename'].apply(lambda x: x[:-4]) #for Twitter need to remove .png
    results_merged = pd.merge(results, metadf, left_on='Imagename', right_on='temp_file')
    results_merged['Predicted_Gender'] = results_merged.apply(org, axis=1)
    # Build one combined mask on the merged frame; chaining two boolean selections
    # applies a mask computed on the unfiltered frame to the already filtered one.
    confident = results_merged['indicated_gender:confidence'] >= 0.8
    known_label = results_merged['indicated_gender'].isin(['male', 'female', 'orga'])
    results_merged = results_merged.loc[confident & known_label]
    results_merged = results_merged.rename({"indicated_gender":"gender"}, axis=1)

elif dataset == 'imdb' or dataset =='wiki':
    results_merged = pd.merge(results, metadf, left_on='Imagename', right_on='full_path') #merge results and annotations
    # Rows with a missing gender annotation are dropped; otherwise NaN would be
    # mapped to "male" by the `x == 0` test below and distort the metrics.
    results_merged = results_merged.dropna(subset=['gender'])
    results_merged = results_merged.assign(
        gender=results_merged['gender'].apply(lambda x: "female" if x == 0 else "male")
    )

elif dataset == 'scholar':
    results_merged = pd.merge(results, scholar, left_on='Imagename', right_on='file_name').drop_duplicates()

results_merged

#results_merged = results_merged[(results_merged['gender']=='f') | (results_merged['gender']=='m') ]

In [None]:
### This cell is used for scholar data, to see how predicted labels vary for 1 person ###
def f(group):
    """Share of a group's rows that carry its least frequent predicted label.

    The count of the rarest ``Predicted_Gender`` value is divided by the total
    of all counts, so a group with a single distinct label yields 1.0.
    """
    label_counts = group['Predicted_Gender'].value_counts()
    return label_counts.min() / label_counts.sum()

if dataset == 'scholar':
    # Per-person share of the least frequent predicted label, shown as a histogram
    per_person = results_merged[['Name', 'Predicted_Gender', 'gender']].groupby(['Name'])
    variation = per_person.apply(f)
    variation.hist()

In [None]:
# #############################################################################
# Confusion matrix and performance metrics
y_true = results_merged['gender']            # annotated labels
y_pred = results_merged['Predicted_Gender']  # labels predicted by M3

# Confusion matrix, followed by the per-class precision/recall report
print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.classification_report(y_true, y_pred, digits=3))

# Same report as a dict, to print the weighted averages and the accuracy
metrics_dict = metrics.classification_report(y_true, y_pred, output_dict=True)
weighted_avg = metrics_dict['weighted avg']
for label, key in (('Precision', 'precision'), ('Recall', 'recall'), ('F1-score', 'f1-score')):
    print(label + ':', round(weighted_avg[key], 4))
print('Accuracy:', round(metrics_dict['accuracy'], 4))