In [None]:
import pandas as pd
import gender_guesser.detector as gender
from utils.preprocess import extract_files, extract_zip
import os
from sklearn import metrics
from scipy.io import loadmat
import numpy as np

This notebook uses the Python `gender_guesser` package to benchmark name-based gender prediction on several datasets. Before running it, set the following variables: <br/>
1. `dataset`: one of "imdb", "wiki", "scholar" or "twitter"
2. `path_to_data`: path to the dataset's zip archive
3. `path_to_output`: path where the output folder will be created

In [None]:
# Which benchmark to run: 'twitter', 'wiki', 'imdb' or 'scholar'.
dataset = 'scholar'
# Location of the dataset's archive file.
path_to_data = 'Scholar.zip'
# Directory the extracted data is written to.
path_to_output = 'scholar/'

In [None]:
# Unpack the dataset archive before the loading cell reads its files.
# NOTE(review): extract_files is a project helper from utils.preprocess; presumably
# it extracts `path_to_data` into `path_to_output` — confirm against its definition.
extract_files(dataset, path_to_data, path_to_output)

In [None]:
def get_names(x):
    """Return the first element of `x`, or an empty string when `x` has no elements."""
    # len() is used instead of truthiness so array-like inputs are handled the same way.
    return x[0] if len(x) > 0 else ''

# Build `names_df` (columns: Name, gender) for the selected dataset.
if dataset == 'twitter':
    # path_to_data[:-4] drops a 4-character extension such as ".zip".
    # NOTE(review): assumes the extraction step created <path_to_output>/<archive name> — confirm.
    twitter_dir = path_to_output + path_to_data[:-4]
    meta_path = twitter_dir + '/_a_results32langs.zip'
    meta_dir = meta_path[:-4] + '/'
    extract_zip(meta_path, meta_dir)

    # DataFrame.append was removed in pandas 2.0: collect the per-file frames
    # and concatenate them once (also avoids quadratic copying).
    meta_frames = [
        pd.read_csv(meta_dir + file)
        for file in sorted(os.listdir(meta_dir))
        if file.endswith('.csv')
    ]
    if not meta_frames:
        raise FileNotFoundError(f'No .csv metadata files found in {meta_dir!r}')
    metadf = pd.concat(meta_frames, ignore_index=True)

    # Attach the gender label to each name by matching the name file's `hash`
    # against the metadata's `temp_file`.
    tw_names = pd.read_csv(twitter_dir + '/' + 'Twitter_names.csv')
    tw_meta = tw_names.merge(metadf[['temp_file', 'indicated_gender']], how='left', left_on='hash', right_on='temp_file')
    del metadf, tw_names
    tw_meta = tw_meta[['Name', 'indicated_gender']].rename(columns={'indicated_gender': 'gender'})
    # Keep only rows whose label is exactly 'male' or 'female'.
    names_df = tw_meta[tw_meta['gender'].isin(['male', 'female'])]

elif dataset == 'imdb' or dataset == 'wiki':
    if dataset == 'imdb':
        path_to_meta = path_to_output + 'imdb_crop/' + dataset + ".mat"
    else:
        path_to_meta = path_to_output + dataset + '/' + dataset + ".mat"
    mat = loadmat(path_to_meta)  # load mat-file
    mdata = mat[dataset]  # variable in mat file

    # Each field is read as mdata[field][0, 0][0].
    # NOTE(review): presumably this unwraps a 1x1 MATLAB struct and takes the single row of each field — confirm.
    dob = mdata['dob'][0, 0][0]
    photo_taken = mdata['photo_taken'][0, 0][0]
    full_path = [entry[0] for entry in mdata['full_path'][0, 0][0]]
    gen = mdata['gender'][0, 0][0]
    # Entries with no name become '' (see get_names).
    name = np.array(list(map(get_names, mdata['name'][0, 0][0])))
    face_location = mdata['face_location'][0, 0][0]
    face_score = mdata['face_score'][0, 0][0]
    second_face_score = mdata['second_face_score'][0, 0][0]

    metadf = pd.DataFrame({
        "dob": dob,
        "photo_taken": photo_taken,
        "full_path": full_path,
        "gender": gen,
        "name": name,
        "face_location": face_location,
        "face_score": face_score,
        "second_face_score": second_face_score,
    })
    # Keep only the second '/'-separated component of each path.
    metadf['full_path'] = metadf['full_path'].apply(lambda x: x.split('/')[1])
    # Rows without a gender label cannot be scored.
    metadf = metadf[metadf['gender'].notna()]
    names_df = metadf[['name', 'gender']].replace({"gender": {1.: 'male', 0: 'female'}}).rename(columns={'name': 'Name'})

elif dataset == 'scholar':
    # Every .csv directly under path_to_output is read; the gender label is the
    # part of the file name before the first underscore.
    scholar_frames = []
    for file in sorted(os.listdir(path_to_output)):
        if file.endswith('.csv'):
            scholar_temp = pd.read_csv(path_to_output + file, names=['Name', 'Image1', 'Image2', 'Image3', 'Image4', 'Image5'])
            scholar_temp['gender'] = file.split('_')[0]
            scholar_frames.append(scholar_temp[['Name', 'gender']])
    if not scholar_frames:
        raise FileNotFoundError(f'No .csv files found in {path_to_output!r}')
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    names_df = pd.concat(scholar_frames)
    # '+' is replaced literally; regex=False keeps this independent of the
    # pandas version's default for `regex`.
    names_df['Name'] = names_df['Name'].str.replace('+', ' ', regex=False)

else:
    # Previously any unrecognised value silently fell through to the scholar branch.
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'twitter', 'wiki', 'imdb' or 'scholar'")
            

In [None]:
# Single detector instance reused for every lookup.
d = gender.Detector()

def get_gender(name):
    """Predict a gender label from the first token of `name`.

    Parameters
    ----------
    name : str
        Full name; only its first whitespace-separated token is passed to the detector.

    Returns
    -------
    str
        The label returned by ``d.get_gender`` for the first token, or
        'unknown' when `name` is not a string (e.g. NaN from a missing value)
        or contains no tokens.
    """
    # Missing names arrive as float NaN, which has no .split().
    if not isinstance(name, str):
        return 'unknown'
    # split() with no argument ignores leading/repeated whitespace, so
    # " John Smith" yields "John" rather than an empty first token.
    tokens = name.split()
    if not tokens:
        return 'unknown'
    return d.get_gender(tokens[0])

names_df['predicted_gender'] = names_df['Name'].apply(get_gender)

In [None]:
# Distribution of the detector's raw labels, before they are remapped in the next cell.
predicted_labels = names_df['predicted_gender']
predicted_labels.value_counts()

In [None]:
# Collapse the detector's labels onto the classes used for scoring:
# "mostly_*" counts as the corresponding gender, and 'andy' (gender_guesser's
# label for androgynous names) is treated as no prediction.
gender_dict = {'mostly_male':'male', "mostly_female":'female', "andy":'unknown'}

names_df = names_df.replace({"predicted_gender": gender_dict})

# Coverage = share of names that received a usable prediction.
# A boolean mask is used because value_counts()['unknown'] raises KeyError
# when no prediction is 'unknown' (i.e. at 100% coverage).
unknown_count = (names_df['predicted_gender'] == 'unknown').sum()
total_count = names_df['predicted_gender'].count()
coverage = 1 - unknown_count/total_count if total_count else float('nan')
print(f'Coverage: {coverage}')

# Metrics are computed only on names the detector could classify.
names_df_clean = names_df[names_df['predicted_gender'] != 'unknown']

y_true = names_df_clean['gender'].astype(str)
y_pred = names_df_clean['predicted_gender'].astype(str)

# Print the confusion matrix
print(metrics.confusion_matrix(y_true, y_pred))
# Print the precision and recall, among other metrics
print(metrics.classification_report(y_true, y_pred, digits=3))

# Same report as a dict so individual figures can be extracted.
metrics_dict = metrics.classification_report(y_true, y_pred, output_dict=True)
weighted_avg = metrics_dict['weighted avg']

# Support-weighted averages across classes, plus overall accuracy.
print('Precision:',round(weighted_avg['precision'],4))
print('Recall:',round(weighted_avg['recall'],4))
print('F1-score:',round(weighted_avg['f1-score'],4))
print('Accuracy:',round(metrics_dict['accuracy'],4))