In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ast import literal_eval
from IPython.display import display
import json
import numpy as np
import pandas as pd
import os
import random
import re
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
import tensorflow as tf

In [3]:
# gs:// location of the CSV that is loaded into df in the next cell.
scored_data = 'gs://conversationai-models/biosbias/scored_data/test_male_female_standard_0415.csv'

In [4]:
# Read the scored CSV through tf.gfile (the path is a gs:// URI) and keep one row per distinct
# 'tokens' value. The context manager closes the file handle once the frame is loaded.
with tf.gfile.Open(scored_data) as scored_file:
    df = pd.read_csv(scored_file).drop_duplicates(subset=['tokens'])

In [5]:
# Maps each model key (the prefix of that model's score columns in df) to the short alias
# used when naming result columns and printing.
MODEL_NAMES = {
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157': 'classif_male',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223214': 'classif_female',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117': 'classif_standard',
}

### Preprocessing

In [6]:
def get_class_from_col_name(col_name):
    """Return the integer that follows the last underscore of `col_name`.

    The name must end in '_<digits>' (e.g. '<model>_<class id>'); otherwise the
    regex does not match and calling `.group` on None raises AttributeError.
    """
    trailing_id = re.match(r'.*_(\d+)$', col_name)
    return int(trailing_id.group(1))

def find_best_class(df, model_name, class_names):
    """Add a '<model_name>_class' column to `df` (in place) with the best-scoring class id.

    For every row the column among '<model_name>_<class_name>' holding the largest
    value is located and its trailing class id is stored. Returns None.
    """
    score_columns = [
        '{}_{}'.format(model_name, class_name) for class_name in class_names
    ]
    best_column = df[score_columns].idxmax(axis=1)
    df['{}_class'.format(model_name)] = best_column.apply(get_class_from_col_name)

In [7]:
# Class ids 0..32; per-class score columns are named '<model key>_<class id>'.
CLASS_NAMES = range(33)

# Add a '<model key>_class' column (arg-max class id) for every model; df is modified in place.
for _model in MODEL_NAMES:
    find_best_class(df, _model, CLASS_NAMES)

In [8]:
# Labels with either gender having too few examples
# (fewer than 5 rows for an observed label/gender pair). The assert stops the run if any exist.
# NOTE(review): a label/gender pair with zero rows produces no entry in value_counts and so
# would not be flagged here -- confirm that is acceptable.
bad_labels = df.groupby('label').gender.value_counts().reset_index(name = 'count').query('count < 5').label.values
assert len(bad_labels) == 0

### Accuracy Calculation

In [9]:
# Overall accuracy per model: share of rows whose arg-max class equals the true label.
accuracy_list = []
for _model in MODEL_NAMES:
    is_correct = (df['{}_class'.format(_model)] == df['label'])
    # The mean of a boolean Series is the fraction of True values; this stays vectorised
    # instead of summing row by row with the Python builtin.
    _acc = is_correct.mean()
    accuracy_list.append(_acc)
    print('Accuracy for model {}: {}'.format(MODEL_NAMES[_model], _acc))

Accuracy for model classif_female: 0.788696844076
Accuracy for model classif_standard: 0.8179326023
Accuracy for model classif_male: 0.781943701524


### Fairness Metrics

In [10]:
# One boolean indicator column per class: 'label_<k>' is True where the true label is k.
for _class in CLASS_NAMES:
    df['label_{}'.format(_class)] = df['label'].eq(_class)

In [11]:
# Gender ratios of classes
# gender_counts has one row per observed (label, gender) pair, with the row count in 'count'.
gender_counts = df.groupby('label').gender.value_counts().reset_index(name = 'count')

def frac_female(df):
    """Return the share of female examples for one label.

    Args:
        df: rows of the per-(label, gender) count table for a single label, with
            columns 'label', 'gender' ("M"/"F") and 'count'.

    Returns:
        dict with the label ('label') and f_count / (m_count + f_count) ('frac_female').

    Raises:
        ValueError: if the rows contain neither an "M" nor an "F" count.
    """
    def _count_for(gender):
        # A gender with no row for this label has zero examples; do not fail on .values[0].
        matches = df.loc[df['gender'] == gender, 'count'].values
        return matches[0] if len(matches) else 0

    m_count = _count_for("M")
    f_count = _count_for("F")
    if m_count + f_count == 0:
        raise ValueError('no "M" or "F" examples - impossible to define frac_female')
    return {'label': df['label'].values[0], 'frac_female': f_count/(m_count+f_count)}

# One row per label with its female fraction. The groups are iterated explicitly (in sorted
# label order) so each call receives the full group frame, including the 'label' column,
# instead of relying on how groupby.apply treats dict return values and grouping columns.
frac_female_df = pd.DataFrame(
    list(frac_female(_label_rows) for _label, _label_rows in gender_counts.groupby('label')))

In [12]:
# NITHUM IMPLEMENTATION (threshold-based variant; kept commented out, not used by the cells below)

# def compute_tpr(df, _class, _model, threshold = 0.5):
#     tpr = metrics.recall_score(df['label_{}'.format(_class)],
#                                df['{}_{}'.format(_model,_class)] > threshold)
#     return tpr
    

# def compute_tpr_tnr(df, _class, _model, threshold = 0.5):
#     #cm = metrics.confusion_matrix(df['label_{}'.format(_class)],
#     #                              df['{}_{}'.format(_model,_class)] > threshold)
#     cm = pd.crosstab(df['label_{}'.format(_class)], df['{}_{}'.format(_model,_class)] > threshold)
#     #display(cm)
#     if cm.shape[1] > 1:
#         tn = cm.iloc[0,0]
#         fp = cm.iloc[0,1]
#         fn = cm.iloc[1,0]
#         tp = cm.iloc[1,1]
#         tpr = tp/(tp+fn)    
#         tnr = tn/(tn+fp)
#     else:
#         tpr = 0
#         tnr = 1
#     return tpr, tnr

In [13]:
# Computing the values ourselves (threshold-based variant; kept commented out, not used by the cells below)

# def compute_tpr(df, _class, _model, threshold = 0.5):
#     tpr = metrics.recall_score(df['label_{}'.format(_class)],
#                                df['{}_{}'.format(_model,_class)] > threshold)
#     return tpr

# def compute_tpr_tnr(df, _class, _model, threshold = 0.5):
#     true_col = 'label_{}'.format(_class)
#     pred_col = '{}_{}'.format(_model,_class)
#     tn = len(df.loc[(df[true_col] == False) & (df[pred_col] <= threshold)])
#     fp = len(df.loc[(df[true_col] == False) & (df[pred_col] > threshold)])
#     fn = len(df.loc[(df[true_col] == True) & (df[pred_col] <= threshold)])
#     tp = len(df.loc[(df[true_col] == True) & (df[pred_col] > threshold)])

#     if tp + fn == 0:
#         raise ValueError('class has no positive - impossible to define tpr')

#     if tn + fp == 0:
#         tpr = 1
#         tnr = 0
#     else:
#         tpr = tp/(tp+fn)    
#         tnr = tn/(tn+fp)
#     return tpr, tnr

In [14]:
# WITHOUT THRESHOLD

def compute_tpr(df, _class, _model, threshold=None):
    """Return the recall (true-positive rate) of `_model` for `_class`.

    A row counts as a predicted positive when its '<_model>_class' value equals
    `_class`; the truth is read from the boolean 'label_<_class>' column.
    `threshold` is accepted for signature compatibility and is not used.
    """
    is_positive = df['label_{}'.format(_class)]
    predicted_positive = df['{}_class'.format(_model)] == _class
    return metrics.recall_score(is_positive, predicted_positive)

def compute_tpr_tnr(df, _class, _model, threshold=None):
    """Return (TPR, TNR) of `_model` for `_class`, using the arg-max class as the prediction.

    Args:
        df: frame with a boolean 'label_<_class>' column and a '<_model>_class' column.
        _class: class id being evaluated one-vs-rest.
        _model: model key used as the prefix of the prediction column.
        threshold: accepted for signature compatibility; not used.

    Returns:
        Tuple (tpr, tnr). If `df` contains no negative example the TNR is undefined
        and is reported as 0, while the TPR is still computed from the positives.

    Raises:
        ValueError: if `df` contains no positive example for `_class`.
    """
    true_col = 'label_{}'.format(_class)
    pred_col = '{}_class'.format(_model)

    # Build each mask once and count with vectorised sums.
    is_positive = df[true_col] == True
    is_negative = df[true_col] == False
    predicted = df[pred_col] == _class

    tp = int((is_positive & predicted).sum())
    fn = int((is_positive & ~predicted).sum())
    fp = int((is_negative & predicted).sum())
    tn = int((is_negative & ~predicted).sum())

    if tp + fn == 0:
        raise ValueError('class has no positive - impossible to define tpr')

    # Positives exist at this point, so the TPR is always well defined. It used to be
    # forced to 1 whenever the subset had no negatives, whatever the predictions were.
    tpr = float(tp)/(tp+fn)
    if tn + fp == 0:
        # NOTE(review): 0 is kept as the placeholder for an undefined TNR so callers get
        # the same value as before; confirm whether NaN would be preferable.
        tnr = 0
    else:
        tnr = float(tn)/(tn+fp)
    return tpr, tnr

In [15]:
def compute_tpr_by_gender(df, _class, _model, threshold = 0.5):
    """Return {'M': tpr, 'F': tpr}: compute_tpr evaluated on each gender's rows of `df`.

    `threshold` is only forwarded to compute_tpr.
    """
    subsets = (
        ('M', df.query('gender == "M"')),
        ('F', df.query('gender == "F"')),
    )
    return {
        gender: compute_tpr(subset, _class, _model, threshold)
        for gender, subset in subsets
    }

def compute_tr_by_gender(df, _class, _model, threshold = 0.5):
    """Return TPR and TNR per gender, from compute_tpr_tnr on each gender's rows of `df`.

    The result has the keys 'TPR_m', 'TPR_f', 'TNR_m' and 'TNR_f'.
    `threshold` is only forwarded to compute_tpr_tnr.
    """
    male_rows = df.query('gender == "M"')
    male_tpr, male_tnr = compute_tpr_tnr(male_rows, _class, _model, threshold)
    female_rows = df.query('gender == "F"')
    female_tpr, female_tnr = compute_tpr_tnr(female_rows, _class, _model, threshold)
    return dict(TPR_m=male_tpr, TPR_f=female_tpr, TNR_m=male_tnr, TNR_f=female_tnr)

In [16]:
# For every label collect, per model, the overall TPR/TNR, the per-gender TPR/TNR and the
# female-minus-male gaps, one dict per label.
tpr_rows = []
for _class in frac_female_df.label:
    row = {}
    row['label'] = _class
    for _model, _model_type in MODEL_NAMES.items():
        tpr, tnr = compute_tpr_tnr(df, _class, _model)
        row['{}_tpr'.format(_model_type)] = tpr
        row['{}_tnr'.format(_model_type)] = tnr
        gender_trs = compute_tr_by_gender(df, _class, _model)
        row['{}_tpr_F'.format(_model_type)] = gender_trs['TPR_f']
        row['{}_tpr_M'.format(_model_type)] = gender_trs['TPR_m']
        row['{}_tpr_gender_gap'.format(_model_type)] = gender_trs['TPR_f'] - gender_trs['TPR_m']
        row['{}_tnr_F'.format(_model_type)] = gender_trs['TNR_f']
        row['{}_tnr_M'.format(_model_type)] = gender_trs['TNR_m']
        row['{}_tnr_gender_gap'.format(_model_type)] = gender_trs['TNR_f'] - gender_trs['TNR_m']
    tpr_rows.append(row)

# Build the frame once from the collected rows. DataFrame.append inside the loop copied the
# whole frame on every iteration and no longer exists in pandas 2.0.
tpr_df = pd.DataFrame(tpr_rows)

results_df = pd.merge(tpr_df, frac_female_df, on = 'label')

In [17]:
# Occupation titles; a title's position in this list is the integer class id stored in 'label'.
TITLE_LABELS = [
    'accountant', 'acupuncturist', 'architect', 'attorney', 'chiropractor', 'comedian', 'composer', 'dentist',
    'dietitian', 'dj', 'filmmaker', 'interior_designer', 'journalist', 'landscape_architect', 'magician',
    'massage_therapist', 'model', 'nurse', 'painter', 'paralegal', 'pastor', 'personal_trainer',
    'photographer', 'physician', 'poet', 'professor', 'psychologist', 'rapper',
    'real_estate_broker', 'software_engineer', 'surgeon', 'teacher', 'yoga_teacher']

# Attach the readable occupation title to every result row.
results_df['label_profession'] = results_df['label'].apply(lambda x: TITLE_LABELS[int(x)])

In [18]:
# Correlation (DataFrame.corr) between the fraction of women in an occupation and each
# model's TPR gender gap (female minus male).
results_df[['frac_female']+['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]].corr()[['frac_female']]


Unnamed: 0,frac_female
frac_female,1.0
classif_female_tpr_gender_gap,0.570389
classif_standard_tpr_gender_gap,0.789127
classif_male_tpr_gender_gap,0.459003


In [19]:
# Names of the per-model gender-gap columns (female minus male) for TPR and TNR.
tpr_gender_gap_cols = ['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]
tnr_gender_gap_cols = ['{}_tnr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]

# Per-occupation gap table, displayed with the occupations with the highest female fraction first.
gender_gap_df = results_df[['label_profession', 'frac_female']+tpr_gender_gap_cols+tnr_gender_gap_cols]
gender_gap_df.sort_values('frac_female', ascending = False)

Unnamed: 0,label_profession,frac_female,classif_female_tpr_gender_gap,classif_standard_tpr_gender_gap,classif_male_tpr_gender_gap,classif_female_tnr_gender_gap,classif_standard_tnr_gender_gap,classif_male_tnr_gender_gap
7,dietitian,0.920561,0.121529,0.204738,-0.005076,-0.002883,-0.003695,-0.000553
13,nurse,0.914625,0.042678,0.113284,-0.010729,-0.006581,-0.01251,-0.002941
15,paralegal,0.866109,0.272343,0.281099,0.014493,4.1e-05,-0.000274,0.0
27,yoga_teacher,0.859712,-0.021886,0.167042,0.114258,-0.001846,-0.001631,-0.00132
12,model,0.819149,0.414693,0.492488,0.088299,0.001592,-0.003342,3.5e-05
10,interior_designer,0.783654,0.121609,0.255624,0.089025,0.000502,-0.000951,0.000132
22,psychologist,0.620665,0.048827,0.01992,0.043324,-0.004386,-0.004418,-0.008851
26,teacher,0.604545,0.113316,0.143698,0.108068,-0.005864,-0.008712,-0.005908
11,journalist,0.492167,-0.017291,0.038528,-0.033189,0.009347,0.000906,0.003571
19,physician,0.491782,0.066381,0.011794,0.06241,0.006025,0.008402,-0.01682


In [20]:
# Copy the gender-gap table to the system clipboard.
# NOTE(review): this needs a clipboard on the machine running the kernel -- confirm before Run All.
gender_gap_df.to_clipboard()

# Look at confusion matrix

### Compute confusion matrix

In [21]:
def plot_confusion_matrix(_df, model_name):
    """Return (it does not plot) the confusion matrix of one model over class ids 0..32.

    `model_name` may be a key of MODEL_NAMES or one of its short aliases. True classes
    come from `_df['label']`, predictions from `_df['<model key>_class']`.
    """
    if model_name not in MODEL_NAMES:
        # A short alias was given: translate it back to the full model key.
        model_name = next(
            (full_name for full_name in MODEL_NAMES
             if MODEL_NAMES[full_name] == model_name),
            model_name)

    predicted = _df[model_name + '_class']
    return confusion_matrix(_df['label'], predicted, labels=range(33))

In [22]:
# Naming: cm_<model>_<subset>, e.g. cm_female_male is the classif_female model evaluated
# on the gender == "M" rows.

# Whole data set.
cm_standard_full = plot_confusion_matrix(df, 'classif_standard')
cm_male_full = plot_confusion_matrix(df, 'classif_male')
cm_female_full = plot_confusion_matrix(df, 'classif_female')

# Rows with gender == "M" only.
cm_standard_male = plot_confusion_matrix(df.query('gender == "M"'), 'classif_standard')
cm_male_male = plot_confusion_matrix(df.query('gender == "M"'), 'classif_male')
cm_female_male = plot_confusion_matrix(df.query('gender == "M"'), 'classif_female')

# Rows with gender == "F" only.
cm_standard_female = plot_confusion_matrix(df.query('gender == "F"'), 'classif_standard')
cm_male_female = plot_confusion_matrix(df.query('gender == "F"'), 'classif_male')
cm_female_female = plot_confusion_matrix(df.query('gender == "F"'), 'classif_female')

In [23]:
# Copy the classif_female confusion matrix for the gender == "F" rows to the clipboard.
pd.DataFrame(cm_female_female).to_clipboard()

In [24]:
# Copy the occupation titles (ordered by class id, like the matrix rows/columns) to the clipboard.
pd.DataFrame(TITLE_LABELS).to_clipboard()

In [67]:
# Row sums of the confusion matrix: number of gender == "F" rows per true label.
np.sum(cm_female_female, axis=1)

array([ 315,    0,  339, 1823,  127,   98,  125,  766,  591,   34,  336,
        163, 1508,    0,    0,    0,  924, 2539,  546,  207,   84,   96,
       1302, 3052,  515, 8009, 1736,   17,    0,  168,  326, 1463,  239])

In [70]:
# Female-minus-male difference of the row-normalised classif_female confusion matrices.
# Some true labels have no rows in a gender subset, so their row sum is 0 and the row is
# NaN; the divide/invalid warnings for those rows are silenced on purpose. keepdims keeps
# the row sums as a column vector without hard-coding the number of classes.
with np.errstate(divide='ignore', invalid='ignore'):
    pd.DataFrame(
        cm_female_female / cm_female_female.sum(axis=1, keepdims=True)
        - cm_female_male / cm_female_male.sum(axis=1, keepdims=True)
    ).to_clipboard()

  """Entry point for launching an IPython kernel.


In [71]:
# Female-minus-male difference of the row-normalised classif_male confusion matrices.
# Some true labels have no rows in a gender subset, so their row sum is 0 and the row is
# NaN; the divide/invalid warnings for those rows are silenced on purpose. keepdims keeps
# the row sums as a column vector without hard-coding the number of classes.
with np.errstate(divide='ignore', invalid='ignore'):
    pd.DataFrame(
        cm_male_female / cm_male_female.sum(axis=1, keepdims=True)
        - cm_male_male / cm_male_male.sum(axis=1, keepdims=True)
    ).to_clipboard()

  """Entry point for launching an IPython kernel.


In [73]:
# Female-minus-male difference of the row-normalised classif_standard confusion matrices.
# Some true labels have no rows in a gender subset, so their row sum is 0 and the row is
# NaN; the divide/invalid warnings for those rows are silenced on purpose. keepdims keeps
# the row sums as a column vector without hard-coding the number of classes.
with np.errstate(divide='ignore', invalid='ignore'):
    pd.DataFrame(
        cm_standard_female / cm_standard_female.sum(axis=1, keepdims=True)
        - cm_standard_male / cm_standard_male.sum(axis=1, keepdims=True)
    ).to_clipboard()

  """Entry point for launching an IPython kernel.


### Looking at some types of errors

In [25]:
# Peek at the first rows, including the derived 'label_<k>' indicator columns.
df.head()

Unnamed: 0,tokens,gender,label,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_0,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_1,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_2,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_3,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_4,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_5,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_6,...,label_23,label_24,label_25,label_26,label_27,label_28,label_29,label_30,label_31,label_32
0,"[u'ms.', u'grandal', u'practices', u'medicine'...",F,23,8.862035e-08,2.046254e-20,9.056166e-08,1.89159e-05,1.188728e-05,5.181058e-10,1.456986e-11,...,True,False,False,False,False,False,False,False,False,False
1,"[u'her', u'clinical', u'practice', u',', u'tea...",F,25,7.482672e-06,4.967479e-15,2.414768e-05,0.0001151357,0.0007246675,8.731493e-08,6.299098e-07,...,False,False,True,False,False,False,False,False,False,False
2,"[u'dr.', u'pankaj', u'savla', u'practices', u'...",M,7,2.18715e-07,1.469046e-16,5.442479e-07,4.124972e-08,1.640611e-06,2.815475e-10,9.288192e-10,...,False,False,False,False,False,False,False,False,False,False
3,"[u'watch', u'her', u'discuss', u'her', u'resea...",F,25,0.007115533,2.614586e-14,0.003238914,0.0009694429,5.048897e-06,6.694302e-07,1.716063e-05,...,False,False,True,False,False,False,False,False,False,False
4,"[u'she', u'graduated', u'with', u'honors', u'f...",F,17,4.922343e-08,7.847238e-31,5.297969e-06,1.292784e-08,5.28043e-08,2.674072e-13,7.294467e-12,...,False,False,False,False,False,False,False,False,False,False


In [26]:
# Echo the class-id -> occupation title list for reference.
TITLE_LABELS

['accountant',
 'acupuncturist',
 'architect',
 'attorney',
 'chiropractor',
 'comedian',
 'composer',
 'dentist',
 'dietitian',
 'dj',
 'filmmaker',
 'interior_designer',
 'journalist',
 'landscape_architect',
 'magician',
 'massage_therapist',
 'model',
 'nurse',
 'painter',
 'paralegal',
 'pastor',
 'personal_trainer',
 'photographer',
 'physician',
 'poet',
 'professor',
 'psychologist',
 'rapper',
 'real_estate_broker',
 'software_engineer',
 'surgeon',
 'teacher',
 'yoga_teacher']

In [27]:
# Echo the model key -> alias mapping for reference.
MODEL_NAMES 

{'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117': 'classif_standard',
 'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157': 'classif_male',
 'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223214': 'classif_female'}

In [28]:
def plot_some_examples(_df, model_name, true_occupation, predicted_occupation, verbose=True):
    """Show the rows of `_df` that have a given true label and a given predicted label.

    Args:
        _df: frame with 'label', 'tokens' and '<model key>_class' columns.
        model_name: a key of MODEL_NAMES or one of its short aliases.
        true_occupation: class id (int) or occupation title from TITLE_LABELS.
        predicted_occupation: class id (int) or occupation title from TITLE_LABELS.
        verbose: if True, print every selected bio (the 'tokens' string is parsed with
            literal_eval and joined with spaces) followed by a blank line, and return
            None. If False, return the selected rows instead.

    Returns:
        None when `verbose` is True, otherwise the selected sub-frame of `_df`.
    """
    if model_name not in MODEL_NAMES: #then invert
        for key in MODEL_NAMES:
            if MODEL_NAMES[key] == model_name:
                model_name = key

    def _to_class_id(occupation):
        # Known titles become their position in TITLE_LABELS; ints and values that are
        # not a known title pass through unchanged, as before.
        if not isinstance(occupation, int) and occupation in TITLE_LABELS:
            return TITLE_LABELS.index(occupation)
        return occupation

    true_occupation = _to_class_id(true_occupation)
    predicted_occupation = _to_class_id(predicted_occupation)

    pred_col = '{}_class'.format(model_name)
    # Filter on the frame that was passed in. The label mask used to be built from the
    # notebook-global df, which was only right when the indexes happened to align.
    selected_df = _df.loc[(_df['label'] == true_occupation) & (_df[pred_col] == predicted_occupation)]
    if verbose:
        for row in selected_df['tokens']:
            print(' '.join(literal_eval(row)))
            print()
    else:
        return selected_df

In [29]:
# gender == "M" rows whose true label is dietitian and that classif_female predicted as professor.
plot_some_examples(
    df.query('gender == "M"'),
    'classif_female',
    'dietitian',
    'professor')

after his phd , he was awarded a post-doctoral fellowship by the ingestive behavior research center at purdue university ( indiana , usa ) , where he worked with distinguished professor richard mattes for 3 years . dr. tan 's research includes : 1 ) post-ingestive physiological responses ( e.g . insulin release , energy expenditure and substrate oxidation ) , 2 ) human feeding behaviours ( e.g . objective and subjective assessment of appetite , sensory influences on food selection , and factors that determine food intake ) , 3 ) weight loss interventions with special interest in visceral fat mass changes , and 4 ) accelerometry in predicting physical activity .

at the university of illinois , he holds emeritus appointments in animal sciences , nutritional sciences , and internal medicine . he has been a member of the illinois faculty since 1967 , having received his ph.d. degree there in 1965 .

his research has focussed on protein , energy and phosphorus nutrition of grazing cattle a

In [30]:
# gender == "F" rows whose true label is dietitian and that classif_female predicted as professor.
plot_some_examples(
    df.query('gender == "F"'),
    'classif_female',
    'dietitian',
    'professor')

she continues to conduct research on problems that address both productivity and environmental issues such as healthy feed and low pollution diets . she received her ph.d. in fisheries from auburn university in 1978 . she will be studying the effects of pond age on bottom soil quality with crsp researcher claude boyd under this work plan .

she has been teaching principles of nutrition , nutrition for the lifecycle , and medical nutrition therapy for the past 12 years . she has published scientific articles on glycemic index , essential fatty acids for brain development , and clinical nutrition . in 2005 , the nutritionists-dietitians ’ association of the philippines awarded her as the most promising nutritionist-dietitian . she is happily married for the 9 years and a mother to their son , gabby .

her research focused on the effect of low energy dense diets and high-intensity physical activity on changes in body weight , waist circumference , and chronic disease risk factors in abdom

In [31]:
# gender == "M" rows whose true label is dietitian and that classif_female predicted correctly.
plot_some_examples(
    df.query('gender == "M"'),
    'classif_female',
    'dietitian',
    'dietitian')

he owns a doctorate degree in nutrition from mcgill university in canada . in addition , he completed a 7-year postdoctoral training at harvard medical school in massachusetts where he studied the impact of fat as it relates to heart disease .

his objective is to encourage and advocate and evidence-based evolutionary approach to health and wellness . he is a well known media personality through his nutrition writing for askmen.com the world 's largest online men 's magazine , video host of thrive and regular contributor to articles and videos in various media other channels . patrick has written many academic publications and contributions to books .

he ’ s also the founder of science-based website diet vs. disease . what he ’ s best known for , however , are his online courses . joe creates written and video content for those living with diet-related health conditions and diseases , and his online courses have gained a strong following . they serve to break down dense information – 

In [32]:
# gender == "F" rows whose true label is dietitian and that classif_female predicted correctly.
plot_some_examples(
    df.query('gender == "F"'),
    'classif_female',
    'dietitian',
    'dietitian')

she graduated with honors in 1994 . having more than 22 years of diverse experiences , especially in registered dietitian or nutrition professional , karen s basedow affiliates with newark beth israel medical center , and cooperates with other doctors and specialists in medical group newark beth israel medical center inc . call karen s basedow on phone number ( 973 ) 926-3312 for more information and advises or to book an appointment .

prior to this position she spent eight years as a clinical dietitian at uhs wilson hospital and two years as a nutrition care manager in long term care .

ms. sajida afreen practices at truweight wellness pvt ltd in himayat nagar , hyderabad . she completed msc - dietitics / nutrition from koti women 's college in 2013 .

she earned a bachelor of science degree in nutrition and dietetics from the university of southern mississippi . fortenberry believes that lifestyle changes and wholesome nutrition are obtainable , and she brings real-life understandin

# Compute biasA by analyzing how different the predictions are per class

In [33]:
# Peek at the first rows again before adding the true-label score columns.
df.head()

Unnamed: 0,tokens,gender,label,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_0,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_1,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_2,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_3,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_4,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_5,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157_6,...,label_23,label_24,label_25,label_26,label_27,label_28,label_29,label_30,label_31,label_32
0,"[u'ms.', u'grandal', u'practices', u'medicine'...",F,23,8.862035e-08,2.046254e-20,9.056166e-08,1.89159e-05,1.188728e-05,5.181058e-10,1.456986e-11,...,True,False,False,False,False,False,False,False,False,False
1,"[u'her', u'clinical', u'practice', u',', u'tea...",F,25,7.482672e-06,4.967479e-15,2.414768e-05,0.0001151357,0.0007246675,8.731493e-08,6.299098e-07,...,False,False,True,False,False,False,False,False,False,False
2,"[u'dr.', u'pankaj', u'savla', u'practices', u'...",M,7,2.18715e-07,1.469046e-16,5.442479e-07,4.124972e-08,1.640611e-06,2.815475e-10,9.288192e-10,...,False,False,False,False,False,False,False,False,False,False
3,"[u'watch', u'her', u'discuss', u'her', u'resea...",F,25,0.007115533,2.614586e-14,0.003238914,0.0009694429,5.048897e-06,6.694302e-07,1.716063e-05,...,False,False,True,False,False,False,False,False,False,False
4,"[u'she', u'graduated', u'with', u'honors', u'f...",F,17,4.922343e-08,7.847238e-31,5.297969e-06,1.292784e-08,5.28043e-08,2.674072e-13,7.294467e-12,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Echo the model key -> alias mapping; the aliases prefix the '<alias>_proba_truth' columns added next.
MODEL_NAMES

{'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117': 'classif_standard',
 'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223157': 'classif_male',
 'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190413_223214': 'classif_female'}

In [35]:
def _get_proba_truth(row, _model):
    label = row['label']
    model_pred = row['{}_{}'.format(_model, label)]
    return model_pred

# For every model add '<alias>_proba_truth': the score the model gave to each row's true label.
# This is a vectorised lookup instead of iterating over every row once per model.

# Position of each row's true label inside CLASS_NAMES (-1 would mean an unknown label).
_label_pos = pd.Index(CLASS_NAMES).get_indexer(df['label'])
assert (_label_pos >= 0).all(), 'label outside CLASS_NAMES'
_row_pos = np.arange(len(df))

for _model in MODEL_NAMES:
    _score_cols = ['{}_{}'.format(_model, _class_id) for _class_id in CLASS_NAMES]
    # Fancy indexing picks, for row i, the score column that belongs to row i's own label.
    df[MODEL_NAMES[_model] + '_proba_truth'] = df[_score_cols].values[_row_pos, _label_pos]

In [36]:
# Positive values: classif_male gave the true label a higher score than classif_female did.
df['male-female_proba_truth'] = (
    df['classif_male_proba_truth'] - df['classif_female_proba_truth'])


In [37]:
# Mean (classif_male minus classif_female) true-label score per label, with the occupation title attached.
difference_per_class = df.groupby('label')['male-female_proba_truth'].mean().reset_index()
difference_per_class['occupation'] = difference_per_class['label'].apply(
    lambda label_id: TITLE_LABELS[label_id])

In [38]:
# Copy the per-label score differences to the system clipboard.
difference_per_class.to_clipboard()

In [39]:
# True-label score from each of the three models for the first 100 rows labelled dietitian.
df.loc[df['label'] == TITLE_LABELS.index('dietitian')][['gender', 'label', 'classif_standard_proba_truth', 'classif_male_proba_truth', 'classif_female_proba_truth']].head(100)

Unnamed: 0,gender,label,classif_standard_proba_truth,classif_male_proba_truth,classif_female_proba_truth
165,F,8,0.379616,0.099653,0.199069
199,F,8,0.999961,0.289079,0.997856
205,M,8,0.608499,0.007926,0.749000
613,F,8,0.997947,0.228356,0.984116
720,F,8,0.998711,0.338512,0.957458
805,F,8,0.994421,0.676997,0.998105
884,F,8,0.957452,0.009827,0.926086
1037,F,8,0.992018,0.075462,0.988912
1047,F,8,0.999753,0.178680,0.995930
1198,F,8,0.982557,0.237771,0.990564


In [51]:
# Number of rows per label, with the occupation title attached.
x = df.groupby('label').size().reset_index()
x['occupation'] = x['label'].apply(lambda label_id: TITLE_LABELS[label_id])

In [52]:
# Copy the per-label row counts to the system clipboard.
x.to_clipboard()