In [None]:
import tensorflow as tf
import os

In [None]:
# Shard 1 of 4 of the artificially biased Civil Comments training TFRecords.
# GCS object URIs always use '/' as the separator, so the parts are joined
# with '/' explicitly; os.path.join would insert '\\' on Windows and yield an
# invalid gs:// path.
input_path = '/'.join([
    'gs://kaggle-model-experiments/resources/civil_comments_data/artificial_bias/fprost/20181012170507',
    'train_artificial_bias-00001-of-00004.tfrecord',
])

In [None]:
def get_civil_comments_spec():
  """Builds the feature spec used to parse one Civil Comments tf.Example.

  Every feature is a scalar `tf.FixedLenFeature`. The text, id, toxicity
  sub-type and annotator-count features are declared without a default
  value; each identity feature is a float32 with a default of -1.

  Returns:
    A dict mapping feature name to its `tf.FixedLenFeature`.
  """
  def _scalar(dtype, **extra):
    # Shape [] means a single value per example.
    return tf.FixedLenFeature([], dtype=dtype, **extra)

  string_keys = ('comment_text', 'id')
  score_keys = (
      'toxicity',
      'severe_toxicity',
      'obscene',
      'sexual_explicit',
      'identity_attack',
      'insult',
      'threat',
  )
  annotator_count_keys = (
      'toxicity_annotator_count',
      'identity_annotator_count',
  )
  identity_keys = (
      # Gender.
      'male', 'female', 'transgender', 'other_gender',
      # Sexual orientation.
      'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
      'other_sexual_orientation',
      # Religion.
      'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist',
      'other_religion',
      # Race / ethnicity.
      'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
      # Disability.
      'physical_disability', 'intellectual_or_learning_disability',
      'psychiatric_or_mental_illness', 'other_disability',
  )

  feature_spec = {}
  feature_spec.update((key, _scalar(tf.string)) for key in string_keys)
  feature_spec.update((key, _scalar(tf.float32)) for key in score_keys)
  feature_spec.update((key, _scalar(tf.int64)) for key in annotator_count_keys)
  # Identity features fall back to -1. when the example does not carry them.
  feature_spec.update(
      (key, _scalar(tf.float32, default_value=-1.)) for key in identity_keys)
  return feature_spec

In [None]:
# A float label at or above its threshold counts as "identity present" /
# "toxic" respectively.
THRESHOLD_IDENTITY = 0.5
THRESHOLD_TOXICITY = 0.5

# Upper bound on how many records are read from the shard.
MAX_EXAMPLES = 50000

# Number of records actually read. Equals MAX_EXAMPLES unless the shard is
# exhausted first, so downstream reporting stays accurate either way.
count_total = 0
count_identity = 0
count_male_toxic = 0
count_male_non_toxic = 0
count_female_toxic = 0
count_female_non_toxic = 0
count_female_and_male_toxic = 0
count_female_and_male_non_toxic = 0
# NOTE(review): the heterosexual counters are initialised here but never
# updated by the loop below; kept in case another cell reads them.
count_heterosexual_toxic = 0
count_heterosexual_non_toxic = 0


# Build the pipeline in a fresh graph so that re-running this cell does not
# stack additional readers / queue runners onto the default graph.
with tf.Graph().as_default(), tf.Session() as sess:

    filename_queue = tf.train.string_input_producer([input_path], num_epochs=1)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    parsed_element = tf.parse_single_example(
        serialized=serialized_example,
        features=get_civil_comments_spec(),
    )

    # The local initializer is required because num_epochs is tracked in a
    # local variable created by string_input_producer.
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for _ in range(MAX_EXAMPLES):
            try:
                el = sess.run(parsed_element)
            except tf.errors.OutOfRangeError:
                # The single epoch is exhausted: fewer than MAX_EXAMPLES records.
                break
            count_total += 1

            # The spec gives 'male' a default of -1 when the feature is
            # absent, so a non-negative value is used as the marker for
            # "this example carries identity labels".
            if el['male'] < 0:
                continue
            count_identity += 1

            is_toxic = el['toxicity'] >= THRESHOLD_TOXICITY
            is_male = el['male'] >= THRESHOLD_IDENTITY
            is_female = el['female'] >= THRESHOLD_IDENTITY

            if is_male:
                if is_toxic:
                    count_male_toxic += 1
                else:
                    count_male_non_toxic += 1
            if is_female:
                if is_toxic:
                    count_female_toxic += 1
                else:
                    count_female_non_toxic += 1
            # Examples tagged with both identities are also counted in each
            # of the single-identity buckets above.
            if is_male and is_female:
                if is_toxic:
                    count_female_and_male_toxic += 1
                else:
                    count_female_and_male_non_toxic += 1
    finally:
        # Always stop and join the queue-runner threads so they do not
        # outlive the session.
        coord.request_stop()
        coord.join(threads)

In [None]:
def _safe_ratio(numerator, denominator):
    """Returns numerator / denominator as a float, or NaN if denominator is 0."""
    if not denominator:
        return float('nan')
    return float(numerator) / denominator


# Per-group totals (toxic + non-toxic) reused by the report lines below.
count_male = count_male_toxic + count_male_non_toxic
count_female = count_female_toxic + count_female_non_toxic
count_female_and_male = count_female_and_male_toxic + count_female_and_male_non_toxic

# print('') is used for blank lines: a bare `print` is a no-op expression on
# Python 3, and print() would emit "()" on Python 2.
print ('Number of examples seen: {}'.format(count_total))
print ('Number of examples labeled for identity: {}'.format(count_identity))
print ('')
print ('Number of examples that were male: {}'.format(count_male))
print ('Ratio of examples that were male: {}'.format(_safe_ratio(count_male, count_identity)))
print ('Ratio of toxic male example: {}'.format(_safe_ratio(count_male_toxic, count_male)))
print ('')
print ('Number of examples that were female: {}'.format(count_female))
print ('Ratio of examples that were female: {}'.format(_safe_ratio(count_female, count_identity)))
print ('Ratio of toxic female example: {}'.format(_safe_ratio(count_female_toxic, count_female)))
print ('')
print ('Number of examples that were female and male: {}'.format(count_female_and_male))
print ('Ratio of toxic female and male example: {}'.format(_safe_ratio(count_female_and_male_toxic, count_female_and_male)))
print ('')