# Cleaning data for public release

In [1]:
import os
from collections import Counter
import pandas as pd

In [None]:
# Optional: when running on Google Colab, uncomment the two lines below to
# mount Google Drive before reading the data.
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory that holds the raw survey export and receives the public file.
# On Google Colab, use the Drive path instead:
# data_dir = '/content/drive/MyDrive/char_gender/data/'
data_dir = '../data/'

# Create the directory if it is missing. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

In [4]:
# The raw export is not released because it contains annotators' Prolific IDs.
raw_path = os.path.join(data_dir, 'char_gender_final.csv')

# Short names for the columns that the rest of the notebook refers to.
column_renames = {
    'Recorded Date': 'date',
    'First Story': 'first_story',
    'First Gender': 'first_gender',
    'Your gender - Selected Choice': 'gender',
    'Suppose you had both novels in your hand. Which one would you continue reading?': 'chosen_story',
}

# header=1 takes the column names from the second line of the file, and
# .iloc[1:] discards the first row that follows that header.
# NOTE(review): presumably these are the extra header rows of the survey
# platform's export - confirm against the raw file.
df = pd.read_csv(raw_path, header=1).iloc[1:].rename(columns=column_renames)
print(len(df))

3508


In [5]:
# Correct answers to the four reading-comprehension questions, for each of the
# two possible assignments of the stories to "Novel A" / "Novel B".
answers_hike_is_novel_a = {
    'In Novel A, how long is the trail that Sam takes?': '6 miles',
    'In Novel A, what plant does Sam rest against?': 'Juniper tree',
    'In Novel B, when does Alex go to the cafe?': 'Morning',
    'In Novel B, what is the material of the chair Alex sketches?': 'Wood',
}
answers_hike_is_novel_b = {
    'In Novel B, how long is the trail that Sam takes?': '6 miles',
    'In Novel B, what plant does Sam rest against?': 'Juniper tree',
    'In Novel A, when does Alex go to the cafe?': 'Morning',
    'In Novel A, what is the material of the chair Alex sketches?': 'Wood',
}

# Response IDs of the rows that survive all three filters below.
rids_awesome = []

for _, response in df.iterrows():
    # A respondent passes the check when all four answers match for the
    # story order they were shown (unanswered questions never match).
    cond_hike = all(response[question] == answer
                    for question, answer in answers_hike_is_novel_a.items())
    cond_coffee = all(response[question] == answer
                      for question, answer in answers_hike_is_novel_b.items())

    # Remove data from the pilot: keep only May 2024 responses whose day of
    # month is greater than 16.
    # NOTE(review): assumes the date string starts with 'YYYY-MM-DD' - confirm.
    recorded = response['date']
    recent = '2024-05' in recorded and int(recorded.split('-')[2][:2]) > 16

    # Only keep complete surveys (both the story choice and gender answered).
    complete = pd.notna(response['chosen_story']) and pd.notna(response['gender'])

    # Only keep participants who PASSED the reading-comprehension check.
    correct = cond_hike or cond_coffee

    if recent and complete and correct:
        rids_awesome.append(response['Response ID'])

print(len(rids_awesome))

3002


In [6]:
# Keep only the first 3000 qualifying responses, in file order.
# NOTE(review): 3000 is presumably the planned sample size - confirm.
kept_response_ids = rids_awesome[:3000]
awesome = df.set_index('Response ID').loc[kept_response_ids]

In [7]:
# Keep only respondents whose selected gender is exactly 'Woman' or 'Man'.
# `enby` counts every other respondent in `awesome`, whatever value they gave
# (the original note describes these as participants who identified as
# non-binary).
is_woman_or_man = awesome['gender'].isin(['Woman', 'Man'])
rids_clean = awesome.index[is_woman_or_man].tolist()
enby = len(awesome) - len(rids_clean)
enby

17

In [8]:
# Select the retained respondents from the full table, indexed by Response ID.
responses_by_id = df.set_index('Response ID')
clean = responses_by_id.loc[rids_clean]
len(clean)

2983

In [9]:
# (first_story, chosen_story) pairs in which the chosen novel is the hike story:
# the hike story is Novel A when shown first, and Novel B otherwise.
hike_choice_pairs = {('Hike', 'Novel A'), ('Coffee', 'Novel B')}

# (first_gender, first_story) pairs in which the hike story has the woman
# protagonist: either the first story is the hike with a woman, or the first
# story is the coffee one with a man.
woman_hike_pairs = {('W', 'Hike'), ('M', 'Coffee')}

# 1 when the participant chose the hike story, 0 otherwise.
chose_hike = [
    1 if pair in hike_choice_pairs else 0
    for pair in zip(clean['first_story'], clean['chosen_story'])
]

# 'Hike' when the hike story has the woman protagonist, 'Other' otherwise.
treatmentA = [
    'Hike' if pair in woman_hike_pairs else 'Other'
    for pair in zip(clean['first_gender'], clean['first_story'])
]

# Assignment order fixes the column order in the exported file.
clean['Treatment A'] = treatmentA
clean['chose_hike'] = chose_hike

In [10]:
# Binary indicator: 1 when the respondent selected 'Woman', 0 otherwise.
clean['respondent_woman'] = clean['gender'].eq('Woman').astype('int64')

In [11]:
# Gender balance of the cleaned sample: W counts 'Woman' responses and M counts
# every remaining row.
W = int(clean['gender'].eq('Woman').sum())
M = len(clean) - W
print(W, M)

1492 1491


In [12]:
# Columns withheld from the public file: the two Prolific ID columns and a
# column named "boh" that was created by mistake.
withheld_columns = [
    'PROLIFIC_PID',
    'What is your Prolific ID?\n\nPlease note that this response should auto-fill with the correct ID.',
    'boh',
]

# The first 16 columns are skipped by position.
# NOTE(review): presumably these are survey metadata columns - confirm that the
# offset still matches the export before re-running.
released_columns = clean.columns[16:]
public = clean[released_columns].drop(columns=withheld_columns)

In [13]:
# Preview the first rows of the table that will be released.
public.head()

Unnamed: 0_level_0,"If you agree to these conditions, please click ""I consent to participate"" below. If you do not agree, click the “I do not consent to participate” option.\n\nBy agreeing to participate, you confirm that you are over 18 years of age.\n\n \n\nI have read the above information.","In Novel A, how long is the trail that Sam takes?","In Novel A, what plant does Sam rest against?","In Novel B, when does Alex go to the cafe?","In Novel B, what is the material of the chair Alex sketches?",chosen_story,"Your opinion is fundamental to understand reader preferences. Please take your time to answer this question.\n\nIn your own words, briefly share your motivation for your choice (minimum 200 characters, or about 40 words).",Your age:,gender,Your gender - Prefer to self describe: - Text,...,How would you describe your political views?,"In Novel A, when does Alex go to the cafe?","In Novel A, what is the material of the chair Alex sketches?","In Novel B, how long is the trail that Sam takes?","In Novel B, what plant does Sam rest against?",first_gender,first_story,Treatment A,chose_hike,respondent_woman
Response ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R_7rxJeX0686uzPHi,I consent to participate.,6 miles,Juniper tree,Morning,Wood,Novel A,Novel A has more information about a potential...,25-35,Man,,...,Moderate,,,,,W,Hike,Hike,1,0
R_1Mng04p43Zl3xvb,I consent to participate.,,,,,Novel B,Novel B drew me in with the suspense of exactl...,36-45,Man,,...,Moderate,Morning,Wood,6 miles,Juniper tree,M,Coffee,Hike,1,0
R_1IHhRxaGPtHjhQ2,I consent to participate.,6 miles,Juniper tree,Morning,Wood,Novel A,I prefer to read novel A because it was easier...,25-35,Woman,,...,Very liberal,,,,,W,Hike,Hike,1,1
R_1oFSEE8V2SpwrhD,I consent to participate.,6 miles,Juniper tree,Morning,Wood,Novel B,I'm also interested in art and enjoy a main ch...,18-24,Woman,,...,Moderate,,,,,W,Hike,Hike,0,1
R_1jVUjK5fbarEH9T,I consent to participate.,6 miles,Juniper tree,Morning,Wood,Novel A,My curiosity about the contents of the note/le...,36-45,Woman,,...,Liberal,,,,,W,Hike,Hike,1,1


In [None]:
# Write the cleaned table next to the raw data; the Response ID index is
# written as the first column.
public_path = os.path.join(data_dir, 'public_data.csv')
public.to_csv(public_path)