In [263]:
import csv

# Data Intake

In [264]:
# Load every survey response into memory as a dict keyed by column name
# (csv.DictReader takes the column names from the first line of the file).
#
# newline='' hands newline handling to the csv module itself; without it,
# line breaks embedded inside quoted fields are not interpreted correctly.
with open('data.csv', mode = 'r', newline = '') as f:
    reader = csv.DictReader(f)
    lines = list(reader)

In [265]:
# Set the first two parsed rows aside as `headers`; everything after them is
# treated as response data.
# NOTE(review): presumably these two rows are extra header/metadata rows from
# the survey export -- confirm against data.csv.
headers, data = lines[:2], lines[2:]

In [266]:
# Split the responses into two groups according to the value in the
# 'FL_6_DO' column.
# NOTE(review): presumably this column records which question flow the
# respondent was shown -- confirm against the survey definition.
likelihood_data = list(filter(lambda row: row['FL_6_DO'] == 'Behavior|Intent', data))
posterior_data = list(filter(lambda row: row['FL_6_DO'] == 'Intent|Behavior', data))

# Data Cleaning

In [267]:
# Scenario labels in the order the raw likelihood columns number them
# (suffix _1 is 'Friday', ..., suffix _10 is 'Party').
_SCENARIOS = [
    'Friday', 'Museum', 'Sick', 'WYD', 'Bored',
    'Missed Show', 'Oolong Tea', 'Lunch', 'Common Room', 'Party',
]

# Raw column prefix for each intent. Note that the relationship columns have
# no spaces around the bar, unlike the friendship and hookup columns.
_LIKELIHOOD_PREFIXES = {
    'Relationship': 'P(Text|Relationship)',
    'Friendship': 'P(Text | Friendship)',
    'Hookup': 'P(Text | Hookup)',
}

# Intent labels in the order the raw posterior columns number them
# (suffix _1 is 'Friendship', _2 is 'Relationship', _3 is 'Hookup').
_INTENTS = ['Friendship', 'Relationship', 'Hookup']

# Raw likelihood column name -> readable name 'P(<scenario>|<intent>)'.
# Insertion order: all relationship keys, then friendship, then hookup.
likelihood_key_rewrite = {
    f'{prefix}_{number}': f'P({scenario}|{intent})'
    for intent, prefix in _LIKELIHOOD_PREFIXES.items()
    for number, scenario in enumerate(_SCENARIOS, start=1)
}

# Raw posterior column name -> readable name 'P(<intent>|<scenario>)'.
# Insertion order: scenario by scenario, three intents each.
posterior_key_rewrite = {
    f'Intent | {scenario}_{number}': f'P({intent}|{scenario})'
    for scenario in _SCENARIOS
    for number, intent in enumerate(_INTENTS, start=1)
}

# Demographic columns whose raw value is a comma-separated list of integers.
multi_answer_demographic_keys = ['Race', 'Major']

# Demographic columns whose raw value is a single integer.
single_answer_demographic_keys = ['Gender', 'Sexuality', 'Age']

In [268]:
def intify(x):
    """Parse a raw CSV cell as an int, treating an empty string as 0."""
    if x == '':
        return 0
    return int(x)


# Posterior responses whose 'StartDate' sorts before this timestamp are
# flagged 'Has Bug' by clean_data.
BUG_FIX_TIME = '2022-12-15 17:00:00'

# Posterior fields that clean_data resets to 0 on responses flagged 'Has Bug':
# all three intents for the Friday, Bored and Missed Show scenarios.
BUG_FIELDS = [
    f'P({intent}|{scenario})'
    for scenario in ('Friday', 'Bored', 'Missed Show')
    for intent in ('Friendship', 'Relationship', 'Hookup')
]

def clean_data(data, is_likelihood):
    """Convert one raw survey row into a cleaned record.

    Parameters
    ----------
    data : dict
        One raw row, keyed by the original CSV column names.
    is_likelihood : bool
        True to pick the columns listed in ``likelihood_key_rewrite``;
        False to pick those in ``posterior_key_rewrite``.

    Returns
    -------
    dict
        * every selected rating column under its rewritten name, as an int
          (a blank cell becomes 0);
        * each multi-answer demographic column as a list of ints (a blank
          cell becomes an empty list);
        * each single-answer demographic column as an int (a blank cell
          becomes ``None``);
        * for posterior rows only, a boolean ``'Has Bug'`` entry, with every
          field in ``BUG_FIELDS`` reset to 0 when it is True.

    Raises
    ------
    KeyError
        If ``data`` lacks one of the expected columns.
    ValueError
        If a non-blank cell is not a valid integer.
    """
    main_keys = likelihood_key_rewrite if is_likelihood else posterior_key_rewrite
    cleaned = {new_key: intify(data[key]) for key, new_key in main_keys.items()}

    for k in multi_answer_demographic_keys:
        # Blank pieces are skipped so an unanswered question yields [] rather
        # than crashing on int('').
        cleaned[k] = [int(code) for code in data[k].split(',') if code.strip()]

    for k in single_answer_demographic_keys:
        answer = data[k].strip()
        # None (not 0) marks "no answer" so it cannot be mistaken for a code.
        cleaned[k] = int(answer) if answer else None

    if not is_likelihood:
        # Plain string comparison of timestamps.
        # NOTE(review): assumes 'StartDate' uses the same zero-padded
        # 'YYYY-MM-DD HH:MM:SS' layout as BUG_FIX_TIME -- confirm.
        cleaned['Has Bug'] = data['StartDate'] < BUG_FIX_TIME
        if cleaned['Has Bug']:
            for k in BUG_FIELDS:
                cleaned[k] = 0
    return cleaned

In [269]:
# Run every raw row of each group through clean_data, selecting the
# likelihood or posterior column set as appropriate.
cleaned_likelihood_data = [
    clean_data(raw_row, is_likelihood = True)
    for raw_row in likelihood_data
]
cleaned_posterior_data = [
    clean_data(raw_row, is_likelihood = False)
    for raw_row in posterior_data
]

In [270]:
# The rewritten (readable) column names, in the order the rewrite tables
# define them.
cleaned_likelihood_keys = [*likelihood_key_rewrite.values()]
cleaned_posterior_keys = [*posterior_key_rewrite.values()]

# Probability Distribution Calculations

In [271]:
def prob_dist(data, keys):
    """Rescale the entries of ``data`` named by ``keys`` so they sum to 1.

    ``data`` is modified in place and the same object is returned. When the
    selected entries sum to zero, each of them is set to 0.0 instead of
    dividing by zero. Entries not listed in ``keys`` are left untouched.
    """
    total = sum(map(data.__getitem__, keys))
    if total == 0:
        for key in keys:
            data[key] = 0.0
    else:
        for key in keys:
            data[key] /= total
    return data

In [272]:
# Likelihood keys grouped by the intent they condition on, i.e. by the text
# after the bar in 'P(<scenario>|<intent>)'.
relationship_keys, friendship_keys, hookup_keys = (
    [key for key in cleaned_likelihood_keys if key.endswith(suffix)]
    for suffix in ('Relationship)', 'Friendship)', 'Hookup)')
)

# Posterior keys grouped by the scenario they condition on, i.e. by the text
# after the bar in 'P(<intent>|<scenario>)'.
(bored_keys, common_room_keys, friday_keys, lunch_keys, missed_show_keys,
 museum_keys, oolong_keys, party_keys, sick_keys, wyd_keys) = (
    [key for key in cleaned_posterior_keys if key.endswith(suffix)]
    for suffix in ('Bored)', 'Common Room)', 'Friday)', 'Lunch)', 'Missed Show)',
                   'Museum)', 'Oolong Tea)', 'Party)', 'Sick)', 'WYD)')
)

# Each inner list is one group of keys that gets normalised together.
likelihood_keys_set = [relationship_keys, friendship_keys, hookup_keys]
posterior_keys_set = [
    bored_keys, common_room_keys, friday_keys, lunch_keys, missed_show_keys,
    museum_keys, oolong_keys, party_keys, sick_keys, wyd_keys,
]


In [273]:
# Normalise each key group of every cleaned row into a probability
# distribution.
#
# Each row is shallow-copied first because prob_dist writes into the dict it
# is given; without the copy the rows of cleaned_likelihood_data and
# cleaned_posterior_data would be overwritten with the normalised values.
# The loop variables are deliberately not called `data`, so the module-level
# `data` list built during intake is not overwritten when this cell runs.
likelihood_distro_data = []
posterior_distro_data = []

for cleaned_row in cleaned_likelihood_data:
    distro_row = dict(cleaned_row)
    for keys in likelihood_keys_set:
        distro_row = prob_dist(distro_row, keys)
    likelihood_distro_data.append(distro_row)

for cleaned_row in cleaned_posterior_data:
    distro_row = dict(cleaned_row)
    for keys in posterior_keys_set:
        distro_row = prob_dist(distro_row, keys)
    posterior_distro_data.append(distro_row)

# Saving Processed Data

In [274]:
# Write each processed data set to its own CSV file. The column order is
# taken from the keys of the first row of that data set.
for out_path, out_rows in (
    ('likelihood_data.csv', likelihood_distro_data),
    ('posterior_data.csv', posterior_distro_data),
):
    with open(out_path, mode = 'w', newline = '') as f:
        writer = csv.DictWriter(f, fieldnames = out_rows[0].keys())
        writer.writeheader()
        writer.writerows(out_rows)