# Data Collection and Aggregation

To run the script:
```
pipenv run python manage.py shell_plus --notebook
```

## TODO

 - Get question sentences

## setup

In [3]:
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'rest.settings')
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

In [4]:
import json
import sys
import re
import datetime
import csv
from tqdm.notebook import tqdm
from collections import defaultdict
from functools import lru_cache

DROPBOX_FOLER = f"{os.path.expanduser('~')}/Dropbox"
DATA_FOLDER = f"{DROPBOX_FOLER}/_Law/Documents_and_Adminitrata/_DATA/privacy_coding"

ModuleNotFoundError: No module named 'tqdm'

In [5]:
def ts_to_date(ts):
    return str(datetime.datetime.fromtimestamp(ts/1000).date())

# Basic Idea

Link all the objects together into macro objects

```
  Policy -> list<PolicyInstance>
  Policy -> list<Question>
      Question -> list<Answer>
      Answer -> CodingInstance, Coder, ect...
```

## Connectors

`connect` maps parents to children, and visa versa:
```
    Parent -> list<Child>
    Child -> Parent
```

There's weird magic for timing session because it only has a coding instance id, so I need to follow up to grandparents.

In [4]:
def load_json_folder(path):
    to_ret = {}
    to_skip = [f"api_{fn}.json" for fn in "coder project kvstore assignmenttype assignment".split()]
    for fname in os.listdir(path):
        if not fname.endswith(".json") or (fname in to_skip):
            continue
        with open(f"{path}/{fname}") as f:
            to_ret[fname[len("api_"):-len(".json")]] = json.load(f)
    return to_ret

def to_dicts(dataset, emails_to_remove):
    to_ret = {}
    for table, raw_data in dataset.items():
        to_ret[table] = {
            e['id']: expand_object_json(e)
            for e in raw_data 
            if (
                e.get("coder_email", "") not in emails_to_remove
                and e.get("project", 1) == 1
            )
        }
    return to_ret

def expand_object_json(e):
    for k, v in e.items():
        if type(v) == str and k != "coder_email":
            try:
                e[k] = json.loads(v)
            except:
                pass
    return e

BAD_EMAILS = ["davidbstein@gmail.com", "dbs438@nyu.edu", "emily.a.moberg@gmail.com"]
_data_1 = load_json_folder(f"{DATA_FOLDER}/RAW_DATA/privacycoding.com")
_data_2 = load_json_folder(f"{DATA_FOLDER}/RAW_DATA/documentcoding.com")
d1 = to_dicts(_data_1, BAD_EMAILS)
d2 = to_dicts(_data_2, BAD_EMAILS)
print(d1.keys())
print(d2.keys())

def connect(dataset, parent_name, child_name):
    """
    for all parents and children, add object references in both directions.
    """
    parent_id_key = f"{parent_name}_id"
    parent_ref_key = f"{parent_name}"
    child_ref_key = f"{child_name}"
    children = dataset[child_name.replace("_", "")]
    parents = dataset[parent_name.replace("_", "")]
    for child in children.values():
        parent_id = child.get(parent_id_key)
        parent = parents.get(parent_id)
        child[parent_ref_key] = parent
        if parent:
            if parent.get(child_ref_key):
                parent[child_ref_key].append(child)
            else:
                parent[child_ref_key] = [child]

def augment_timing_sessions(d, default_coding):
    """
    adds a `coding_instance_id` to each timing session.
     - determines coding instance using c_id, pi_id, and email.
    """
    c_pi_tuple2ci = {
        (ci['coding_id'], ci['policy_instance_id'], ci['coder_email']): ci
        for ci in d['codinginstance'].values()
    }
    problem_count = 0
    for t_sess in d['timingsession'].values():
        key = (t_sess['coding_id'], t_sess['policy_instance_id'], t_sess['coder_email'])
        altkey = (default_coding, *key[1:])
        if key not in c_pi_tuple2ci:
            if altkey in c_pi_tuple2ci:
                key=altkey
            else:
                c = d['coding'].get(key[0])
                pi = d['policyinstance'].get(key[1])
                info = [t_sess['id'], key, len(t_sess['question_timings']), c is None, pi is None]
                if not t_sess['question_timings']:
                    #print(f"PROBLEM:{info}")
                    problem_count+=1
                else:
                    #print(info)
                    pass
                continue
        assert c_pi_tuple2ci[key]['coder_email'] == t_sess['coder_email'], \
            f"DOESNT MATCH: {t_sess['id']}{key} {c_pi_tuple2ci[key]['coder_email']} {t_sess['coder_email']}"
        t_sess['coding_instance_id'] = c_pi_tuple2ci[key]['id']
    print(f"{problem_count}/{len(d['timingsession'])} problems")

dict_keys(['policyinstance', 'coding', 'timingsession', 'codinginstance', 'policy'])
dict_keys(['policyinstance', 'coding', 'timingsession', 'codinginstance', 'policy'])


## RUNNER: Connections

In [5]:
def do_connections(d, default_coding):
    connect(d, 'policy', 'policy_instance')
    connect(d, 'policy_instance', 'coding_instance')
    connect(d, 'coding', 'coding_instance')
    augment_timing_sessions(d, default_coding)
    connect(d, 'coding_instance', 'timing_session')

print("DOCUMENTCODING")
do_connections(d1, 6)

print("PRIVACYCODING")
do_connections(d2, 10)

DOCUMENTCODING
0/139 problems
PRIVACYCODING
89/1969 problems


### Lookup Tables

Question lookups:
```
    [year, c_id, q_id]: {
        category, type, option, info
    }
    
    lookup_question(year, c_id, q_id)  -> Question
    lookup_all_question_versions(q_id) -> list<Question>
```

In [6]:
def build_question_lookup_dict(d, year):
    """
    creates a dict of questions:
        [year, c_id, q_id]: {
            category, type, option, info
        }
    """
    to_ret = dict()
    for c_id, coding in d['coding'].items():
        if 'categories' in coding:
            id_key = "id"
            cats = coding['categories']
        else:
            id_key = "identifier"
            cats = [{"label": None, "questions": coding['questions']}]
        for cat in cats:
            for question in cat['questions']:
                if id_key not in question:
                    continue
                if "questionOptions" in question:
                    options = {
                        o['value']: o['label'] 
                        for o in question['questionOptions']
                    }
                elif "values" in question:
                    if question['values'] and type(question['values'][0]) == str:
                        options = {
                            o: None
                            for o in question['values']
                        }
                    else:
                        options = {
                            o['value']: o['label'] 
                            for o in question['values']
                        }
                else:
                    print(question.keys())
                type_ = question['type']
                if type(type_) != str:
                    type_ = type_['value']
                to_ret[(year, c_id, question[id_key])] = {
                    "category": cat['label'],
                    "type": type_,
                    "options": options,
                    "info": question.get("info")
                }
    return to_ret

In [7]:
_question_lookups = {}
_question_lookups.update(build_question_lookup_dict(d1, "2021"))
_question_lookups.update(build_question_lookup_dict(d2, "2020"))
def lookup_all_question_versions(q_id):
    return [q for k, q in _question_lookups.items() if k[2] == q_id]

def lookup_question(year, c_id, q_id, suppress_options=False):
    for possible_coding_id in range(c_id, 0, -1):
        key = (year, possible_coding_id, q_id)
        if key in _question_lookups:
            return {
                k: v 
                for k, v in _question_lookups[key].items() 
                if not suppress_options or k != "options"
            }

def question_id_sort_fn(q_id):
    re_number = "(\d+(?:\.\d+)?)"
    re_year = "(\d\d\d\d)"
    if q_id == "PP_in_TOU":
        q_id = "v999_2021"
    found = re.findall(f"v_?{re_number}(.*)_{re_year}", q_id)
    num, _, year = found[0]
    return year, float(num)

 - Policy
     - scrape_date
     - id
     - info `{company_name, site_name}`
     - questions `{question_id: list(<answer>)}`
 - Answer
     - values
     - confidence
     - comments
     - timing_focus
     - timing_blur
     - coder_email
     - coding_id
     - date
     - question

In [8]:
# for testing
def get_sample_pol(d):
    for p_id, p in d['policy'].items():
        pi_list = p.get("policy_instance", [])
        for pi in pi_list:
            ci_list = pi.get("coding_instance", [])
            for ci in ci_list:
                return p
    return None

## Cleaning phase 1:

Now that database objects are linked, normalize policy objects and map `Policy -> list<AnswerInstance>`

AnswerInstances are based on question id, and not grouped by coding.


In [9]:

def get_questions_from_ci(ci, year):
    """
    returns a dict of questions, with a sum of all timing records
    {
        <q_id>: AnswerInstance
    }
    """
    to_ret = {}
    for question_id, q in ci['coding_values'].items():
        if not q or not (question_id.startswith("v") or question_id == "PP_in_TOU"):
            continue
        # NEW: remove redundancy
        timing_focus, timing_blur = None, None
        coding_date = ci['created_dt'].split(" ")[0]
        if ci.get('timing_session'):
            timing_focus, timing_blur = 0, 0
        for t_sess in ci.get('timing_session', []):
            for q_id, t_record in t_sess['question_timings'].items():
                if q_id == question_id:
                    timing_blur += t_record['total_blur']/1000
                    timing_focus += t_record['total_focus']/1000
                    coding_date = ts_to_date(t_record['start_ts'])
        to_ret[question_id] = {
            "values": [k for k, v in q['values'].items() if v],
            "confidence": q['confidence'],
            "comments": q.get("comment"),
            "timing_focus": timing_blur,
            "timing_blur": timing_focus,
            "coder_email": ci['coder_email'],
            "coding_id": f"{year}-{ci['coding']['id']}",
            "date": coding_date,
            "question": lookup_question(year, ci['coding']['id'], question_id, suppress_options=True),
            
            # "sentences"
        }
    return to_ret

def clean_policy(p, year, info_keys):
    """
    for each policy, returns:
    {
        info: {<info_key>: value}
        questions: {<q_id>: list<Answer>}
    }
    
    Only picks one answer per coder
    """
    to_ret = {}
    to_ret['info'] = {k: p[k] for k in info_keys}
    pi_list = p.get("policy_instance", [])
    question_dict = defaultdict(list)
    to_ret['questions'] = {}
    for pi in pi_list:
        ci_list = pi.get("coding_instance", [])
        for ci in ci_list:
            for question_id, question in get_questions_from_ci(ci, year).items():
                question_dict[question_id].append(question)
        for question_id, ans_list in question_dict.items():
            to_ret['questions'][question_id] = ans_list
    return to_ret


## cleaning phase 2:

Remove any instances of multiple answers to the same question from the same coder.

Merge sites coded in both 2020 and 2021


__NOTE__ SKETCHY SORTING FOR PICKING BEST ANSWER

In [10]:
def filter_double_vote_answers(answer_list):
    return {
        ans['coder_email']: ans 
        for ans in sorted(answer_list, key=lambda q:q['date'])
    }.values()


def merge_pols(p_list):
    """
    Merge a list of policies that correspond to the same site.
    
     - overwrites info entries
     - extends repeat question instances {<q_id>: list<AnswerInstance>}
    
    """
    to_ret = {}
    to_ret['info']={}
    to_ret['questions'] = defaultdict(list)
    for p in p_list:
        to_ret['info'].update(p['info'])
        for q_id, q in p['questions'].items():
            to_ret['questions'][q_id].extend(q)
    for q_id, answer_list in to_ret['questions'].items():
        to_ret['questions'][q_id] = filter_double_vote_answers(answer_list)
    return to_ret


def get_cleaned_policies(d1, d2):
    """
    returns a dict of cleaned policy lists:
    {
        <Policy.site_name>: list<CleanPolicy>
    }
    """
    d1_info_keys = ["company_name", "site_name", "categories"]
    d2_info_keys = ["company_name", "site_name"]
    cleaned_pols = defaultdict(list)
    for p in d2['policy'].values():
        cp = clean_policy(p, '2020', d2_info_keys)
        cleaned_pols[cp['info']['site_name']].append(cp)
    for p in d1['policy'].values():
        cp = clean_policy(p, '2021', d1_info_keys)
        cleaned_pols[cp['info']['site_name']].append(cp)
    return [
        merge_pols(p_list)
        for p_list in cleaned_pols.values()
    ]

## Cleaning Step 3

Remove any questions with less than 2 answers.

Remove any policies that aren't mostly coded.

In [351]:
FINAL_QUESTION_IDS = []
for cat in d1['coding'][6]['categories']:
    if "Skip me" in cat['label']:
        continue
    for q in cat['questions']:
        FINAL_QUESTION_IDS.append(q['id'])
len(FINAL_QUESTION_IDS)

64

In [352]:
def worth_keeping(cp):
    return len(cp['questions'])>50

def clean_questions(cp):
    to_ret = {k:v for k, v in cp.items()}
    to_ret['questions'] = {
        q_id: question for q_id, question in cp['questions'].items()
        if len([answer for answer in question if True]) >= 2
        and q_id in FINAL_QUESTION_IDS
    }
    return to_ret

In [353]:
clean_pols = get_cleaned_policies(d1, d2)

CODED_POLICIES = [
    cp for cp in 
    map(clean_questions, clean_pols) 
    if worth_keeping(cp)
]
len(CODED_POLICIES)

113

In [354]:
ALL_TIMED_ANSWERS = []
ALL_ANSWERS = []
for pol in CODED_POLICIES:
    for q_id, answers in pol['questions'].items():
        for answer in answers:
            to_add = {"policy": pol['info']['site_name'], "question_id": q_id}
            to_add.update(answer)
            ALL_ANSWERS.append(to_add)
            if answer['timing_focus']:
                ALL_TIMED_ANSWERS.append(to_add)


# organizational scripts

In [293]:
def summarize_cleaned_pols(pol_list):
    print(''.join(
        f"\n{count:>3}. {pol['info']['site_name']+' ':_<30} {len(pol['questions'].keys())}"
         for count, pol in enumerate(sorted(pol_list, key=lambda p: len(p['questions'].keys())))
    ))

def summarize_cleaned_pol(pol):
    print(f"{pol['info']['site_name']+' ':_<30} {len(pol['questions'].keys())}")
    print(''.join(sorted((f"{k:<31}" for k, v in pol['questions'].items()), 
                          key=question_id_sort_fn)))
        

In [294]:
FMW_EMAIL = "florencia.m.wurgler@gmail.com"
NON_MATCH_KEY = "unresolved non-matching responses"

In [295]:
def _coders_match(answers):
    return len(set(tuple(a['values']) 
            for a in answers 
            if a['values'] and a['coder_email'] != FMW_EMAIL
    )) == 1

def resolve_answer(answer_list_for_question):
    answers = {answer['coder_email']: answer for answer in answer_list_for_question}
    if FMW_EMAIL in answers:
        return ','.join(answers["florencia.m.wurgler@gmail.com"]['values'])
    else:
        # This is a broken BODGE
        if _coders_match(answer_list_for_question):
            return ','.join(list(answers.values())[0]['values'])
        else:
#             print(answers)
            return NON_MATCH_KEY
        
def resolve_agreement(answer_list_for_question):
    answers = {answer['coder_email']: answer for answer in answer_list_for_question}
    if _coders_match(answer_list_for_question) and FMW_EMAIL in answers:
        return "reviewer_override"
    if _coders_match(answer_list_for_question):
        return "full_agreement"
    if not _coders_match(answer_list_for_question):
        return "disagreement"

def agreement_label(answer_list_for_question):
    answers = {answer['coder_email']: answer for answer in answer_list_for_question}
    if "florencia.m.wurgler@gmail.com" in answers:
            return 1
        
def make_row(pol, resolver):
    question_row = [[q_id, resolve_answer(q)] for q_id, q in pol['questions'].items()]
    return dict(
        company_name=pol['info']['site_name'],        
        **dict(question_row)
    )

def save_answer_csv(policy_list, resolver, filename):
    all_rows = [make_row(p, resolver) for p in policy_list]
    fieldnames = sorted(set([i for i in sum(
        (list(r.keys()) for r in all_rows)
        , [])]))
    with open(f"{DATA_FOLDER}/CSVs/{filename}.csv", 'w') as f:
        writer=csv.DictWriter(f, fieldnames)
        writer.writeheader()
        writer.writerows(all_rows)
        

In [296]:
save_answer_csv(CODED_POLICIES, resolve_answer, "coding_values")
save_answer_csv(CODED_POLICIES, resolve_agreement, "agreement")

# Summary Statistics

In [297]:
from textchart import textchart
from collections import Counter

Helper functions
```python
lookup_all_question_versions(question_id)
lookup_policy(site_name)
resolve_answer(policy['questions'][q_id])

CODED_POLICIES
ALL_TIMED_ANSWERS
ALL_ANSWERS
```

In [298]:
print(json.dumps(ALL_TIMED_ANSWERS[0], indent=1))

{
 "policy": "match.com",
 "question_id": "v17_2020",
 "values": [
  "DND"
 ],
 "confidence": "4",
 "comments": "",
 "timing_focus": 95679.711,
 "timing_blur": 74.386,
 "coder_email": "msr634@nyu.edu",
 "coding_id": "2020-6",
 "date": "2020-07-11",
 "question": {
  "category": null,
  "type": "singleselect",
  "info": "Third party tracking: site allows third parties to place advertisements that may track user behavior?"
 }
}


In [299]:
for p in CODED_POLICIES: 
    print(json.dumps(list(p['questions'][FINAL_QUESTION_IDS[0]]), indent=2))
    break

[
  {
    "values": [
      "1"
    ],
    "confidence": "4",
    "comments": "",
    "timing_focus": 0,
    "timing_blur": 0,
    "coder_email": "msr634@nyu.edu",
    "coding_id": "2020-10",
    "date": "2020-07-09",
    "question": {
      "category": null,
      "type": "singleselect",
      "info": "Is the CCPA section in a separate link (as opposed to in the same privacy policy?)"
    }
  },
  {
    "values": [
      "0"
    ],
    "confidence": "5",
    "comments": "",
    "timing_focus": 0,
    "timing_blur": 0,
    "coder_email": "ns4649@nyu.edu",
    "coding_id": "2020-10",
    "date": "2020-07-29",
    "question": {
      "category": null,
      "type": "singleselect",
      "info": "Is the CCPA section in a separate link (as opposed to in the same privacy policy?)"
    }
  }
]


In [300]:
def responses_for_question(q_id, timed_only=True):
    answer_list = ALL_TIMED_ANSWERS if timed_only else ALL_ANSWERS
    answer_dict = defaultdict(list) 
    for a in answer_list:
        if a['question_id'] == q_id:
            answer_dict[(a['coder_email'], a['policy'])].append(a)
    return [sorted(a, key=lambda e: e['date'])[-1] for a in answer_dict.values()]

In [491]:
def answers_for_question(q_id, timed_only=False):
    answer_list = ALL_TIMED_ANSWERS if timed_only else ALL_ANSWERS
    answer_dict = defaultdict(list) 
    for a in answer_list:
        if a['question_id'] == q_id:
            answer_dict[(a['policy'])].append(a)
    return [sorted(a, key=lambda e: e['date']) for a in answer_dict.values()]

In [329]:
def response_clusters_for_question(q_id):
    answer_list = ALL_ANSWERS
    answer_dict = defaultdict(list) 
    for a in answer_list:
        if a['question_id'] == q_id:
            answer_dict[(a['policy'])].append(a)
    return [sorted(a, key=lambda e: e['date']) for a in answer_dict.values()]

In [457]:
UNRECONSILED = "UNRECONSILED"
CONFIDENCES = ['', "1: lowest", "2: low", "3: moderate", "4: high", "5: highest"]
def response_cluster_to_answer_info(responses):
    coder_responses = [r for r in responses if r['coder_email'].endswith("@nyu.edu")]
    grader_responses = [r for r in responses if not r['coder_email'].endswith("@nyu.edu")]
    assert len(grader_responses) <= 1
    coder_response_set = [' & '.join(sorted(resp['values'])) for resp in coder_responses]
    all_response_set = [' & '.join(sorted(resp['values'])) for resp in responses]
    coders_agree = len(set(coder_response_set)) == 1
    grader_overrules = coders_agree and len(set(all_response_set)) != 1
    agreement = ""
    if grader_responses:
        answer = " & ".join(grader_responses[0]['values'])
    elif coders_agree:
        answer = " & ".join(coder_responses[0]['values'])
    else:
        answer = UNRECONSILED
    if coders_agree:
        agreement = "coders agree"
    elif coders_agree and len(set(all_response_set)) != 1:
        agreement = "coders agree, all incorrect"
    elif answer == UNRECONSILED:
        agreement = "unreconsiled disagreement"
    elif not coders_agree and len(set(coder_response_set)) == len(set(all_response_set)):
        agreement = "coders disagree, ≥1 coder match grader"
    elif not coders_agree and len(set(coder_response_set)) == len(set(all_response_set)):
        agreement = "coders disagree, 0 coders match grader"
    confidences = [int(resp["confidence"]) for resp in coder_responses if resp.get("confidence")]
    return {
        "num_coders": len(coder_response_set),
        "coders_agree": coders_agree,
        "grader_overruled": grader_overrules,
        "agreement": agreement,
        "answer": answer,
        "avg_confidence": "not reported" if not confidences else f"{round(2*sum(confidences) / len(confidences))/2:0.1f}",
        "low_confidence": "not reported" if not confidences else CONFIDENCES[list(sorted(confidences))[0]],
        "high_confidence": "not reported" if not confidences else CONFIDENCES[list(sorted(confidences))[-1]],
        "confidences": confidences,
    }

In [458]:
_inner_width = 85
_outer_width = 90

def cluster_stat(key="answer"):
    def inner(q_id):
        clusters = response_clusters_for_question(q_id)
        answer_infos = list(map(response_cluster_to_answer_info, clusters))
        return dict(Counter([info[key] for info in answer_infos]))
    return inner

def confidences(q_id):
    clusters = response_clusters_for_question(q_id)
    answer_infos = list(map(response_cluster_to_answer_info, clusters))
    confidences = sum([info["confidences"] for info in answer_infos], [])
    return dict(Counter(confidences))

def variance_sorter(fn):
    def inner(q_id):
        vals = sorted(fn(q_id).values())
        if len(vals) >= 2:
            a, b = vals[-2:]
        else:
            a, b = 1, 1
        return a/b
    return inner

def draw_stat(fn, name):
    with open(f"{DATA_FOLDER}/2022-09-11_{name}.txt", "w") as f:
        for cat in d1['coding'][6]['categories']:
            if "Skip me" in cat['label']:
                continue
            entry = []
            entry.append(textchart.add_border(cat['label'], max_width=_inner_width))
#             print(json.dumps(cat, indent=2))
            for q in sorted(cat['questions'], key=variance_sorter(fn)):
                q_id = q['id']
                question = summarize_question(q_id)['question']
                counts = fn(q_id)
                unreconsiled = counts.get(UNRECONSILED)
                q_entry = [
                    question['info'],
                    textchart.bar_graph({k:v for k,v in counts.items() if k!=UNRECONSILED}),
                ]
                if unreconsiled:
                    q_entry.append(f"{f'({unreconsiled} unreconsiled disagreements)':>{_inner_width-2}}")
                entry.append(textchart.add_border("\n".join(q_entry), max_width=_inner_width))
            f.write(textchart.add_border('\n'.join(entry), max_width=_outer_width, bold=True))
            f.write("\n"*5)
draw_stat(cluster_stat("answer"), "policy_breakdown")
draw_stat(cluster_stat("agreement"), "agreement")
draw_stat(cluster_stat("avg_confidence"), "self_reported_confidence")
draw_stat(confidences, "self_reported_confidence")

In [469]:
def format_table(array):
    col_widths = []
    for row in array:
        for i, col in enumerate(row):
            if len(col_widths) <= i:
                col_widths.append(0)
            col_widths[i] = max(col_widths[i], len(str(col)))
    to_ret = []
    spacer = ["\n"] + [f"├─{'':─^{width}}─" for width in col_widths] + ["─┤"]
    for row in array:
        to_ret.extend(spacer)
        to_ret.append("\n")
        to_ret.extend([f"│ {col:>{width}} " for width, col in zip(col_widths, row)])
        to_ret.append(" │")
    to_ret.extend(spacer)
    return ''.join(to_ret)

def format_lookup_table(my_dict, row_keys=None, col_keys=None):
    row_keys = row_keys or sorted(set([k[0] for k in my_dict]))
    col_keys = col_keys or sorted(set([k[1] for k in my_dict]))
    return format_table(
        [[''] + [(col) for col in col_keys]] + 
        [
        [(row)] + [my_dict.get((row, col), '') for col in col_keys] 
        for row in row_keys
    ])


In [470]:

def cluster_stat_comparison(k1="low_confidence", k2="high_confidence"):
    def inner(q_id):
        clusters = response_clusters_for_question(q_id)
        answer_infos = list(map(response_cluster_to_answer_info, clusters))
        return dict(Counter([(info[k1], info[k2]) for info in answer_infos]))
    return inner


def draw_table(fn, name, row_keys=None, col_keys=None):
    with open(f"{DATA_FOLDER}/2022-09-11_{name}.txt", "w") as f:
        for cat in d1['coding'][6]['categories']:
            if "Skip me" in cat['label']:
                continue
            entry = []
            entry.append(textchart.add_border(cat['label'], max_width=_inner_width))
#             print(json.dumps(cat, indent=2))
            for q in sorted(cat['questions'], key=variance_sorter(fn)):
                q_id = q['id']
                question = summarize_question(q_id)['question']
                data = fn(q_id)
                q_entry = [
                    question['info'],
                    format_lookup_table(data, row_keys=row_keys, col_keys=col_keys),
                ]
                entry.append(textchart.add_border("\n".join(q_entry), max_width=_inner_width))
            f.write(textchart.add_border('\n'.join(entry), max_width=_outer_width, bold=True))
            f.write("\n"*5)

draw_table(cluster_stat_comparison(), "self-reported-confidence_pairs", row_keys=CONFIDENCES[1:], col_keys=CONFIDENCES[1:])

In [333]:
summarize_question('v72.1_2020.1')['question']

{'category': None,
 'type': 'singleselect',
 'options': {'0': "[ 0 ] no, it's part of the same privacy policy",
  '1': "[ 1 ] yes, it's on a separate link or separate document",
  '.': '[ . ] N/A -there is no CCPA section or CCPA reference in the contract'},
 'info': 'Is the CCPA section in a separate link (as opposed to in the same privacy policy?)'}

In [481]:
def summarize_question(q_id):
    final_answers = defaultdict(int)
    for policy in CODED_POLICIES:
        answers = policy['questions'].get(q_id)
        if answers:
            resolved = resolve_answer(answers)
            if resolved:
                final_answers[resolved] += 1
    a_list, q_list = question_data(q_id)
    return dict(
        question=q_list[-1],
        final_answers=final_answers,
        agreement_frequency=1
    )

In [482]:
NEWLINE = "\n"
def render_answer_frequency(answer_counts, all_answers):
    max_val = max(answer_counts.values())
    to_show = [(v.replace(',', ' & '), c) for v, c in sorted(answer_counts.items()) if v != NON_MATCH_KEY]
    gap = max([20] + [len(str(v)) for v, _ in to_show])
    to_ret = [
f"""{textchart.add_border('Coding Summary', fit=True)}
policies with question fully coded: {sum(answer_counts.values())}
answers recorded: {len(all_answers)}

{answer_counts[NON_MATCH_KEY]} unresolved, non-matching responses!"""]
    for val, count in to_show:
        to_ret.append(f"{val:>{gap}}: {'':■>{40*count/max_val}} {count}")
    return textchart.add_border('\n'.join(to_ret), max_width=83)

def render_summary():
    return textchart.add_border(f"""
Fully Coded Policies: {len(CODED_POLICIES)}
Recorded Answers: {len(ALL_ANSWERS)}
Recorded Answers with Timing: {len(ALL_TIMED_ANSWERS)}
""")

In [538]:
def timing_summary(timed_answers):
    if not timed_answers:
        return "(insufficient timing data)"
    timings = [(a['timing_focus'], a['timing_blur']) for a in sum(timed_answers, [])]
    if len(timings) > 1:
        chart = textchart.scatterplot(
            timings, 
            x_range=[0, max([t[0] for t in timings])],
            y_range=[0, max([t[1] for t in timings])],
            x_scale_fn=textchart.SCALE_FN.log, 
            y_scale_fn=textchart.SCALE_FN.log,
            x_label=f"time spent answering question\n (secs) (log scale)",
            y_label=f"extra time\nspent (secs)\n(log scale)",
            width=50,
            show_key=False,
            border=False,
        )
    else:
        chart = ""
    return "\n".join([
        f"timings recorded: {len(timed_answers)}",
        chart])

In [539]:
def render_question(q_id):
    data = summarize_question(q_id)
    question = data['question']
    answer_counts = data['final_answers']
    all_answers = answers_for_question(q_id, timed_only=False)
    timed_answers = answers_for_question(q_id, timed_only=True)
    if not answer_counts:
        return "(nothing recorded for this question)"
    full_summary = [
#         textchart.add_border(f'Question {q_id}', fit=True, bold=True),
        ''.join(data['question']['info']),
#         f"response validation used: {question['type']}",
#         "CODINGS:",
#         "\n".join([(v or '') for v in data['question']['options'].values()]),
#         render_answer_frequency(answer_counts, all_answers),
        timing_summary(timed_answers),
    ]
    return "\n\n".join(full_summary)

In [540]:
def render_cat_summary(cat):
    return textchart.add_border(cat['label'], max_width=80)

In [544]:
all_question_ids = sorted(
    set(
        k[2] for k in _question_lookups 
        if k[2].startswith('v')
    ), key=question_id_sort_fn
)
with open(f"{DATA_FOLDER}/2022-09-11_timing_data.txt", "w") as f:
#     f.write(render_summary())
    for cat in d1['coding'][6]['categories']:
        entry = [render_cat_summary(cat)]
        for q in cat['questions']:
            q_id = q['id']
            try:
                entry.append(textchart.add_border(render_question(q_id)))
            except Exception as e:
                raise e
        f.write(textchart.add_border("\n".join(entry), max_width=93, bold=True))
# with open(f"{DATA_FOLDER}/2022-08-26_data.txt", "r") as f:
#     print(f.read())

In [536]:
!head /home/stein/Dropbox/_Law/Documents_and_Administrata/_DATA/privacy_coding/2022-09-11_data.txt -n100

┌─────────────────────────────────────────────────────────────────────────────────┐
│                                                                                 │
│ Fully Coded Policies: 113                                                       │
│ Recorded Answers: 15344                                                         │
│ Recorded Answers with Timing: 3043                                              │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ CCPA                                                                                         ┃
┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛


# Exploration

In [None]:
from IPython import display
from IPython.core.display import HTML
import csv


from elasticsearch import Elasticsearch

ES = Elasticsearch("http://localhost:9200")
INDEX_prefix = "privacypolicy--"

In [30]:
print("\n"* 200)










































































































































































































