In [1]:
# Reload edited modules automatically before each cell runs, so changes to project code are picked up live.
%load_ext autoreload
%autoreload 2

In [9]:
import datetime
import random
from collections import defaultdict
from typing import List, Tuple, Optional

import numpy as np
import seaborn as sns
from scipy.stats import pearsonr
from tqdm import tqdm

import pba.parse as ps
from pba.analysis import brier_scores_by_user
# NOTE(review): `wait_length` is used in later cells but is never imported or defined in this notebook —
# presumably it belongs in the pba.features import below; confirm and add it.
from pba.features import politics, personal, money, negative_formulation, difficulty, calc_features
from pba.prediction import Prediction, Response
from pba.task import MeanCredencePredictor, RandomPredictor, BaseRatePredictor, evaluate

# Default figure size for every plot in this notebook: 12 wide by 6.75 high (9 * 0.75).
sns.set(rc={'figure.figsize':(12,9*0.75)})

In [5]:
# Parse the training predictions from JSON (path is relative to the notebook's directory).
# The recorded run below shows a 9613-item progress bar that took roughly 53 s.
json_file = "../data/train-predictions.json"
predictions = ps.load_json_to_predictions(json_file)

100%|██████████| 9613/9613 [00:53<00:00, 179.79it/s]


In [17]:
# Result of brier_scores_by_user over all loaded predictions; passed to difficulty() in the cells below.
# Presumably a per-user Brier score lookup — confirm in pba.analysis.
brier_scores = brier_scores_by_user(predictions)

def hardest_events(preds):
    """Rank events from hardest to easiest.

    Each prediction is scored with ``difficulty(prediction, brier_scores)``.
    Predictions whose score is falsy (for example ``None`` or ``0``) are left
    out. The result is a list of ``(difficulty, event)`` tuples sorted in
    descending order.
    """
    scored = ((difficulty(candidate, brier_scores), candidate.event) for candidate in preds)
    ranked = [pair for pair in scored if pair[0]]
    ranked.sort(reverse=True)
    return ranked

# Show the ten highest-scoring (difficulty, event) pairs.
hardest_events(predictions)[:10]

[(0.8152479166666667,
  "KnaveOfAllTrades put 100% (or 0%) on PredictionBook on one of more of yesterday's meetup prediction"),
 (0.8033063127690101, 'Both backups fail'),
 (0.8030725490196078, 'My dad and step-mom divorce within the next two years'),
 (0.8025904761904762,
  'I will complete my next 30-day pushup challenge inside of 35 days'),
 (0.7953479166666667, 'FMK will claim that she has never smoked in her life'),
 (0.7932217821782178,
  'Should James Eagan Holmes be found guilty, he will be sentenced capital punishment.'),
 (0.7834063127690101,
  'US Federal Reserve stuck near the 0 bound (say, <1.5% with no willingness to increase it further in the immediate future) by Jan 1, 2016 (see http://www.global-rates.com/interest-rates/central-banks/central-bank-america/fed-interest-rate.aspx )'),
 (0.7771538461538461, 'I will get in >10 minutes of walking/running today'),
 (0.775695564516129,
  '[Holiday reading challenge: By 2019-02-10 EOD I will have finished at least] 1 book'),
 (

# Features

In [None]:
def explore(feat, preds=None, n_samples=10):
    """Print a right/wrong by feature-present/absent table for one feature, plus sample events.

    Only predictions whose ``known()`` is truthy are counted; each table cell
    is that cell's share of those predictions.

    Args:
        feat: Callable taking a prediction and returning a truthy value when
            the feature is present. Its ``__name__`` is used as the heading.
        preds: Predictions to analyse. Defaults to the notebook-level
            ``predictions`` when omitted.
        n_samples: Maximum number of feature-positive events to print.

    Returns:
        None. Everything is written to stdout.
    """
    if preds is None:
        preds = predictions

    known_preds = [pred for pred in preds if pred.known()]
    len_known = len(known_preds)
    print(feat.__name__.upper())
    if not len_known:
        # Nothing to divide by; report it instead of raising ZeroDivisionError.
        print("No known predictions to explore.")
        print()
        return

    # Tally with plain lists/counters so predictions do not have to be hashable.
    counts = {(is_right, has_feat): 0 for is_right in (True, False) for has_feat in (True, False)}
    pos = []
    for pred in known_preds:
        has_feat = bool(feat(pred))
        counts[bool(pred.right()), has_feat] += 1
        if has_feat:
            pos.append(pred)

    print("       F+    F-")
    print(f"Right: {counts[True, True]/len_known:.2}  {counts[True, False]/len_known:.2}")
    print(f"Wrong: {counts[False, True]/len_known:.2}  {counts[False, False]/len_known:.2}")
    print("Sample events:")
    # random.sample needs a sequence (sets are rejected on Python 3.11+) and
    # raises ValueError if asked for more items than exist, so cap the size.
    for pred in random.sample(pos, min(n_samples, len(pos))):
        print("\t", pred.event)
    print()
    
# Feature functions to summarise, one explore() report each.
# NOTE(review): `wait_length` is not imported or defined in any visible cell — confirm where it comes from.
features = [personal, money, negative_formulation, politics, wait_length]

for feat in features:
    explore(feat)

In [None]:
def diff(pred):
    """Return ``difficulty`` for ``pred``, evaluated against the notebook's ``brier_scores``."""
    score = difficulty(pred, brier_scores)
    return score

# Keep only the predictions for which known() is truthy.
known_predictions = [prediction for prediction in predictions if prediction.known()]
# Functions whose per-prediction values feature_descriptions() collects; `diff` supplies the difficulty column.
stats = [personal, money, negative_formulation, politics, diff, wait_length]
def feature_descriptions(preds=None, stat_fns=None):
    """Collect per-prediction statistics and print how each relates to difficulty.

    For every prediction, each function in ``stat_fns`` is evaluated and its
    value stored in a column keyed by the function's ``__name__``. A table is
    then printed with, for every column except ``"diff"``, the column mean and
    its Pearson correlation with the ``"diff"`` column.

    Boolean results are stored as 0/1 integers; every other result is stored
    as a float, so continuous values such as the difficulty score are not
    truncated to whole numbers. A prediction for which any statistic is
    ``None`` is skipped entirely, which keeps all columns the same length.

    Args:
        preds: Predictions to describe. Defaults to the notebook-level
            ``known_predictions``.
        stat_fns: Statistic functions to evaluate. Defaults to the
            notebook-level ``stats``.

    Returns:
        ``defaultdict(list)`` mapping function name to its list of values.
    """
    if preds is None:
        preds = known_predictions
    if stat_fns is None:
        stat_fns = stats

    records = defaultdict(list)
    n_rows = 0
    for prediction in preds:
        values = [stat(prediction) for stat in stat_fns]
        if any(value is None for value in values):
            continue  # incomplete row: dropping it keeps the columns aligned
        n_rows += 1
        for stat, value in zip(stat_fns, values):
            number = int(value) if isinstance(value, bool) else float(value)
            records[stat.__name__].append(number)

    print("Name\t             % w/ feat.\tcorr. w/ diff.")
    # .get avoids creating an empty "diff" column as a side effect of the lookup.
    target = records.get("diff", [])
    for stat in stat_fns:
        name = stat.__name__
        if name == "diff":
            continue
        column = records.get(name, [])
        pct = sum(column) / n_rows if n_rows else float("nan")
        # pearsonr needs at least two paired observations.
        if len(target) >= 2 and len(target) == len(column):
            corr = pearsonr(column, target)[0]
        else:
            corr = float("nan")
        print(f"{name:>20}:\t{pct:<.2}\t{corr:<0.2}")
    return records
# Collect the per-prediction values once; later cells read columns from this dict by function name.
records = feature_descriptions()

In [None]:
# Number of collected difficulty values. The column is keyed by the stat function's
# __name__, which is "diff"; "difficulty" is never stored, so the defaultdict would return [].
len(records['diff'])

In [None]:
# Difficulty against the `personal` column; x/y are passed by keyword because
# recent seaborn releases no longer accept them positionally.
ax = sns.scatterplot(x=records["personal"], y=records["diff"])
ax.set(xlabel="personal", ylabel="diff");

In [None]:
# Difficulty against the `wait_length` column; x/y are passed by keyword because
# recent seaborn releases no longer accept them positionally.
ax = sns.scatterplot(x=records["wait_length"], y=records["diff"])
ax.set(xlabel="wait_length", ylabel="diff");

In [None]:
# Total of the collected `money` values across known predictions.
sum(records['money'])

In [None]:
# Total of the collected `personal` values across known predictions.
sum(records['personal'])

In [None]:
# Total of the collected `negative_formulation` values across known predictions.
sum(records['negative_formulation'])

In [None]:
# Smallest collected `wait_length` value among known predictions.
min(records['wait_length'])

In [None]:
# Predictions whose wait_length falls below this cut-off are printed for inspection.
# The value is in whatever units wait_length returns — TODO confirm.
SUSPICIOUS_WAIT = -5000

suspicious = []
for pred in predictions:
    wait = wait_length(pred)  # evaluate once and reuse for both the test and the report
    if wait < SUSPICIOUS_WAIT:
        suspicious.append((wait, pred))
        print((wait, pred))
print(len(suspicious))

In [None]:
# Hand-built prediction used by the following cells to inspect wait_length.
# Its time_known (16:00) is earlier than its time_created (21:07:30) on the same day.
p = Prediction(event='At the end of the EA Weekend Away Debate, at least one person will change their opinion to favor poverty.', 
               number=20406, 
               outcome='wrong', 
               user='Peter Hurford', 
               time_created='2013-07-14 21:07:30+00:00', 
               time_known='2013-07-14 16:00:00+00:00', 
               responses=[Response(user='Peter Hurford', 
                                   time='2013-07-14 21:07:31+00:00', 
                                   actions={'credence': 10}), 
                          Response(user='Peter Hurford', 
                                   time='2013-07-14 21:08:49+00:00', 
                                   actions={'outcome': 'wrong'})])

In [None]:
# wait_length for the hand-built prediction `p`.
wait_length(p)

In [None]:
# Inspect the time_known value stored on `p`.
p.time_known

In [None]:
# Inspect the time_created value stored on `p`.
p.time_created

In [None]:
# Gap between the prediction's creation time and 21:07:31 UTC, the time of its first response.
# Uses the standard-library UTC tzinfo so this cell does not depend on `tzutc`,
# which is only imported in a later cell.
# NOTE(review): assumes Prediction stores time_created as a timezone-aware datetime
# rather than the raw string passed to the constructor — confirm.
p.time_created - datetime.datetime(2013, 7, 14, 21, 7, 31, tzinfo=datetime.timezone.utc)

In [None]:
from dateutil.tz import tzutc