# Making test predictions

In [3]:
import csv
from sklearn.externals import joblib
from scipy.stats.stats import pearsonr
from scipy.stats.stats import pearsonr
from sklearn.metrics import make_scorer
from data import load_jsonl

# Record keys that make_xy() drops from the feature dicts by default.
IGNORE = {
    # Bookkeeping / raw-text fields.
    'id',
    'essay',
    # a11 BSAG scores.
    'a11_bsag_anxiety',
    'a11_bsag_depression',
    'a11_bsag_total',
    # pdistress scores (the names used as prediction outputs below).
    'a23_pdistress',
    'a33_pdistress',
    'a42_pdistress',
}

def dis_r(truth, predictions, key_reliab=0.77, pred_reliab=0.70):
    """Return the Pearson correlation corrected for attenuation.

    The raw Pearson r between ``truth`` and ``predictions`` is divided by
    ``sqrt(key_reliab * pred_reliab)``.

    Args:
        truth: Sequence of reference values.
        predictions: Sequence of predicted values, same length as ``truth``.
        key_reliab: Reliability coefficient applied to ``truth``.
        pred_reliab: Reliability coefficient applied to ``predictions``.

    Returns:
        The disattenuated correlation as a float. It can exceed 1 in
        magnitude because the divisor is below 1 for reliabilities < 1.
    """
    # numpy is not imported at module level in this file; import it here so
    # the function does not fail with NameError on `np`.
    import numpy as np

    raw_r = pearsonr(truth, predictions)[0]
    return raw_r / np.sqrt(key_reliab * pred_reliab)


# Wrap dis_r as a scikit-learn scorer object; higher values count as better.
dis_r_score = make_scorer(dis_r, greater_is_better=True)


def make_xy(data, label_name, include=None, exclude=IGNORE):
    """Split records into feature dicts and labels.

    Records whose ``label_name`` value is the empty string are skipped.
    For the remaining records, a key is kept as a feature when it passes
    both filters: it is in ``include`` (if ``include`` is non-empty) and
    it is not in ``exclude`` (if ``exclude`` is non-empty).

    Args:
        data: Iterable of dict-like records.
        label_name: Key whose value becomes the label of each record.
        include: Optional collection of keys to keep; falsy means "all".
        exclude: Collection of keys to drop; defaults to ``IGNORE``.

    Returns:
        A ``(X, y)`` tuple of parallel lists: feature dicts and labels.
    """
    X, y = [], []
    for record in data:
        target = record[label_name]
        if target == '':
            # No label available for this record.
            continue
        features = {
            key: value
            for key, value in record.items()
            if (not include or key in include)
            and (not exclude or key not in exclude)
        }
        X.append(features)
        y.append(target)
    return X, y


def todense(X):
    """Return the dense form of ``X`` by delegating to ``X.todense()``.

    NOTE(review): presumably applied to sparse feature matrices inside the
    saved models -- confirm against the code that builds them.
    """
    dense = X.todense()
    return dense


def to_range(score):
    """Clamp ``score`` to the closed interval [0, 9].

    Values already inside the interval are returned unchanged; values
    outside it are replaced by the integer bound 0 or 9.
    """
    # Upper bound first, then lower bound, mirroring max(0, min(score, 9)).
    capped = 9 if 9 < score else score
    return capped if capped > 0 else 0


def predict(data, clf, fname):
    """Write clamped predictions for ``data`` to a two-column CSV file.

    The file gets a header row ``Id,Pred`` followed by one row per
    instance, holding the instance's ``'id'`` value and its prediction
    passed through ``to_range``.

    Args:
        data: Sequence of instances. Each must support ``inst['id']``;
            the whole sequence is handed unchanged to ``clf.predict``.
        clf: Model object exposing ``predict(data)`` that yields one
            score per instance.
        fname: Path of the CSV file to write (opened in ``'w'`` mode, so
            an existing file is overwritten).
    """
    fieldnames = ['Id', 'Pred']
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to a writer.
    with open(fname, 'w', newline='') as out_file:
        writer = csv.DictWriter(out_file, delimiter=',', fieldnames=fieldnames)
        writer.writeheader()
        # Pair each prediction with the instance that was actually scored
        # (the `data` argument), not with a module-level variable.
        for inst, pred in zip(data, clf.predict(data)):
            writer.writerow({'Id': inst['id'], 'Pred': to_range(pred)})


# Load the test instances from test.jsonl via the project's load_jsonl helper.
# NOTE(review): presumably one dict per JSON line, each with an 'id' key
# (predict() reads inst['id']) -- confirm against data.load_jsonl.
test = load_jsonl('test.jsonl')

In [4]:
# For every pdistress output (a23, a33, a42) and every saved model variant,
# load the pickled model '<model>-<out>.pkl' and write its predictions on the
# test set to '<model>-<out>-test-no-round.csv'.
for a in [23, 33, 42]:
    out = f'a{a}_pdistress'
    for model in ['baseline', 'SGDR-all']:
        # joblib.load unpickles the file; only load model files you trust.
        clf = joblib.load(f'{model}-{out}.pkl')
        predict(test, clf, 
                f'{model}-{out}-test-no-round.csv')