In [1]:
import csv
from sklearn.externals import joblib
from scipy.stats.stats import pearsonr
from scipy.stats.stats import pearsonr
from sklearn.metrics import make_scorer
from data import load_jsonl

# Keys that make_xy() leaves out of the feature dict by default.
IGNORE = {
    # per-age distress columns (the same columns evaluate() scores)
    'a23_pdistress', 'a33_pdistress', 'a42_pdistress',
    # raw text and record identifier
    'essay', 'id',
    # a11_bsag_* columns
    'a11_bsag_total', 'a11_bsag_anxiety', 'a11_bsag_depression',
}

def dis_r(truth, predictions, key_reliab=0.77, pred_reliab=0.70):
    """Return the disattenuated Pearson correlation of two sequences.

    The plain Pearson r between ``truth`` and ``predictions`` is divided by
    ``sqrt(key_reliab * pred_reliab)``.

    Args:
        truth: sequence of reference values.
        predictions: sequence of predicted values, same length as ``truth``.
        key_reliab: reliability assigned to the reference values.
        pred_reliab: reliability assigned to the predictions.

    Returns:
        The corrected correlation coefficient.
    """
    raw_r = pearsonr(truth, predictions)[0]
    correction = np.sqrt(key_reliab * pred_reliab)
    return raw_r / correction


# Wrap dis_r as a scikit-learn scorer object; greater_is_better=True marks
# larger dis_r values as the better ones.
dis_r_score = make_scorer(dis_r, greater_is_better=True)


def make_xy(data, label_name, include=None, exclude=IGNORE):
    """Split records into feature dicts and labels.

    Args:
        data: iterable of dict-like records.
        label_name: key whose value becomes the label; records where it is
            the empty string are skipped.
        include: when truthy, only keys contained in it are kept.
        exclude: when truthy, keys contained in it are dropped.

    Returns:
        ``(X, y)``: a list of feature dicts and the parallel list of labels.
    """
    def wanted(key):
        # The include check runs first, then the exclude check.
        if include and key not in include:
            return False
        return not (exclude and key in exclude)

    features, labels = [], []
    for record in data:
        target = record[label_name]
        if target == '':
            continue
        features.append({key: value for key, value in record.items() if wanted(key)})
        labels.append(target)
    return features, labels


def todense(X):
    """Return ``X.todense()``; ``X`` must provide a ``todense`` method."""
    dense = X.todense()
    return dense


def to_range(score):
    """Clamp ``score`` to the closed interval from 0 to 9."""
    # Upper bound first, then lower bound, with the same tie-breaking as
    # max(0, min(score, 9)).
    capped = 9 if 9 < score else score
    return capped if capped > 0 else 0


# Test instances used by every later cell; they are iterated and indexed by
# string key (e.g. i['id'], i['cntrl_gender']).
# NOTE(review): load_jsonl comes from the local `data` module -- presumably it
# returns a list with one dict per JSON line; confirm.
test = load_jsonl('test.jsonl')

In [2]:
#!/usr/bin/env python3
########################################
## CLP18_eval: Script
## Version: 0.1 (beta)
##
## The following script is intended to be used for evaluating
## the 2018 CLPsych Shared Task, Subtasks A and B. Results are
## reported using Mean Absolute Error and Disattenuated
## Pearson Correlation.
##
## The file expects two input files:
## The first should be a CSV with the first column containing
## the id and the second column containing the true value.
##
## The second should be a CSV with the first column containing
## the id and the second column containing the predicted value.
##
## A header row is expected for both files.
##
## Command to run: CLP18_eval_v0_1.py [true csv] [pred csv]
##
## Any ids missing in the prediction file are treated as
## the mean of all predictions. Any extra ids in the
## prediction file are ignored.
##
## Creators: Anvesh Myla, Veronica Lynn, H. Andrew Schwartz
from __future__ import print_function

import sys
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from sklearn.metrics import mean_absolute_error


##PARAMETERS:
## Both values are read as module-level globals by get_disr(), which divides
## the Pearson r by sqrt(key_reliab * pred_reliab).
key_reliab = 0.77  # reliability of the key (an inverse of measurement error)
pred_reliab = 0.70  # reliability expected of the predictions

def _reportable_id(raw_id):
    """Return ``raw_id`` as an int when possible, otherwise unchanged."""
    try:
        return int(raw_id)
    except (TypeError, ValueError):
        # Non-numeric ids (e.g. strings) are reported as they are.
        return raw_id


def get_disr(df_key, df_pred):
    """Align predictions with the key and score them.

    Args:
        df_key: DataFrame whose first column holds the ids and which has a
            "True" column with the reference values.  A "Pred" column with
            the aligned predictions is written into this frame (in place).
        df_pred: DataFrame with "Id" and "Pred" columns.  Ids that do not
            occur in ``df_key`` are ignored for alignment but still count
            towards the fallback mean.  If an id occurs more than once, its
            first row is used.

    Returns:
        ``(mae, dis_r, true_values, pred_values)``: the mean absolute error,
        the Pearson correlation divided by ``sqrt(key_reliab * pred_reliab)``
        (module-level parameters), and the two aligned value arrays.
    """
    ##ALIGN THE DATA
    key_ids = df_key.iloc[:, 0]
    mean = np.mean(df_pred["Pred"].values)  # mean of predictions is used when no prediction is present.
    # One lookup table instead of scanning df_pred once per key row.
    pred_by_id = df_pred.drop_duplicates(subset="Id", keep="first").set_index("Id")["Pred"]
    has_pred = key_ids.isin(pred_by_id.index)
    aligned = key_ids.map(pred_by_id).where(has_pred, mean)
    # Assign by column name so the result does not depend on column positions.
    df_key["Pred"] = aligned.astype(float).values
    ids_not_in_pred = [_reportable_id(raw_id) for raw_id in key_ids[~has_pred]]

    ##NOTIFY OF ANY MISSING PREDICTIONS
    if len(ids_not_in_pred) == 0:
        print("All the IDs in csv file with true values are present in csv file with predicted values")
    else:
        print("IDs in csv file with true values but not in csv file with predicted values :", ids_not_in_pred)
        print("Number of IDs common to both the csv files :", len(df_key) - len(ids_not_in_pred))

    ##RUN EVALUATION METRICS:
    true_values = df_key["True"].values
    pred_values = df_key["Pred"].values
    mae = mean_absolute_error(true_values, pred_values)
    # Named differently from the notebook's dis_r() function to avoid shadowing it.
    disattenuated_r = pearsonr(true_values, pred_values)[0] / np.sqrt(key_reliab * pred_reliab)
    print(f'{mae}\t{disattenuated_r}')
    return mae, disattenuated_r, true_values, pred_values

In [3]:
# https://gist.github.com/jdmonaco/5922991
from __future__ import division
from collections import namedtuple

import numpy as np
import scipy.stats as st

TtestResults = namedtuple("Ttest", "T p")

def t_welch(x, y, tails=2):
    """Welch's t-test for two unequal-size samples, not assuming equal variances.

    Args:
        x, y: array-likes holding the two samples (at least two values each).
        tails: 1 for a one-tailed p-value, 2 for a two-tailed p-value.

    Returns:
        ``TtestResults(T, p)`` with the t statistic and the p-value.

    Raises:
        ValueError: if either sample has fewer than two observations.
    """
    assert tails in (1,2), "invalid: tails must be 1 or 2, found %s"%str(tails)
    x, y = np.asarray(x), np.asarray(y)
    nx, ny = x.size, y.size
    if nx < 2 or ny < 2:
        raise ValueError("t_welch needs at least two observations per sample")
    # Welch's test uses the unbiased sample variance (ddof=1); ndarray.var()
    # defaults to the population variance.
    vx, vy = x.var(ddof=1), y.var(ddof=1)
    sx, sy = vx / nx, vy / ny  # squared standard errors of the two means
    # Welch-Satterthwaite equation; kept real-valued (st.t accepts fractional df).
    df = (sx + sy)**2 / (sx**2 / (nx - 1) + sy**2 / (ny - 1))
    t_obs = (x.mean() - y.mean()) / np.sqrt(sx + sy)
    p_value = tails * st.t.sf(abs(t_obs), df)
    return TtestResults(t_obs, p_value)

In [4]:
# Social-class values that are reported as 'HIGH'.
HIGH = {'Professional', 'Managerial', 'Skilled non-manual'}
# Social-class values that are reported as 'LOW'.
LOW = {'Skilled manual', 'Partly skilled', 'Unskilled'}


def process_social_class(row):
    """Map a row's social class to 'HIGH', 'LOW' or None.

    The value is read with ``row.get`` from 'Cntrl_a11_social_class'; when
    that is missing or falsy, 'ntrl_a11_social_class' is tried instead.
    NOTE(review): the second key looks like a typo of the first -- confirm.

    Returns:
        'HIGH' if the value is in HIGH, 'LOW' if it is in LOW, else None.
    """
    social_class = row.get('Cntrl_a11_social_class')
    if not social_class:
        social_class = row.get('ntrl_a11_social_class')
    for bucket, members in (('HIGH', HIGH), ('LOW', LOW)):
        if social_class in members:
            return bucket
    return None


def _attach_demographics(frame, demos, filter_func, last_column):
    """Join demographics onto ``frame``, filter it, and keep 'Id'..``last_column``."""
    merged = pd.merge(frame, demos, on='Id', how='inner')
    merged['hilo_class'] = merged.apply(process_social_class, axis=1)
    if filter_func:
        merged = filter_func(merged)
    # Return an independent copy: get_disr() writes a 'Pred' column into the
    # key frame, and writing into a .loc slice triggers SettingWithCopyWarning.
    return merged.loc[:, 'Id':last_column].copy()


def evaluate(test, demos, filter_func=None, ages=(23, 33, 42),
             models=('baseline', 'SGDR-all'), prevalence_cutoff=4):
    """Score the saved models on (a subset of) the test instances.

    Args:
        test: test instances; each is indexed with 'id' and the whole
            collection is passed to ``clf.predict``.
        demos: DataFrame with an 'Id' column plus demographic columns; it is
            inner-joined onto both the key and the predictions.
        filter_func: optional callable taking and returning a DataFrame,
            applied after the join (the 'hilo_class' column is available to it).
        ages: age suffixes; each selects ``a{age}_pdistress``.
        models: model name prefixes of the ``{model}-a{age}_pdistress.pkl`` files.
        prevalence_cutoff: true values at or above this count towards ``prev``.

    Returns:
        A list of ``(age, model, mae, dis_r, true_values, pred_values)``
        tuples, one per age/model combination.
    """
    results = []
    for a in ages:
        out = f'a{a}_pdistress'
        # The key file depends on the age only, so read it once per age.
        raw_key = pd.read_csv(f'../data/clpsych_2018_test_data/a{a}_pdistress_KEY.csv',
                              skiprows=1, names=['Id', 'True'])
        for model in models:
            # NOTE: joblib.load unpickles the file; only load model files you trust.
            clf = joblib.load(f'{model}-{out}.pkl')
            df_key = _attach_demographics(raw_key, demos, filter_func, 'True')
            preds = [{'Id': inst['id'], 'Pred': to_range(pred)}
                     for inst, pred in zip(test, clf.predict(test))]
            df_pred = pd.DataFrame(preds, columns=['Id', 'Pred'])
            df_pred = _attach_demographics(df_pred, demos, filter_func, 'Pred')
            e, d, true_vals, pred_vals = get_disr(df_key, df_pred)
            n = len(df_key)
            # Share of key rows at or above the cutoff, read by column name.
            prev = int((df_key['True'] >= prevalence_cutoff).sum()) / n
            print(f'{a}\t{model}\td={d:.3f}\te={e:.3f}\tn={n}\tprev={prev:.2f}')
            results.append((a, model, e, d, true_vals, pred_vals))
    return results

In [5]:
# One row per test instance with its id, gender and social class; the keys are
# capitalized ('id' -> 'Id', 'cntrl_gender' -> 'Cntrl_gender', ...).
demos = pd.DataFrame([{k.capitalize(): v for k, v in i.items() 
                      if k in ['cntrl_gender', 'cntrl_a11_social_class', 'id']} 
                     for i in test], columns=['Id', 'Cntrl_gender', 'Cntrl_a11_social_class'])
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" to the output.
demos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
Id                        1000 non-null object
Cntrl_gender              1000 non-null int64
Cntrl_a11_social_class    1000 non-null object
dtypes: int64(1), object(2)
memory usage: 23.5+ KB
None


In [6]:
from collections import Counter

# Frequency of each gender code and each social class among the test instances.
gender = Counter(i['cntrl_gender'] for i in test)
cls = Counter(i['cntrl_a11_social_class'] for i in test)

# (age, model) -> {'M' | 'F': (mae, dis_r, true values, predicted values)}
all_results = {}

for g, count in gender.most_common():
    # Gender code 0 is stored under 'M', every other code under 'F'.
    sex = 'M' if g == 0 else 'F'
    per_gender = evaluate(test, demos, lambda df: df[df['Cntrl_gender'] == g])
    for age, model, e, d, true_vals, pred_vals in per_gender:
        bucket = all_results.setdefault((age, model), {})
        bucket[sex] = (e, d, true_vals, pred_vals)

All the IDs in csv file with true values are present in csv file with predicted values
0.936627335568334	0.020715876404240077
23	baseline	d=0.021	e=0.937	n=393	prev=0.04
All the IDs in csv file with true values are present in csv file with predicted values
0.9129991927098206	0.250480231418923
23	SGDR-all	d=0.250	e=0.913	n=393	prev=0.04
All the IDs in csv file with true values are present in csv file with predicted values
0.9049378565468633	0.049135967609835435
33	baseline	d=0.049	e=0.905	n=331	prev=0.05
All the IDs in csv file with true values are present in csv file with predicted values
0.9031168812132951	0.210542864768675
33	SGDR-all	d=0.211	e=0.903	n=331	prev=0.05
All the IDs in csv file with true values are present in csv file with predicted values
1.1968868095448093	-0.11481705572148841
42	baseline	d=-0.115	e=1.197	n=347	prev=0.09
All the IDs in csv file with true values are present in csv file with predicted values
1.2041132728927917	-0.015639809551966085
42	SGDR-all	d=-0.016	e=

In [14]:
pd.options.display.float_format = '{:,.3f}'.format
# Fixed label: the original f-string lacked '=' after F and printed e.g. "F471".
print(f'M={gender[0]}\tF={gender[1]}')
gender_data = []
for (age, model), values in all_results.items():
    i = {'Age': age, 'Model': model}
    # values[k] is (mae, dis_r, true values, predicted values); the Welch test
    # compares the predicted values ([-1]) of the two groups.
    t, p = t_welch(values['M'][-1], values['F'][-1])
    for k in 'MF':
        mae, dpc, _, _ = values[k]
        i[f'{k}_mae'] = mae
        i[f'{k}_dpc'] = dpc
    i['p'] = p
    gender_data.append(i)
# Only the columns listed here reach the table; the *_mae and p entries are dropped.
gender_df = pd.DataFrame(gender_data, columns=['Age', 'Model', 'M_dpc', 'F_dpc'])

M=529	F471


In [15]:
# Show the per-gender table ordered by age, then model name.
gender_df.sort_values(['Age', 'Model'])

Unnamed: 0,Age,Model,M_dpc,F_dpc
1,23,SGDR-all,0.25,0.231
0,23,baseline,0.021,0.307
3,33,SGDR-all,0.211,0.019
2,33,baseline,0.049,0.177
5,42,SGDR-all,-0.016,0.049
4,42,baseline,-0.115,0.053


In [16]:
# Emit the same sorted per-gender table as LaTeX source.
print(gender_df.sort_values(['Age', 'Model']).to_latex())

\begin{tabular}{lrlrr}
\toprule
{} &  Age &     Model &  M\_dpc &  F\_dpc \\
\midrule
1 &   23 &  SGDR-all &  0.250 &  0.231 \\
0 &   23 &  baseline &  0.021 &  0.307 \\
3 &   33 &  SGDR-all &  0.211 &  0.019 \\
2 &   33 &  baseline &  0.049 &  0.177 \\
5 &   42 &  SGDR-all & -0.016 &  0.049 \\
4 &   42 &  baseline & -0.115 &  0.053 \\
\bottomrule
\end{tabular}



In [20]:
# Count test instances per social-class group: 'HIGH', 'LOW', or None for
# values in neither set.
cls = Counter()
for i in test:
    if i['cntrl_a11_social_class'] in HIGH:
        label = 'HIGH'
    elif i['cntrl_a11_social_class'] in LOW:
        label = 'LOW'
    else:
        label = None
    cls[label] += 1

# (age, model) -> {'HIGH' | 'LOW': (dis_r, true values, predicted values)}
cls_results = {}
for g, count in cls.most_common():
    print(f'{g}\t{count}')
    if g is None:
        # `df['hilo_class'] == None` selects no rows, so evaluate() would be
        # handed empty frames; unclassified instances are counted but not scored.
        continue
    for age, model, e, d, true_vals, pred_vals in evaluate(test, demos, lambda df: df[df['hilo_class'] == g]):
        cls_results.setdefault((age, model), {})[g] = (d, true_vals, pred_vals)

LOW	672
All the IDs in csv file with true values are present in csv file with predicted values
1.1765914151652206	0.4352721627894312
23	baseline	d=0.435	e=1.177	n=493	prev=0.10
All the IDs in csv file with true values are present in csv file with predicted values
1.1549005554135436	0.465814833555509
23	SGDR-all	d=0.466	e=1.155	n=493	prev=0.10
All the IDs in csv file with true values are present in csv file with predicted values
1.2059715266851183	0.2505265294311525
33	baseline	d=0.251	e=1.206	n=446	prev=0.09
All the IDs in csv file with true values are present in csv file with predicted values
1.1573186259349584	0.2946246178769604
33	SGDR-all	d=0.295	e=1.157	n=446	prev=0.09
All the IDs in csv file with true values are present in csv file with predicted values
1.3837450812214789	0.24255276901251607
42	baseline	d=0.243	e=1.384	n=450	prev=0.13
All the IDs in csv file with true values are present in csv file with predicted values
1.3745577897268753	0.21269994625805835
42	SGDR-all	d=0.213	e

In [21]:
cls_data = []

keys = ['LOW', 'HIGH']
print(cls)

for (age, model), values in cls_results.items():
    # values[label] is (dis_r, true values, predicted values); the Welch test
    # compares the predicted values (index 2) of the two groups.
    t, p = t_welch(values['HIGH'][2], values['LOW'][2])
    i = {'Age': age, 'Model': model}
    for k in keys:
        # The first tuple element is the disattenuated correlation, not an MAE.
        dpc = values[k][0]
        i[k] = dpc
    i['p'] = p
    cls_data.append(i)
# Only Age, Model and the per-class columns reach the table; 'p' is dropped.
cls_df = pd.DataFrame(cls_data, columns=['Age', 'Model'] + keys)

Counter({'LOW': 672, 'HIGH': 328})


In [24]:
# Show the per-class table ordered by age, then model name.
cls_df.sort_values(['Age', 'Model'])

Unnamed: 0,Age,Model,LOW,HIGH
1,23,SGDR-all,0.466,0.234
0,23,baseline,0.435,0.213
3,33,SGDR-all,0.295,0.189
2,33,baseline,0.251,0.228
5,42,SGDR-all,0.213,0.094
4,42,baseline,0.243,0.109


In [22]:
# Emit the same sorted per-class table as LaTeX source.
print(cls_df.sort_values(['Age', 'Model']).to_latex())

\begin{tabular}{lrlrr}
\toprule
{} &  Age &     Model &   LOW &  HIGH \\
\midrule
1 &   23 &  SGDR-all & 0.466 & 0.234 \\
0 &   23 &  baseline & 0.435 & 0.213 \\
3 &   33 &  SGDR-all & 0.295 & 0.189 \\
2 &   33 &  baseline & 0.251 & 0.228 \\
5 &   42 &  SGDR-all & 0.213 & 0.094 \\
4 &   42 &  baseline & 0.243 & 0.109 \\
\bottomrule
\end{tabular}

