In [1]:
from dask.distributed import Client
# see https://github.com/dask/distributed/issues/4168
import multiprocessing.popen_spawn_win32

client = Client(n_workers=4)

In [2]:
from preprocess import Preprocessor

import ml_inference
from ml_inference.automl import AutoRegressor
from ml_inference.baseline import BaselineRegressor
from ml_inference.test import cv_test, gen_score_plot
from ml_inference.inference import explain_performance, explain_predictions, explain_correlations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

import random

np.random.seed(123456789)
random.seed(123456789)

pd.options.mode.chained_assignment = None
INFILE = '../data/pennycook_et_al_study2_clean.csv'

In [3]:
df = pd.read_csv(INFILE)
df = df[df.Treatment == 0].drop(columns='Treatment').reset_index(drop=True)
X, y = df.drop(columns='Diff'), df.Diff
y = (y - y.mean()) / y.std()
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
df.head()

Unnamed: 0,SharingType_1,SharingType_2,SharingType_3,SharingType_4,SharingType_6,SharingType_5,SocialMedia_1,SocialMedia_2,SocialMedia_3,SocialMedia_4,...,Male,Education,Income,English,Partisan,Social_Conserv,Economic_Conserv,Diff,Party,POTUS2016
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,19.0,3.0,1.0,5.0,4.0,4.0,-0.666667,Republican,Trump
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,16.0,7.0,1.0,3.0,2.0,2.0,0.0,Independent,Clinton
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,13.0,4.0,1.0,2.0,4.0,4.0,0.0,Democrat,Clinton
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,14.0,6.0,1.0,1.0,5.0,5.0,0.2,Democrat,Clinton
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,15.0,3.0,1.0,3.0,3.0,3.0,-0.2,Democrat,Other


In [9]:
%%time

baseline_reg = BaselineRegressor().fit(X, y)

linear_reg = make_pipeline(
    Preprocessor(),
    LinearRegression()
).fit(X, y)

ml_reg = AutoRegressor(preprocessors=Preprocessor(), n_iter=2, max_ensemble_size=10, n_jobs=-1)
ml_reg = ml_reg.fit(X, y).best_estimator_

estimators = [
    ('Baseline', baseline_reg), 
    ('Linear', linear_reg), 
    ('Machine learning', ml_reg)
]


Tuning estimator 1 of 18: RandomForestRegressorCV
Best estimator score: 0.1331

Tuning estimator 2 of 18: PCARandomForestRegressorCV
Best estimator score: 0.0061

Tuning estimator 3 of 18: LassoLarsCV
Best estimator score: -0.0121

Tuning estimator 4 of 18: PCALassoLarsCV
Best estimator score: -0.0121

Tuning estimator 5 of 18: RidgeCV
Best estimator score: -0.0816

Tuning estimator 6 of 18: PCARidgeCV
Best estimator score: 0.0654

Tuning estimator 7 of 18: ElasticNetCV
Best estimator score: -0.0121

Tuning estimator 8 of 18: PCAElasticNetCV
Best estimator score: -0.0121

Tuning estimator 9 of 18: KernelRidgeCV
Best estimator score: 0.1347

Tuning estimator 10 of 18: PCAKernelRidgeCV
Best estimator score: 0.0905

Tuning estimator 11 of 18: SVRCV
Best estimator score: 0.0395

Tuning estimator 12 of 18: PCASVRCV
Best estimator score: 0.0676

Tuning estimator 13 of 18: KNeighborsRegressorCV
Best estimator score: 0.0662

Tuning estimator 14 of 18: PCAKNeighborsRegressorCV
Best estimator s

In [72]:
%%time

from copy import deepcopy

from scipy.stats import t
from sklearn.base import is_classifier
from sklearn.model_selection import KFold, check_cv, cross_val_score

def repeated_cv_test(estimators, X, y, repetitions=5, cv=2, scoring=None, n_jobs=None):
    def collect_dataframe():
        names = [
            {
                'Estimator1': estimators[i][0],
                'Estimator2': estimators[j][0]
            }
            for i in range(len(estimators))
            for j in range(i+1, len(estimators))
        ]
        df = pd.DataFrame(names)
        df['PerformanceDifference'] = pairwise_diff
        df['t-stat'] = t_stats
        df['p-value'] = p_vals
        return df
        
    cv = check_cv(cv, classifier=is_classifier(estimators[0][1]))
    if hasattr(cv, 'shuffle'):
        cv.shuffle = True
    pairwise_diff = None
    pairwise_var = []
    for i in range(repetitions):
        cv.random_state = np.random.RandomState()
        scores = np.array([
            cross_val_score(est, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
            for _, est in estimators
        ]).T
        if pairwise_diff is None:
            pairwise_diff = np.array([
                scores[0][i]-scores[0][j] 
                for i in range(len(estimators)) 
                for j in range(i+1, len(estimators))
            ])
        pairwise_var.append([
            (scores[:,i]-scores[:,j]).var()
            for i in range(len(estimators))
            for j in range(i+1, len(estimators))
        ])
    pairwise_var = np.array(pairwise_var).mean(axis=0)
    t_stats = pairwise_diff / np.sqrt(pairwise_var)
    p_vals = [t.sf(abs(t_stat), repetitions)*2 for t_stat in t_stats]
    return collect_dataframe()
            
results_df = repeated_cv_test(estimators, X, y, repetitions=20, n_jobs=-1)
results_df

Wall time: 17.2 s


Unnamed: 0,Estimator1,Estimator2,PerformanceDifference,t-stat,p-value
0,Baseline,Linear,0.291604,5.537085e-09,1.0
1,Baseline,Machine learning,-0.065043,-3.435684,0.002617
2,Linear,Machine learning,-0.356647,-6.772144e-09,1.0


In [None]:
%%time

from dask import delayed
from sklearn.metrics import r2_score
from ml_inference.test import run_pairwise_tests

def oos_test(estimators, X, y, parallel=True):
    def compute_error(i):
        return (y-estimators[i][1].predict(X))**2
    
    f = (
        delayed(compute_error) if parallel 
        else compute_error
    )
    errors = {name: f(i) for i, (name, _) in enumerate(estimators)}
    df = (
        delayed(pd.DataFrame)(errors).compute() if parallel
        else pd.DataFrame(errors)
    )
    return df, run_pairwise_tests(df)

score_df, results_df = oos_test(estimators, X_test, y_test)
results_df

In [None]:
score_df.describe()

In [None]:
gen_score_plot(score_df, showfliers=False)

In [None]:
%%time

from sklearn.metrics import r2_score

scores, pairwise_tests = cv_test(estimators, X_train, y_train, scorer=r2_score, parallel=True)
ax = gen_score_plot(scores, showfliers=False)
pairwise_tests

In [None]:
def cv_test(estimators, X, y, scorer, repeat=10, cv=10, parallel=True):
    """
    Prepares a dataframe for a cross validation test.

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        List of (estimator name, estimator) tuples. The estimator must
        implement `fit` and `predict` methods.

    X : pandas.DataFrame or numpy.array
        Features.

    y : pandas.DataFrame, pandas.Series, or numpy.array
        Targets.

    scorer : callable
        Takes the true and predicted target values and returns a score.

    repeat : int, default=10
        Number of repetitions.

    cv : int, default=10
        Number of folds to cross-validate. TODO: sklearn-style cv parameter.

    parallel: bool, default=True
        Run the CV test using parallel processing. Set to `False` to force a
        single process.

    Returns
    -------
    scores, pairwise_tests : pd.DataFrame, pd.DataFrame
        `scores` is the average cross-validation score for each repetition 
        organized by estimator. `pairwise_tests` is a dataframe of pairwise
        t-tests for each estimator.
    """
    def compute_cv_score(kf):
        # compute CV score for a given split
        scores = {name: [] for name, _ in estimators}
        for train_idx, test_idx in kf.split(X):
            compute_fold_score(scores, train_idx, test_idx)
        return {
            name: np.array(fold_scores).mean() 
            for name, fold_scores in scores.items()
        }
    
    def compute_fold_score(scores, train_idx, test_idx):
        # compute score for a given fold
        f = delayed(compute_estimator_score) if parallel else compute_estimator_score
        for i, (name, _) in enumerate(estimators):
            scores[name].append(f(i, train_idx, test_idx))
    
    def compute_estimator_score(estimator_idx, train_idx, test_idx):
        # compute score for a given estimator and fold
        est = estimators[estimator_idx][1]
        if parallel:
            est = clone(est)
        X_train, X_test, y_train, y_test = _split(X, y, train_idx, test_idx)
        est.fit(X_train, y_train)
        return scorer(y_test, est.predict(X_test))
        
    scores = [compute_cv_score(KFold(cv, shuffle=True)) for _ in range(repeat)]
    score_df = (
        delayed(pd.DataFrame)(scores).compute() if parallel
        else pd.DataFrame(scores)
    )
    return score_df, run_pairwise_tests(score_df)

In [None]:
from sklearn.metrics import r2_score
    
def prefix_group(prefix):
    return [col for col in X.columns if col.startswith(prefix)]
    
perf_groups = [
    ('Sharing content', prefix_group('SharingType')),
    ('Social media use', prefix_group('SocialMedia')),
    ('CRT', prefix_group('CRT')),
    ('Scientific knowledge', prefix_group('sci')),
    ('Medical maximizing', prefix_group('mms')),
    ('News media', prefix_group('Media')),
    ('Ethnicity', prefix_group('Ethnicity')),
    ('Politics', ['Partisan', 'Social_Conserv', 'Economic_Conserv', 'Party', 'POTUS2016'])
]
ml_reg.fit(X, y)
df, ax = explain_performance(ml_reg.predict, X, y, metric=r2_score, nsamples=2**5, groups=perf_groups)
ax.set(xlabel='Amount of explained variance')
df

In [None]:
predict_groups = [
    ('Sharing content', prefix_group('SharingType')),
    ('Social media use', prefix_group('SocialMedia')),
    ('CRT', prefix_group('CRT')),
    ('Scientific knowledge', prefix_group('sci')),
    ('Medical maximizing', prefix_group('mms')),
    ('Ethnicity', prefix_group('Ethnicity')),
]
df = explain_predictions(ml_reg.predict, X, y, nsamples=2**5, groups=predict_groups)

In [None]:
mms = X[[col for col in X.columns if col.startswith('mms')]].sum(axis=1)
ax = sns.scatterplot(x=mms, y=df['Medical maximizing'])
# indicates that some MMS questions are much more diagnostic of sharing discernment than others
ax.set(xlabel='Medical maximizing score', ylabel='Effect on prediction')

In [None]:
media_values = list(range(1, 6))
media_vars = [
    ('Media3_1', 'national news'),
    ('Media3_2', 'local news'),
    ('Media3_12', 'fact checkers'),
    ('Media3_3', 'friends and family for news'),
    ('Media3_11', 'social media for news')
]
for var, label in media_vars:
    mask = X[var].isin(media_values)
    ax = sns.boxplot(x=X[var][mask], y=df[var][mask], showfliers=False)
    ax.set(xlabel='Trust in '+label, ylabel='Effect on prediction', ylim=(-.15, .15))
    plt.show()

In [None]:
X_preproc = Preprocessor(X).transform(X)
cols = [col for col in X_preproc.columns if col.startswith('CRT') and col.endswith('intuit')]
crt = X_preproc[cols].sum(axis=1)
ax = sns.violinplot(x=crt, y=df.CRT)
ax.set(xlabel='CRT intuitive responses', ylabel='Effect of CRT on prediction')

In [None]:
candidates = ['Trump', 'Other', 'Clinton']
mask = X.POTUS2016.isin(candidates)
ax = sns.boxplot(x=X.POTUS2016[mask], y=df.POTUS2016[mask], showfliers=False, order=candidates)
ax.set(xlabel='Voted for in 2016', ylabel='Effect on prediction')

In [None]:
mask = X.Party.isin(['Republican', 'Democrat', 'Independent'])
ax = sns.boxplot(x=X.Party[mask], y=df.Party[mask], showfliers=False)
ax.set(xlabel='Political party', ylabel='Effect on prediction')

In [None]:
partisan_vals = list(range(1, 7))
mask = X.Partisan.isin(partisan_vals)
ax = sns.boxplot(x=X.Partisan[mask], y=df.Partisan[mask], showfliers=False)
ax.set(xlabel='<= Democrat, Republican =>', ylabel='Effect on prediction')

In [None]:
sci = X[[col for col in X.columns if col.startswith('sci')]].sum(axis=1)
ax = sns.scatterplot(x=sci, y=df['Scientific knowledge'])
ax.set(xlabel='Scientific knowledge', ylabel='Effect on prediction')

In [None]:
ax = sns.scatterplot(x=X.COVID_concern, y=df['COVID_concern'])
ax.set(xlabel='Concern about COVID-19', ylabel='Effect on prediction')

In [None]:
ax = sns.scatterplot(x=X.Age, y=df.Age)
ax.set(ylabel='Effect on prediction')

In [None]:
ax = sns.boxplot(x=X.COVID_news, y=df.COVID_news, showfliers=False)
ax.set(xlabel='How often you check COVID-19 news', ylabel='Effect on prediction')

In [None]:
mask = X.Education.isin(list(range(21)))
ax = sns.boxplot(x=X[mask].Education, y=df[mask].Education, showfliers=False)
ax.set(xlabel='Years of education', ylabel='Effect on prediction')
ax.set_xticks(list(range(0, 21, 4)))

In [None]:
features = ['COVID_concern', 'COVID_news', 'Education', 'Income']
corr_df, plots = explain_correlations(features, ml_reg.predict, X, y, groups=perf_groups, nsamples=2**5)
corr_df

In [None]:
X_preproc['Scientific knowledge'] = sci
X_preproc['MMS'] = mms
X_preproc['Diff'] = y
X_preproc[['Education', 'Scientific knowledge', 'Media3_1', 'Media3_3', 'Media3_11', 'Income', 'Party_Democrat', 'MMS', 'Diff']].corr()