In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import numpy as np
import json
import scienceplots
from scipy.stats import ks_2samp

In [None]:
# Matplotlib rcParams overrides that are layered on top of the style sheets below.
tex_fonts = {
    # Route all figure text through LaTeX.
    "text.usetex": True,
    # Base font sizes.
    "axes.labelsize": 12,
    "font.size": 12,
    # Smaller sizes for legend entries and tick labels.
    "legend.fontsize": 6,
    "xtick.labelsize": 6,
    "ytick.labelsize": 6,
}

# Order matters: load the style sheets first, then apply the overrides so they win.
plt.style.use(['science', 'nature'])
plt.rcParams.update(tex_fonts)

In [None]:
# Per-video ideology scores, turned into a video_id -> score lookup table.
# If a video_id occurs more than once in the CSV, the last row wins.
ideo = pd.read_csv('data/video_ideology_scores.csv')
ideology_scores = dict(zip(ideo['video_id'], ideo['score']))

In [None]:
# Survey export in SPSS format; later cells read its `ideo5` and `session_visa` columns.
survey = pd.read_spss('data/AMST0001_OUTPUT_w1andw2.sav')

In [None]:
# Set of `visaId` values listed in data/visas.csv; used below to keep only
# log rows whose `visaId` appears in this set.
visa_ids = set(pd.read_csv('data/visas.csv')['visaId'])

In [None]:
def jsonify(js):
    """Decode a JSON string, passing missing values through as ``None``.

    Parameters
    ----------
    js : str or missing marker
        JSON text, or one of the missing-value markers ``None``, float NaN
        or ``pd.NA``.

    Returns
    -------
    The decoded Python object, or ``None`` when ``js`` is a missing marker.

    Raises
    ------
    json.JSONDecodeError
        If ``js`` is a string that is not valid JSON.
    """
    # Depending on dtype, pandas represents a missing cell as None, NaN or pd.NA.
    # `js != js` is the dependency-free NaN test.
    if js is None or js is pd.NA or (isinstance(js, float) and js != js):
        return None
    return json.loads(js)

In [None]:
# Interaction logs, restricted to known visa ids and to the 'background'
# intervention, with the two JSON-encoded columns decoded.
# Built as a single chain so that the decoded columns are attached to a fresh
# frame via `.assign` rather than written onto a filtered slice (which is the
# pattern that triggers pandas' SettingWithCopyWarning).
full_log = (
    pd.read_parquet('data/logs.parquet')
    .loc[lambda df: df['visaId'].isin(visa_ids) & (df['intervention'] == 'background')]
    .assign(
        homepage_content=lambda df: df['homepage_content'].map(jsonify),
        upnext_content=lambda df: df['upnext_content'].map(jsonify),
    )
)

In [None]:
def extract_video_id(url):
    """Return the 11 characters that follow the first ``v=`` in ``url``.

    Parameters
    ----------
    url : str or any
        Candidate URL. Non-string values (e.g. ``None`` or NaN) are accepted
        and yield ``None``.

    Returns
    -------
    str or None
        The 11-character capture, or ``None`` when ``url`` is not a string or
        contains no ``v=`` followed by at least 11 non-newline characters.
    """
    # Explicit type check instead of a bare `except`, so unrelated errors
    # (and KeyboardInterrupt) are no longer swallowed.
    if not isinstance(url, str):
        return None
    # NOTE(review): the pattern accepts *any* 11 characters after "v=", not only
    # [A-Za-z0-9_-]; left unchanged so existing results are not altered.
    match = re.search(r'v=(.{11})', url)
    # The capture group is exactly 11 characters by construction, so no
    # further length/emptiness check is needed.
    return match.group(1) if match else None

def extract_video_ids(urls):
    """Return the set of video ids found in an iterable of URLs.

    Each URL is passed to ``extract_video_id``; URLs for which it returns
    ``None`` contribute nothing to the result.
    """
    candidates = (extract_video_id(url) for url in urls)
    return {video_id for video_id in candidates if video_id is not None}
    
def get_watched_videos(logs):
    """Return the set of video ids found in the ``url`` column of ``logs``.

    Rows flagged by ``is_injected`` are excluded first.
    Assumes ``is_injected`` is a boolean column — TODO confirm.
    """
    organic_rows = logs[~logs['is_injected']]
    return extract_video_ids(set(organic_rows['url']))

def get_homepage_videos(logs):
    """Return the set of video ids found in the ``homepage_content`` column.

    Missing entries are dropped; every remaining entry is iterated and its
    items are treated as URLs (presumably a list of URL strings — verify
    against the log schema).
    """
    unique_urls = {
        url
        for entry in logs['homepage_content'].dropna()
        for url in entry
    }
    return extract_video_ids(unique_urls)

def get_upnext_videos(logs):
    """Return the set of video ids found in the ``upnext_content`` column.

    Missing entries are dropped; every remaining entry is iterated and its
    items are treated as URLs (presumably a list of URL strings — verify
    against the log schema).
    """
    unique_urls = {
        url
        for entry in logs['upnext_content'].dropna()
        for url in entry
    }
    return extract_video_ids(unique_urls)

In [None]:
def get_scores(videoIds):
    """Return the ideology scores of the given video ids as a list.

    Ids that are absent from ``ideology_scores`` (or map to ``None``) are skipped.
    """
    looked_up = (ideology_scores.get(video_id) for video_id in videoIds)
    return [score for score in looked_up if score is not None]

In [None]:
def map_ideology(p):
    """Map the answer ``'Not sure'`` to ``'Moderate'``; return anything else unchanged."""
    return 'Moderate' if p == 'Not sure' else p

In [None]:
# Derived ideology column in which 'Not sure' answers are folded into 'Moderate'.
# NOTE(review): the later cells visible in this notebook select respondents on
# survey['ideo5'], not on this mapped column — confirm which one is intended.
survey['ideo5_mapped'] = survey['ideo5'].map(map_ideology)

In [None]:
# Ideology groups, in the order used for figure panels and table rows.
ideologies = ['Very liberal', 'Liberal', 'Moderate', 'Conservative', 'Very conservative']
# Letters available as panel captions.
# NOTE(review): not referenced by the cells visible here (the figure cell zips over a literal 'abcdef').
caption = 'abcdefghijklmno'
# Colour cycle of the active Matplotlib style; the plots use colors[0] for
# watched videos and colors[1] for homepage videos.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
def make_ideology_plot(ideo, ax, cap):
    """Draw watched-vs-homepage ideology-score violins for one survey group.

    For each intervention stage (Post, Mid, Pre from bottom to top) two
    horizontal violins are drawn: scores of watched videos in ``colors[0]``
    and scores of homepage videos in ``colors[1]``. The mean of each
    distribution is printed next to its violin.

    Parameters
    ----------
    ideo : value compared against ``survey['ideo5']`` to select respondents.
    ax : Matplotlib axes to draw on.
    cap : panel letter; drawn upper-cased above the top-left corner of the axes.

    Returns
    -------
    The same ``ax``.

    NOTE(review): respondents are selected on the raw ``ideo5`` column, not on
    ``ideo5_mapped`` — confirm that 'Not sure' answers are meant to be left out.
    """
    participant_ids = survey.loc[survey['ideo5'] == ideo, 'session_visa']
    group_log = full_log[full_log['visaId'].isin(participant_ids)]

    # Vertical slots: watched violins on even rows, homepage violins on odd rows.
    watched_rows = [0, 2, 4]
    homepage_rows = [1, 3, 5]

    watched_scores = []
    homepage_scores = []
    for stage in ('POST', 'MID', 'PRE'):
        stage_log = group_log[group_log['intervention_stage'] == stage]
        stage_log = stage_log[stage_log['intervention'] == 'background']
        watched_scores.append(get_scores(get_watched_videos(stage_log)))
        homepage_scores.append(get_scores(get_homepage_videos(stage_log)))

    violin_options = dict(vert=False, showmeans=True, showmedians=False, showextrema=True)
    watched_violins = ax.violinplot(watched_scores, watched_rows, **violin_options)
    homepage_violins = ax.violinplot(homepage_scores, homepage_rows, **violin_options)

    series = (
        (watched_violins, watched_rows, watched_scores, colors[0]),
        (homepage_violins, homepage_rows, homepage_scores, colors[1]),
    )

    # Translucent, borderless violin bodies in the series colour.
    for violins, _, _, colour in series:
        for body in violins['bodies']:
            body.set_alpha(0.3)
            body.set_facecolor(colour)
            body.set_edgecolor('none')

    # Print each distribution's mean slightly above its violin's centre line.
    for _, rows, datasets, colour in series:
        for y, data in zip(rows, datasets):
            mean = np.mean(data)
            ax.annotate('%.2f' % mean, (mean, y + 0.2), color=colour, fontsize=6)

    # Dashed separators between the three stage bands.
    for y in (1.5, 3.5):
        ax.plot([-2.5, 2.5], [y, y], ls='dashed', lw=0.5, color='black')

    ax.set_yticks([0.5, 2.5, 4.5])
    ax.set_yticklabels(['Post', 'Mid', 'Pre'], fontsize=6)

    ax.set_xlim((-2.1, 2.1))
    ax.annotate(cap.upper(), (-0.125, 1.05), annotation_clip=False, fontsize=9, xycoords='axes fraction')

    return ax

In [None]:
import matplotlib.gridspec as gridspec

In [None]:
# Figure: one violin panel per ideology group, three panels over two.
# NOTE(review): `figw` and `figh` were never assigned anywhere in this notebook,
# so this cell raised NameError on a fresh kernel. The active style's default
# figure size is used as the base here — confirm it matches the intended size.
figw, figh = plt.rcParams['figure.figsize']

fig = plt.figure(figsize=(figw * 1.75, figh * 1.5), dpi=300)
gs = fig.add_gridspec(2, 6)

# Every panel spans two of the six grid columns; the bottom row starts one
# column in so that its two panels sit centred under the top three.
axes = [
    fig.add_subplot(gs[0, 0:2]),
    fig.add_subplot(gs[0, 2:4]),
    fig.add_subplot(gs[0, 4:6]),
    fig.add_subplot(gs[1, 1:3]),
    fig.add_subplot(gs[1, 3:5]),
]

# The loop variable is deliberately not called `ideo`: at module level that
# name holds the ideology-score DataFrame and must not be overwritten.
for ideology, ax, cap in zip(ideologies, axes, 'abcdef'):
    make_ideology_plot(ideology, ax, cap)

# Shared legend; entries are listed as Recommended first, then Watched.
fig.legend(
    [
        mpatches.Patch(color=colors[1], alpha=0.3, ec='none'),
        mpatches.Patch(color=colors[0], alpha=0.3, ec='none'),
    ],
    ['Recommended', 'Watched'],
    loc='lower center', fontsize=5, ncols=2
)

fig.tight_layout()
outfile = 'figures/ideology-distribution.pdf'
fig.savefig(outfile, transparent = True, bbox_inches = 'tight', pad_inches = 0.1, dpi=300)

In [None]:
def kstest(a, b, p_=0.05):
    """Run a two-sample Kolmogorov-Smirnov test and format the statistic.

    Parameters
    ----------
    a, b : the two samples handed to ``scipy.stats.ks_2samp``.
    p_ : significance threshold for the p-value.

    Returns
    -------
    str
        The KS statistic with three decimals, followed by ``*`` when the
        p-value is below ``p_``.
    """
    statistic, p_value = ks_2samp(a, b)
    template = '%.3f*' if p_value < p_ else '%.3f'
    return template % statistic

In [None]:
def compute_significance_pre_mid_post(ideo):
    """Compare score distributions between every pair of intervention stages.

    Parameters
    ----------
    ideo : value compared against ``survey['ideo5']`` to select respondents.

    Returns
    -------
    dict
        ``result[stage1][stage2]`` is a 2-tuple ``(watched, recommended)`` of
        ``kstest`` outputs comparing stage1 with stage2, for stages
        'PRE', 'MID' and 'POST'. "Recommended" refers to homepage videos.

    NOTE(review): respondents are selected on the raw ``ideo5`` column, not on
    ``ideo5_mapped`` — confirm that this is intended.
    """
    stages = ['PRE', 'MID', 'POST']

    visaIds = survey[survey['ideo5'] == ideo]['session_visa']
    partial_log = full_log[full_log['visaId'].isin(visaIds)]

    # The score lists depend only on the stage, so build them once per stage
    # instead of once for every (stage1, stage2) pair.
    watched = {}
    recommended = {}
    for stage in stages:
        stage_logs = partial_log[partial_log['intervention_stage'] == stage]
        stage_logs = stage_logs[stage_logs['intervention'] == 'background']
        watched[stage] = get_scores(get_watched_videos(stage_logs))
        recommended[stage] = get_scores(get_homepage_videos(stage_logs))

    return {
        stage1: {
            stage2: (
                kstest(watched[stage1], watched[stage2]),
                kstest(recommended[stage1], recommended[stage2]),
            )
            for stage2 in stages
        }
        for stage1 in stages
    }

In [None]:
# Emit the LaTeX body of the stage-vs-stage KS table: for every ideology, one
# "Watched" and one "Recommended" row per stage, each with the PRE/MID/POST columns.
for ideology in ideologies:
    test_results = compute_significance_pre_mid_post(ideology)
    for stage in ('PRE', 'MID', 'POST'):
        by_stage = test_results[stage]
        watched_cells = tuple(by_stage[other][0] for other in ('PRE', 'MID', 'POST'))
        recommended_cells = tuple(by_stage[other][1] for other in ('PRE', 'MID', 'POST'))

        # The ideology label is printed once, in front of the group's first row.
        if stage == 'PRE':
            print('\\hline \\multirow{6}{*}{%s} ' % ideology, end='')
        print('& \\multirow{2}{*}{%s} & %s & %s & %s & %s \\\\\n\\cline{3-6}' % ((stage.capitalize(), 'Watched') + watched_cells))
        print('& & %s & %s & %s & %s \\\\\n\\cline{2-6}' % (('Recommended',) + recommended_cells))

In [None]:
def compute_significance_watched_recommended(ideo):
    """Return per-stage sample sizes of watched and homepage score lists.

    Despite the name, no significance test is run here: the KS comparison of
    watched vs. recommended scores is disabled and the sizes of the two score
    lists are returned instead.

    Parameters
    ----------
    ideo : value compared against ``survey['ideo5']`` to select respondents.

    Returns
    -------
    dict
        Maps 'POST', 'MID' and 'PRE' to ``(n_watched, n_recommended)``.

    NOTE(review): respondents are selected on the raw ``ideo5`` column, not on
    ``ideo5_mapped`` — confirm that this is intended.
    """
    participant_ids = survey.loc[survey['ideo5'] == ideo, 'session_visa']
    group_log = full_log[full_log['visaId'].isin(participant_ids)]

    sizes = {}
    for stage in ('POST', 'MID', 'PRE'):
        in_stage = group_log['intervention_stage'] == stage
        in_background = group_log['intervention'] == 'background'
        stage_log = group_log[in_stage & in_background]

        watched = get_scores(get_watched_videos(stage_log))
        recommended = get_scores(get_homepage_videos(stage_log))

        # Sample sizes only; a kstest(watched, recommended) call would go here.
        sizes[stage] = (len(watched), len(recommended))

    return sizes

In [None]:
# Print one LaTeX table row per ideology; each cell is the
# (watched, recommended) sample-size tuple for PRE, MID and POST.
for ideology in ideologies:
    test_results = compute_significance_watched_recommended(ideology)
    # '\\hline' is escaped explicitly: a bare '\h' in a normal string literal is
    # an invalid escape sequence. The printed text is unchanged.
    print('%s & %s & %s & %s \\\\\n\\hline' % (ideology, test_results['PRE'], test_results['MID'], test_results['POST']))