In [None]:
import pandas as pd
from util import fetch_metadata_using_api
import re
import json
from parser import get_participants
from collections import Counter
import matplotlib.pyplot as plt
from urllib.parse import unquote
import numpy as np
import scienceplots

In [None]:
# Matplotlib rc overrides applied on top of the style sheets below.
# "text.usetex" asks matplotlib to typeset figure text with LaTeX.
tex_fonts = {
    "text.usetex": True,
}

# NOTE(review): the 'science'/'nature'/'grid' styles are presumably registered
# by the `scienceplots` import at the top of the notebook -- confirm.
plt.style.use(['science', 'nature', 'grid'])
plt.rcParams.update(tex_fonts)
# Default figure width/height after styling; reused to size figures later.
figw, figh = plt.rcParams['figure.figsize']

In [None]:
# Survey data loaded from an SPSS file.
# NOTE(review): `survey` is not referenced in the visible part of the notebook.
survey = pd.read_spss('data/AMST0001_OUTPUT_w1andw2.sav')

In [None]:
# Set of `visaId` values; used below to restrict the browsing logs.
visa_ids = set(pd.read_csv('data/visas.csv')['visaId'])

In [None]:
# Set of channel ids treated as news channels (consumed by `is_news`).
news_ids = set(pd.read_csv('data/final_news_ids.csv')['channel_id'])
# Cell output: number of distinct news channel ids.
len(news_ids)

In [None]:
# Per-video political classification, looked up by video id in `is_political`.
# Use a context manager so the file handle is closed deterministically instead
# of being left to the garbage collector.
with open('cache/political_classifications.json') as clf_file:
    pol_clf = json.load(clf_file)

participants = pd.read_parquet('data/participants.parquet')
# User ids of participants that have a non-null `finishTime`.
finished_participants = set(participants.dropna(subset='finishTime')['userId'])

In [None]:
def jsonify(js):
    """Decode a JSON document, passing ``None`` through unchanged.

    Args:
        js: A JSON-encoded string (anything ``json.loads`` accepts) or ``None``.

    Returns:
        The decoded Python object, or ``None`` when ``js`` is ``None``.
    """
    return None if js is None else json.loads(js)

In [None]:
# Browsing logs restricted to the known visa ids.
full_log = pd.read_parquet('data/logs.parquet')
# Take an explicit copy of the filtered frame: the column assignments below
# would otherwise be made on the result of a boolean selection, which pandas
# may flag with SettingWithCopyWarning.
full_log = full_log[full_log['visaId'].isin(visa_ids)].copy()
# The two recommendation columns are stored as JSON text; decode them
# (None stays None, see `jsonify`).
full_log['homepage_content'] = full_log['homepage_content'].map(jsonify)
full_log['upnext_content'] = full_log['upnext_content'].map(jsonify)

In [None]:
# Video metadata; `is_news` uses it as a mapping from video id to a dict that
# may carry a 'channel_id' key.
# NOTE(review): called with an empty id list -- presumably this only returns
# already-cached metadata; confirm against util.fetch_metadata_using_api.
metadata = fetch_metadata_using_api([])

In [None]:
def extract_video_id(url):
    """Return the 11 characters following the first ``v=`` in ``url``.

    Args:
        url: The URL to inspect. Non-string values (``None``, NaN, bytes, ...)
            are tolerated and yield ``None``.

    Returns:
        The 11-character string captured after ``v=``, or ``None`` when the
        input is not a string or contains no such match.
    """
    # Explicit checks replace the former bare ``except: pass``, which also
    # swallowed KeyboardInterrupt/SystemExit and masked unrelated errors.
    if not isinstance(url, str):
        return None
    match = re.search(r'v=(.{11})', url)
    if match is None:
        return None
    # The pattern guarantees exactly 11 captured characters, so no further
    # length/emptiness validation is needed.
    return match.group(1)

def extract_video_ids(urls):
    """Map URLs to video ids, dropping URLs that yield no id.

    Args:
        urls: Iterable of URLs passed one by one to ``extract_video_id``.

    Returns:
        List of extracted ids in input order; duplicates are kept.
    """
    candidates = (extract_video_id(url) for url in urls)
    return [video_id for video_id in candidates if video_id is not None]
    
def get_watched_videos(logs):
    """Return the video ids found in the ``url`` column of ``logs``.

    Args:
        logs: Table-like object whose ``'url'`` column is iterable.

    Returns:
        List of video ids, one per URL from which an id could be extracted.
    """
    return extract_video_ids(list(logs['url']))

def get_homepage_videos(logs):
    """Return the video ids found in the ``homepage_content`` column.

    Null entries are skipped; every remaining entry is iterated and each of
    its items is treated as a URL.

    Args:
        logs: DataFrame with a ``'homepage_content'`` column.

    Returns:
        List of video ids extracted from all homepage URLs.
    """
    homepage_urls = [
        url
        for entry in logs['homepage_content'].dropna()
        for url in entry
    ]
    return extract_video_ids(homepage_urls)

def get_upnext_videos(logs):
    """Return the video ids found in the ``upnext_content`` column.

    Null entries are skipped; every remaining entry is iterated and each of
    its items is treated as a URL.

    Args:
        logs: DataFrame with an ``'upnext_content'`` column.

    Returns:
        List of video ids extracted from all up-next URLs.
    """
    upnext_urls = [
        url
        for entry in logs['upnext_content'].dropna()
        for url in entry
    ]
    return extract_video_ids(upnext_urls)

In [None]:
def is_news(vId):
    """Tell whether the video's channel id is in ``news_ids``.

    Videos missing from ``metadata`` (or lacking a ``'channel_id'``) are
    checked with a channel id of ``None``.
    """
    video_meta = metadata.get(vId, {})
    channel_id = video_meta.get('channel_id', None)
    return channel_id in news_ids

def is_political(vId):
    """Return the ``pol_clf`` entry for ``vId``, or ``False`` if there is none."""
    label = pol_clf.get(vId, False)
    return label

In [None]:
# The three mutually exclusive labels produced by `classify_video`.
CLASSIFICATIONS = ['News', 'Political non-news', 'Other']

def classify_video(vId):
    """Assign one of the ``CLASSIFICATIONS`` labels to a video id.

    News takes precedence: a video from a news channel is 'News' regardless
    of its political classification. Non-news videos are 'Political non-news'
    when ``is_political`` is truthy and 'Other' otherwise.
    """
    if is_news(vId):
        return 'News'
    return 'Political non-news' if is_political(vId) else 'Other'

In [None]:
# Every video id seen in the logs: watched first, then homepage, then up-next
# recommendations (duplicates are kept).
videos = [
    *get_watched_videos(full_log),
    *get_homepage_videos(full_log),
    *get_upnext_videos(full_log),
]

In [None]:
# Split the collected ids by label in a single pass (order and duplicates kept).
news_videos = []
pnn_videos = []
for video_id in videos:
    label = classify_video(video_id)
    if label == 'News':
        news_videos.append(video_id)
    elif label == 'Political non-news':
        pnn_videos.append(video_id)

In [None]:
# Lookup table: video id -> score, built from the `video_id` and `score` columns.
ideo = pd.read_csv('data/video_ideology_scores.csv')
ideology_scores = dict(zip(ideo['video_id'], ideo['score']))

In [None]:
# Fraction of news videos (with repeats) that have an ideology score.
sum(1 for video in news_videos if video in ideology_scores) / len(news_videos)

In [None]:
# Fraction of political non-news videos (with repeats) that have an ideology score.
sum(1 for video in pnn_videos if video in ideology_scores) / len(pnn_videos)

# Changes to the User

In [None]:
# Alias of `full_log` (same object, no copy); the figure cells below read
# their logs from this name.
partial_log = full_log

In [None]:
# Values of the `intervention` log column, one figure panel each.
interventions = ['background', 'banner', 'control']
# Display names, index-aligned with `interventions`.
# NOTE(review): only referenced from commented-out axis-label code below.
final_names = ['Algorithmic nudge', 'User nudge', 'Control']

In [None]:
# Per-intervention daily percentages, filled in by the figure cell that follows.
perc = {}

In [None]:
# Daily share (in percent) of 'News' videos among watched and recommended
# videos, one panel per intervention. Injected log rows are excluded.
fig, axes = plt.subplots(1, 3, dpi=300, figsize=(figw * 2.5, figh*0.75))
caption = 'abcdef'  # panel letters; zip() stops after one per intervention
for intervention, ax, cap in zip(interventions, axes, caption):

    x = []
    y = {'recommended': [], 'watched': []}

    logs = partial_log
    logs = logs[logs['intervention'] == intervention]
    logs = logs[~logs['is_injected']]

    for day in range(28):
        # Filter once per day instead of once per extractor call.
        day_logs = logs[logs['day'] == day]
        watched = get_watched_videos(day_logs)
        recommended = get_homepage_videos(day_logs)
        recommended.extend(get_upnext_videos(day_logs))

        clf_watched = [classify_video(vId) for vId in watched]
        clf_recommended = [classify_video(vId) for vId in recommended]

        counts_watched = Counter(clf_watched)
        counts_recommended = Counter(clf_recommended)

        # A day without any videos used to raise ZeroDivisionError; record NaN
        # instead so the line simply has a gap for that day.
        total_watched = sum(counts_watched.values())
        total_recommended = sum(counts_recommended.values())
        y['watched'].append(
            counts_watched.get('News', 0) / total_watched * 100
            if total_watched else float('nan')
        )
        y['recommended'].append(
            counts_recommended.get('News', 0) / total_recommended * 100
            if total_recommended else float('nan')
        )

        x.append(day)

    ax.plot(x, y['watched'], label='Watched', marker='.')
    ax.plot(x, y['recommended'], label='Recommended', marker='.')

    # Label week boundaries rather than individual days.
    ax.set_xticks([0, 7, 14, 21, 28])
    ax.set_xticklabels(['', 'W1', 'W2', 'W3', ''])
    ax.set_ylim((0, 25))
    ax.set_xlabel('Day')
    # ax.set_xlabel('Day\n\n\\textbf{%s)} %s' % (cap, final_names[interventions.index(intervention)]))
    ax.legend(fontsize=6)
    ax.set_ylabel('Percentage')
    # Panel letter (A, B, C) placed just outside the top-left of the axes.
    ax.annotate(cap.upper(), (-0.125, 1.05), annotation_clip=False, fontsize=10, xycoords='axes fraction')

    # Keep the series for later tabulation.
    perc[intervention] = {}
    perc[intervention]['watched'] = y['watched']
    perc[intervention]['recommended'] = y['recommended']

fig.tight_layout()
fig.savefig('figures/percentage-news.pdf', transparent = True, bbox_inches = 'tight', pad_inches = 0.1, dpi=300)

In [None]:
# Reset the accumulator: values stored by the previous figure cell are
# discarded, and the figure cell that follows fills it again.
perc = {}

In [None]:
# Daily share (in percent) of 'Political non-news' videos among watched and
# recommended videos, one panel per intervention. Injected rows are excluded.
fig, axes = plt.subplots(1, 3, dpi=300, figsize=(8, 2))
caption = 'abc'  # panel letters, one per intervention
for intervention, ax, cap in zip(interventions, axes, caption):

    x = []
    y = {'recommended': [], 'watched': []}

    logs = partial_log
    logs = logs[logs['intervention'] == intervention]
    logs = logs[~logs['is_injected']]

    for day in range(28):
        # Filter once per day instead of once per extractor call.
        day_logs = logs[logs['day'] == day]
        watched = get_watched_videos(day_logs)
        recommended = get_homepage_videos(day_logs)
        recommended.extend(get_upnext_videos(day_logs))

        clf_watched = [classify_video(vId) for vId in watched]
        clf_recommended = [classify_video(vId) for vId in recommended]

        counts_watched = Counter(clf_watched)
        counts_recommended = Counter(clf_recommended)

        # A day without any videos used to raise ZeroDivisionError; record NaN
        # instead so the line simply has a gap for that day.
        total_watched = sum(counts_watched.values())
        total_recommended = sum(counts_recommended.values())
        y['watched'].append(
            counts_watched.get('Political non-news', 0) / total_watched * 100
            if total_watched else float('nan')
        )
        y['recommended'].append(
            counts_recommended.get('Political non-news', 0) / total_recommended * 100
            if total_recommended else float('nan')
        )

        x.append(day)

    ax.plot(x, y['watched'], label='Watched', marker='.')
    ax.plot(x, y['recommended'], label='Recommended', marker='.')

    # Label week boundaries rather than individual days.
    ax.set_xticks([0, 7, 14, 21, 28])
    ax.set_xticklabels(['', 'W1', 'W2', 'W3', ''])
    ax.set_ylim((0, 25))
    # ax.set_xlabel('Day\n\n\\textbf{%s)} %s' % (cap, final_names[interventions.index(intervention)]))
    ax.set_xlabel('Day')
    ax.legend(fontsize=6)
    ax.set_ylabel('Percentage')
    # Panel letter (A, B, C) placed just outside the top-left of the axes.
    ax.annotate(cap.upper(), (-0.125, 1.05), annotation_clip=False, fontsize=10, xycoords='axes fraction')

    # Keep the series for the table printed afterwards.
    perc[intervention] = {}
    perc[intervention]['watched'] = y['watched']
    perc[intervention]['recommended'] = y['recommended']

fig.tight_layout()
fig.savefig('figures/percentage-pol-non-news.pdf', transparent = True, bbox_inches = 'tight', pad_inches = 0.1, dpi=300)

In [None]:
# Emit one LaTeX table row per day (1-based): for each of the three arms
# (background, banner, control) the recommended and then the watched percentage.
# The values come from whatever was last stored in `perc`.
a = perc['background']
b = perc['banner']
c = perc['control']
row_template = '%s & %.2f\\%% & %.2f\\%% & %.2f\\%% & %.2f\\%% & %.2f\\%% & %.2f\\%% \\\\'
for day in range(28):
    cells = [arm[kind][day] for arm in (a, b, c) for kind in ('recommended', 'watched')]
    print(row_template % (day+1, *cells))