In [None]:
import pandas as pd
import political_classifier
import json
import pickle
import os
import numpy as np
from util import get_engine, fetch_metadata_using_api
from datetime import datetime
from random import sample, random
from tqdm.auto import tqdm
import requests
from collections import Counter

# Set MKL_THREADING_LAYER to 'GNU' in this process's environment; subprocesses
# started afterwards inherit the value.
# NOTE(review): presumably required by the political classifier's numerical
# stack (MKL / OpenMP runtime mismatch) -- confirm and record the reason here.
os.environ['MKL_THREADING_LAYER'] = 'GNU'

In [None]:
# Directory with one JSON file per collected puppet; the file name is the puppet ID.
PUPPET_DIR = './puppets-old'

# Keep only the processed puppet IDs whose data file is actually present.
# Blank lines are dropped explicitly: an empty ID would join to PUPPET_DIR itself,
# pass a plain existence check and only fail later when the "file" is opened.
with open('processed-puppets.txt') as f:
    PUPPET_LIST = [
        p for p in (line.strip() for line in f)
        if p and os.path.isfile(os.path.join(PUPPET_DIR, p))
    ]
print('processed puppets: %s' % len(PUPPET_LIST))
print('collected puppets: %s' % len(os.listdir(PUPPET_DIR)))

In [None]:
# Load the cached video metadata. Other cells look entries up by video ID and
# read their 'title' and 'channel_id' fields.
# NOTE: pickle.load can execute arbitrary code -- only load cache files that
# were produced by a trusted run of this project.
with open('cache/metadata.pickle', 'rb') as f:
    metadata = pickle.load(f)

In [None]:
def load_puppet(puppetId):
    """Return the parsed JSON document stored for `puppetId` under PUPPET_DIR."""
    puppet_path = os.path.join(PUPPET_DIR, puppetId)
    with open(puppet_path) as puppet_file:
        return json.load(puppet_file)
    
def get_puppet_data(version, filter_injection_type, puppet_ids=None, loader=None):
    """Yield the puppets of one collection version and one injection type.

    Puppet IDs are comma-separated and come in two layouts:

    * ``trace_id,alpha,id`` (3 fields): reported with intervention type
      'news-random'; the injection level is the first three characters of the
      ``alpha`` field.
    * ``trace_id,intervention_type,injection_type,injections-suffix`` (4 fields):
      the injection level is the text before the first '-' of the last field.

    IDs with any other number of fields are skipped.

    Args:
        version: 3 keeps only IDs ending in '-v3', 4 only IDs ending in '-v4';
            any other value disables the version filter.
        filter_injection_type: injection type to keep ('alpha' or 'fixed').
        puppet_ids: iterable of puppet IDs to scan; defaults to PUPPET_LIST.
        loader: callable mapping a puppet ID to its loaded puppet dict;
            defaults to load_puppet.

    Yields:
        Tuples ``(intervention_type, injections, puppet, puppet['actions'])``.
    """
    if puppet_ids is None:
        puppet_ids = PUPPET_LIST
    if loader is None:
        loader = load_puppet

    if version == 3:
        required_suffix = '-v3'
    elif version == 4:
        required_suffix = '-v4'
    else:
        required_suffix = None

    for puppetId in puppet_ids:

        if required_suffix is not None and not puppetId.endswith(required_suffix):
            continue

        tokens = puppetId.split(',')

        if len(tokens) == 3:
            _trace_id, alpha, _id = tokens
            intervention_type = 'news-random'
            # The 3-field layout carries no explicit injection type or level.
            # Previously neither name was assigned on this branch, so the filter
            # below either raised UnboundLocalError or silently reused the values
            # of the previously parsed puppet.
            # NOTE(review): treating these IDs as 'alpha' injections keyed by the
            # truncated alpha value is inferred from the field name -- confirm.
            injection_type = 'alpha'
            injections = alpha[:3]
        elif len(tokens) == 4:
            _trace_id, intervention_type, injection_type, injections = tokens
            # drop the trailing '-<suffix>' (e.g. the version tag)
            injections = injections.partition('-')[0]
        else:
            # unknown ID layout: skip it rather than attribute stale values to it
            continue

        # alpha or fixed
        if injection_type != filter_injection_type:
            continue

        # Load only after filtering so rejected puppets are never read from disk.
        puppet = loader(puppetId)

        yield intervention_type, injections, puppet, puppet['actions']

In [None]:
# Materialise every puppet record, ordered by version (3, then 4) and, within a
# version, by injection type ('alpha', then 'fixed').
puppet_data = [
    record
    for version in (3, 4)
    for injection_type in ('alpha', 'fixed')
    for record in get_puppet_data(version, injection_type)
]

In [None]:
# Distinct video IDs that appear in any homepage snapshot of any puppet.
videos = set()
for intervention_type, injections, puppet, actions in puppet_data:
    # The unpacking requires exactly three 'get_homepage' actions per puppet.
    homepage_snapshots = [step['params'] for step in actions if step['action'] == 'get_homepage']
    init, pre, post = homepage_snapshots
    videos.update(init + pre + post)

len(videos)

In [None]:
# load already classified videos
classified = pd.read_pickle('cache/political-classification.pickle')

to_classify = []

for video_id in videos:
    # skip already classified videos
    if video_id in classified['video_id'].values:
        continue
     
    # get video metadata
    if video_id in metadata:
        to_classify.append(dict(
            text=metadata[video_id]['title'],
            video_id=video_id
        ))
        
print(len(to_classify))

In [None]:
# classify videos
try:
    classification = political_classifier.classifier(pd.DataFrame(to_classify))
    classified = pd.concat([classified, classification])
    # save to cache
except Exception as e:
    print(e)
    pass

In [None]:
# Write the classifications back to the cache file they were read from, so
# videos classified in this run are skipped next time.
classified.to_pickle('cache/political-classification.pickle')

In [None]:
# Lookup table: video ID -> political flag, filled from the classified rows
# (a later row for the same video overrides an earlier one).
is_political_video = {}
for row in classified.itertuples():
    is_political_video[row.video_id] = row.is_political

def is_political(video):
    """Return the stored political flag of `video`, or False if it was never classified."""
    return is_political_video.get(video, False)

In [None]:
news_channel_ids = pd.read_csv('data/news_channel_ids.txt')['channel'].tolist()
# Hashed copy for membership tests: is_news runs for every homepage video, and
# scanning the list each time is needlessly linear. The list itself is kept for
# any code that relies on it.
_news_channel_id_set = frozenset(news_channel_ids)

def is_news(video):
    """Return True if the video's channel is one of the listed news channels.

    Videos without metadata, or whose metadata has no 'channel_id', fall back
    to the empty string as channel ID.
    """
    return metadata.get(video, {}).get('channel_id', '') in _news_channel_id_set

In [None]:
CLASSIFICATIONS = ['News', 'Political non-news', 'Other']

def classify_video(vId):
    """Label a video as 'News', 'Political non-news' or 'Other'.

    News-channel membership takes precedence; the political flag is only
    consulted for videos that are not from a news channel.
    """
    if is_news(vId):
        return 'News'
    return 'Political non-news' if is_political(vId) else 'Other'

In [None]:
# Share of each video class on the homepage, grouped by intervention type:
# ratios[intervention_type][label][class] is a list with one share per puppet,
# where label is 'pre' or 'post'.
ratios = {}

for intervention_type, injections, puppet, actions in puppet_data:

    by_label = ratios.setdefault(intervention_type, {'pre': {}, 'post': {}})

    # exactly three 'get_homepage' actions are expected; the first is not used here
    init, pre, post = [action['params'] for action in actions if action['action'] == 'get_homepage']

    for label, arr in zip(['pre', 'post'], [pre, post]):
        counts = Counter(classify_video(vId) for vId in arr)
        total = sum(counts.values())
        if total == 0:
            # an empty homepage has no defined shares
            continue

        # Record one share per class and puppet. Iterating the classes rather
        # than the individual videos keeps a puppet from being appended once
        # per video, and a class missing from the homepage contributes an
        # explicit 0 instead of being left out of the mean.
        for clf in CLASSIFICATIONS:
            by_label[label].setdefault(clf, []).append(counts[clf] / total)

In [None]:
# For every intervention type print the post/pre quotient of the mean share,
# first for 'News' and then for 'Political non-news'.
for intervention_type, by_label in ratios.items():
    print(intervention_type)
    for category in ('News', 'Political non-news'):
        post_mean = np.mean(by_label['post'][category])
        pre_mean = np.mean(by_label['pre'][category])
        print('%.2f' % (post_mean / pre_mean))

In [None]:
# Share of each video class on the homepage, grouped by injection level:
# ratios[injections][label][class] is a list with one share per puppet,
# where label is 'pre' or 'post'.
ratios = {}

for intervention_type, injections, puppet, actions in puppet_data:

    by_label = ratios.setdefault(injections, {'pre': {}, 'post': {}})

    # exactly three 'get_homepage' actions are expected; the first is not used here
    init, pre, post = [action['params'] for action in actions if action['action'] == 'get_homepage']

    for label, arr in zip(['pre', 'post'], [pre, post]):
        counts = Counter(classify_video(vId) for vId in arr)
        total = sum(counts.values())
        if total == 0:
            # an empty homepage has no defined shares
            continue

        # Record one share per class and puppet. Iterating the classes rather
        # than the individual videos keeps a puppet from being appended once
        # per video, and a class missing from the homepage contributes an
        # explicit 0 instead of being left out of the mean.
        for clf in CLASSIFICATIONS:
            by_label[label].setdefault(clf, []).append(counts[clf] / total)

In [None]:
# Injection levels reported in the table rows printed by the following cells;
# each string is used as a key into `ratios`.
# NOTE(review): presumably `alpha` holds the levels of the 'alpha' injection type
# and `fixed` those of the 'fixed' type -- confirm against the puppet IDs.
alpha = ['0.0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9']
fixed = ['0', '1', '3', '5', '6', '7', '8', '9', '15', '20']

In [None]:
# Show how many levels each list holds.
len(alpha), len(fixed)

In [None]:
# Header row: the alpha levels separated by ' & '.
print(*alpha, sep=' & ')

In [None]:
# 'News' row: post/pre quotient of the mean share for each alpha level,
# each cell followed by ' & '.
for injections in alpha:
    by_label = ratios[injections]
    post_mean = np.mean(by_label['post']['News'])
    pre_mean = np.mean(by_label['pre']['News'])
    print('$%.2f\\times$' % (post_mean / pre_mean), end=' & ')

In [None]:
# 'Political non-news' row: post/pre quotient of the mean share for each alpha
# level, each cell followed by ' & '.
for injections in alpha:
    by_label = ratios[injections]
    post_mean = np.mean(by_label['post']['Political non-news'])
    pre_mean = np.mean(by_label['pre']['Political non-news'])
    print('$%.2f\\times$' % (post_mean / pre_mean), end=' & ')

In [None]:
# Header row: the fixed levels separated by ' & '.
print(*fixed, sep=' & ')

In [None]:
# 'News' row: post/pre quotient of the mean share for each fixed level,
# each cell followed by ' & '.
for injections in fixed:
    by_label = ratios[injections]
    post_mean = np.mean(by_label['post']['News'])
    pre_mean = np.mean(by_label['pre']['News'])
    print('$%.2f\\times$' % (post_mean / pre_mean), end=' & ')

In [None]:
# 'Political non-news' row: post/pre quotient of the mean share for each fixed
# level, each cell followed by ' & '.
for injections in fixed:
    by_label = ratios[injections]
    post_mean = np.mean(by_label['post']['Political non-news'])
    pre_mean = np.mean(by_label['pre']['Political non-news'])
    print('$%.2f\\times$' % (post_mean / pre_mean), end=' & ')