# Preprocessing

This notebook does some pre-processing of the unstructured data gathered by the scraper.

In [None]:
import json
import re
import os
from datetime import date
import time
import sys

import pandas as pd
import numpy as np
import networkx as nx

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Read the YouTube Data API key from a local credentials file.
KEY_LOC = '../../credentials/api_key.txt'
with open(KEY_LOC, 'r') as f:
    # strip surrounding whitespace: a trailing newline in the key file
    # would otherwise be passed to the client as part of the key
    DEVELOPER_KEY = f.read().strip()

# API client used later in the notebook to query channel information
youtube = build('youtube', 'v3', developerKey=DEVELOPER_KEY)

In [None]:
# Locations of the raw scrape output and of the derived files written below.
scrape_path = '../../data/scrape_results'
outdir = '../../data/derived_data/analysis'
# When True, scrape folder names are parsed as '<query>_..._<root_video>'
# in the search-info cell; when False the folder name is used as the root video.
queries = False

# makedirs (unlike mkdir) also creates missing parent directories and
# does not fail when the directory already exists
os.makedirs(outdir, exist_ok=True)

# import the raw video info json: one row per video, keyed by video id
video_info = pd.read_json(os.path.join(scrape_path, 'video_info.json'),
                          orient='index')
# move the video id out of the index into a regular column
video_info = video_info.reset_index()\
                       .rename(index=str, columns={"index": "video_id"})
video_info.to_csv(os.path.join(outdir, 'video_info.csv'),
                  index=False)

### Make an adjacency list 

Combine the BFS tree searches into one big adjacency list.

In [None]:
# dictionary of out-edges: video_id -> set of recommended video ids
adj_dict = {}

# populate the dictionary from every search folder under scrape_path
for folder in os.listdir(scrape_path):
    folder_path = os.path.join(scrape_path, folder)
    if not os.path.isdir(folder_path):
        continue

    with open(os.path.join(folder_path, 'search_info.json'), 'r') as f:
        search_info = json.load(f)

    for video_id, info in search_info.items():
        # update() merges in place; set.union() returns a new set, so calling
        # it without assigning the result would discard the extra edges of a
        # video that shows up in more than one search
        adj_dict.setdefault(video_id, set()).update(info['recommendations'])

# save as adjacency list: "<video_id> <child> <child> ..." per line
with open(os.path.join(outdir, 'adjacency_list.txt'), 'w') as f:
    for video_id, children in adj_dict.items():
        # sorted so the file content does not depend on set iteration order
        line = "{} {}".format(video_id, " ".join(sorted(children)))
        f.write(line + "\n")

### Make a channel adjacency list

Since our classification is made at the _channel_ level, not the _video_ level, it makes more sense to analyze the _channel_ recommendation graph when it comes to political leaning.

In [None]:
# set up a dataframe of out-edges for each video_id: one row per video and
# one positional column (0, 1, ...) per recommendation
adj_df = (
    pd.DataFrame.from_dict(adj_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'video_id'})
)

# wide -> long over every recommendation column (not just the first two);
# rows padded with missing values for videos with fewer recommendations
# are dropped
adj_df = (
    pd.melt(adj_df, id_vars=['video_id'], var_name='child_no',
            value_name='child_id')
    .dropna(subset=['child_id'])
)

# join in channel information for both ends of each edge; the inner joins
# keep only edges whose parent and child both appear in video_info
adj_df = (
    adj_df.merge(video_info[['video_id', 'channel']], on='video_id')
    .rename(columns={'channel': 'parent_channel'})
    .merge(video_info[['video_id', 'channel']], left_on='child_id',
           right_on='video_id', suffixes=('', '_right'))
    .drop(columns=['video_id_right'])
    .rename(columns={'channel': 'child_channel'})
)

# aggregate to channel level: one row per parent channel holding the list of
# recommended channels (duplicates kept, one entry per video-level edge)
channel_adj = (
    adj_df.filter(['parent_channel', 'child_channel'])
    .groupby(['parent_channel'])
    .agg(lambda x: list(x))
    # out-degree is the length of each row's list; len('child_channel')
    # would be the length of the column *name*, i.e. a constant
    .assign(out_deg=lambda df: df['child_channel'].map(len))
    .reset_index()
)

# save as csv
channel_adj.to_csv(os.path.join(outdir, 'channel_adj.csv'), index=False)

# save as comma-delimited adjacency list: "<parent>,<child>,<child>,..."
with open(os.path.join(outdir, 'channel_adj.txt'), 'w') as f:
    for _, row_data in channel_adj.iterrows():
        line = '{},{}'.format(row_data.parent_channel,
                              ",".join(row_data.child_channel))
        f.write(line + '\n')

### Make search info dataframe

The JSON `search_info` format was convenient for scraping but is awkward for analysis, so pack all searches into a single CSV.

In [None]:
# Leading column order of the combined frame; any other column found in the
# per-search data (e.g. scrape_date) is placed after these.
columns = ['video_id', 'recommendations', 'depth', 'search', 'query',
           'search_splits', 'search_depth', 'root_video']

# one dataframe per search folder, concatenated once after the loop
# (DataFrame.append is quadratic in a loop and no longer exists in pandas 2)
search_frames = []

for folder in os.listdir(scrape_path):
    folder_path = os.path.join(scrape_path, folder)
    if not os.path.isdir(folder_path):
        continue

    # parameters the scraper recorded for this search
    with open(os.path.join(folder_path, 'params.json'), 'r') as f:
        params = json.load(f)

    filepath = os.path.join(folder_path, 'search_info.json')
    search_df = pd.read_json(filepath, orient='index')\
                  .reset_index()\
                  .rename(columns={'index': 'video_id'})

    # tag every row with the search it came from
    search_df['search'] = folder
    search_df['search_splits'] = params['n_splits']
    search_df['search_depth'] = params['depth']
    search_df['scrape_date'] = params['date']

    if queries:
        # folder name is '<query>_..._<root_video>'
        search_df['query'] = folder.split("_")[0]
        search_df['root_video'] = folder.split("_")[-1]
    else:
        search_df['root_video'] = folder

    search_frames.append(search_df)

if search_frames:
    result = pd.concat(search_frames, ignore_index=True, sort=False)
    # keep the declared columns first (adding any that are absent, such as
    # 'query' when queries is False), followed by the remaining ones
    extra_columns = [c for c in result.columns if c not in columns]
    result = result.reindex(columns=columns + extra_columns)
else:
    # no search folders found: still write a header-only csv
    result = pd.DataFrame(columns=columns)

result.to_csv(os.path.join(outdir, 'search_info.csv'), index=False)

### In-degrees

In [None]:
# Count in- and out-degrees of every video from the saved adjacency list.
in_degrees = {}
out_degrees = {}

with open(os.path.join(outdir, 'adjacency_list.txt'), 'r') as f:
    for line in f.read().splitlines():
        # split() without an argument drops empty tokens, so a line such as
        # "vid " (no recommendations) yields out-degree 0 rather than 1
        tokens = line.split()
        if not tokens:
            continue
        parent, children = tokens[0], tokens[1:]
        out_degrees[parent] = len(children)
        for video_id in children:
            in_degrees[video_id] = in_degrees.get(video_id, 0) + 1

in_deg = pd.DataFrame.from_dict(in_degrees, orient="index", columns=['in_deg'])
out_deg = pd.DataFrame.from_dict(out_degrees, orient="index", columns=['out_deg'])

# NOTE(review): the left join starts from in_deg, so videos that are never
# recommended (in-degree 0) are absent from the output and videos that never
# appear as a parent get a missing out_deg -- confirm this is intended.
full = in_deg.join(out_deg, how='left').reset_index()\
             .rename(index=str, columns={'index': 'video_id'})

full.to_csv(os.path.join(outdir, 'vertex_degrees.csv'), index=False)

### Pageranks

In [None]:
# Build a directed graph from the adjacency list written earlier in outdir.
adjlist_path = os.path.join(outdir, "adjacency_list.txt")
G = nx.read_adjlist(adjlist_path, create_using=nx.DiGraph())

# Compute the PageRank score of every vertex (dict: video id -> score).
pr = nx.pagerank(G)

# Turn the scores into a two-column frame: video_id, pagerank.
pr_df = pd.DataFrame.from_dict(pr, orient="index")
pr_df = pr_df.reset_index()
pr_df = pr_df.rename(index=str, columns={'index': 'video_id', 0: 'pagerank'})

pageranks_csv = os.path.join(outdir, 'pageranks.csv')
pr_df.to_csv(pageranks_csv, index=False)

### Get channel information from the API

In [None]:
# Unique channel ids to look up; missing ids are dropped because they
# cannot be joined into the comma-separated request string.
channel_ids = video_info.channel_id.dropna().unique()

batch_size = 50    # 50 seems to be the API limit per request
max_attempts = 10  # tries per batch before giving up on it
channel_info = {}
for ix in range(0, len(channel_ids), batch_size):
    batch = channel_ids[ix: ix+batch_size]
    id_str = ",".join(batch)

    for _ in range(max_attempts):
        try:
            response = youtube.channels().list(
                id=id_str,
                part='snippet,statistics,topicDetails'
            ).execute()
            break
        except Exception as e:
            # best-effort retry on any request error; Exception (rather than
            # a bare except) lets KeyboardInterrupt stop the cell
            print(repr(e))
            time.sleep(1)
    else:
        print("Issues with backend: could not get info for {}".format(id_str))
        # skip this batch: without a fresh response the code below would
        # reprocess the previous batch (or fail on the very first one)
        continue

    for item in response.get('items', []):
        channel_id = item['id']

        # Any of these parts may be absent from the response; fall back to
        # empty dicts so each field below defaults to -1 instead of reusing
        # values left over from the previous channel.
        statistics = item.get('statistics') or {}
        snippet = item.get('snippet') or {}
        cat_urls = (item.get('topicDetails') or {}).get('topicCategories')
        if cat_urls:
            # keep only the last path segment of each category URL
            categories = [url.split('/')[-1] for url in cat_urls]
        else:
            categories = -1

        # Populate the channel dict (-1 marks a missing value)
        channel_info[channel_id] = {
            'name': snippet.get('title', -1),
            'country': snippet.get('country', -1),
            'date_created': snippet.get('publishedAt', -1),
            'n_subscribers': statistics.get('subscriberCount', -1),
            'n_videos': statistics.get('videoCount', -1),
            'n_views': statistics.get('viewCount', -1),
            'categories': categories}

# save as csv
channel_df = pd.DataFrame.from_dict(channel_info, orient='index')\
               .reset_index()\
               .rename(columns={'index': 'channel_id'})
channel_df.to_csv(os.path.join(outdir, 'channel_info.csv'), index=False)