# Preprocessing

This notebook does some pre-processing of the unstructured data gathered by the scraper.

In [4]:
import json
import re
import os

import pandas as pd
import numpy as np
import networkx as nx

In [3]:
# Input directory holding the scraper output, and output directory for the
# derived analysis files. Both are reused by the cells further down.
scrape_path = '../../data/scrape_results_redo'
outdir = '../../data/derived_data/analysis_redo'

# Read the raw video info JSON (outer keys become the row index), then move
# that index into a regular column named `video_id`.
video_info_json = os.path.join(scrape_path, 'video_info.json')
raw = (
    pd.read_json(video_info_json, orient='index')
      .reset_index()
      .rename(index=str, columns={"index": "video_id"})
)

# Persist the flattened table as CSV without the row index.
video_info_csv = os.path.join(outdir, 'video_info.csv')
raw.to_csv(video_info_csv, index=False)

### Make an adjacency list 

Combine the out-edges found by all of the BFS tree searches into a single adjacency list.

In [40]:
# dictionary of out-edges: video_id -> set of recommended video ids,
# merged across every search folder under `scrape_path`
result = {}

# populate the dictionary (folders are visited in sorted order so the
# output file is reproducible from run to run)
for folder in sorted(os.listdir(scrape_path)):
    folder_path = os.path.join(scrape_path, folder)
    if not os.path.isdir(folder_path):
        continue

    with open(os.path.join(folder_path, 'search_info.json'), 'r') as f:
        search_info = json.load(f)

    for video_id, info in search_info.items():
        # set.update() merges in place. The previous set.union() call
        # returned a new set that was thrown away, so recommendations for a
        # video seen in more than one search were silently dropped.
        result.setdefault(video_id, set()).update(info['recommendations'])

# save as adjacency list: one line per source video, formatted as
# "<video_id> <recommendation> <recommendation> ..."
# The `with` block guarantees the file is flushed and closed before any
# later cell reads it back.
with open(os.path.join(outdir, 'adjacency_list.txt'), 'w') as f:
    for video_id, recommendations in result.items():
        # neighbours are sorted because set iteration order is not stable
        line = "{} {}".format(video_id, " ".join(sorted(recommendations)))
        f.write(line + "\n")

### Make search info dataframe

The JSON `search_info` format was convenient for scraping but is awkward to analyze, so pack every search into a single CSV.

In [43]:
# Columns expected in the combined table, in the order they are written.
columns = ['video_id', 'recommendations', 'depth', 'search', 'query',
           'search_splits', 'search_depth', 'root_video']

# Collect one frame per search folder and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
search_frames = []

for folder in os.listdir(scrape_path):
    folder_path = os.path.join(scrape_path, folder)
    if not os.path.isdir(folder_path):
        continue

    # per-search parameters recorded next to the search results
    with open(os.path.join(folder_path, 'params.json'), 'r') as f:
        params = json.load(f)

    # one row per video; the JSON's outer keys become the `video_id` column
    filepath = os.path.join(folder_path, 'search_info.json')
    search_df = pd.read_json(filepath, orient='index').reset_index()
    search_df = search_df.rename(columns={'index': 'video_id'})

    # Tag every row with the search it came from.
    # NOTE(review): assumes the folder name is "<query>_<root video>..." and
    # that neither of the first two parts contains "_" itself - confirm
    # against the scraper's naming scheme.
    folder_parts = folder.split("_")
    search_df['search'] = folder
    search_df['query'] = folder_parts[0]
    search_df['search_splits'] = params['n_splits']
    search_df['search_depth'] = params['depth']
    search_df['root_video'] = folder_parts[1]

    search_frames.append(search_df)

if search_frames:
    result = pd.concat(search_frames, ignore_index=True, sort=False)
    # keep the expected columns first (created as empty if absent), followed
    # by any additional fields found in the JSON
    extra_columns = [c for c in result.columns if c not in columns]
    result = result.reindex(columns=columns + extra_columns)
else:
    # no search folders found: still emit a header-only CSV
    result = pd.DataFrame(columns=columns)

result.to_csv(os.path.join(outdir, 'search_info.csv'), index=False)

### In- and out-degrees

In [52]:
in_degrees = {}
out_degrees = {}

# open adjacency list; each line is "<source> <target> <target> ..."
# (the `with` block closes the file once it has been read)
with open(os.path.join(outdir, 'adjacency_list.txt'), 'r') as f:
    for line in f.read().splitlines():
        # split() without a separator drops empty tokens. A source with no
        # recommendations is stored as "<source> " (trailing space), which
        # split(" ") turned into ['<source>', ''] and counted as out-degree 1.
        tokens = line.split()
        if not tokens:
            continue  # blank line: no vertex to record

        source, targets = tokens[0], tokens[1:]
        out_degrees[source] = len(targets)
        for video_id in targets:
            in_degrees[video_id] = in_degrees.get(video_id, 0) + 1

# one-column frames indexed by video id
in_deg = pd.DataFrame.from_dict(in_degrees, orient="index")
in_deg = in_deg.rename(index=str, columns={0: 'in_deg'})

out_deg = pd.DataFrame.from_dict(out_degrees, orient="index")
out_deg = out_deg.rename(index=str, columns={0: 'out_deg'})

# NOTE(review): the left join keeps only vertices that appear as a target at
# least once, so sources that are never recommended are absent from the
# output. Vertices with no adjacency-list line of their own get a missing
# `out_deg`. Confirm this is the intended vertex set.
full = in_deg.join(out_deg, how='left').reset_index()\
             .rename(index=str, columns={'index': 'video_id'})

full.to_csv(os.path.join(outdir, 'vertex_degrees.csv'), index=False)

### Pageranks

In [6]:
# Build a directed graph from the adjacency list file.
adjacency_path = os.path.join(outdir, "adjacency_list.txt")
G = nx.read_adjlist(adjacency_path, create_using=nx.DiGraph())

# Compute PageRank for every vertex and tabulate it as one row per vertex
# with columns `video_id` and `pagerank`.
pr = nx.pagerank(G)
pr_df = pd.DataFrame.from_dict(pr, orient="index")
pr_df = pr_df.reset_index()
pr_df = pr_df.rename(index=str, columns={'index': 'video_id', 0: 'pagerank'})

# Write the table as CSV without the row index.
pageranks_csv = os.path.join(outdir, 'pageranks.csv')
pr_df.to_csv(pageranks_csv, index=False)