# Some Preprocessing

This notebook pre-processes the unstructured data gathered by the scraper into flat files under `derived_data/`. The goal is to be able to move the analysis to R and avoid matplotlib :)

In [1]:
import json
import re
import os

import pandas as pd
import numpy as np
from textblob import TextBlob

In [2]:
# Load the scraped per-video metadata. The JSON is keyed by video id, so the
# keys land in the index and are then moved into a regular `video_id` column.
raw = (
    pd.read_json('results/video_info.json', orient='index')
      .reset_index()
      .rename(index=str, columns={"index": "video_id"})
)

# Write everything except the two free-text columns to the video-level CSV.
raw.drop(columns=['comments', 'description'])\
   .to_csv('derived_data/video_info.csv', index=False)

## Sentiment Analysis

### Sentiment Analysis of Comments

In [3]:
# Build one row per (video, comment), starting from the per-video comment lists.
comment_lists = raw[['video_id', 'comments']].copy()

# Spread each list across columns, then stack back into a long Series.
# Dropping the inner index level leaves only the originating row label.
long_comments = comment_lists['comments'].apply(pd.Series).stack()
long_comments = long_comments.reset_index(level=1, drop=True)

# Re-attach the owning video id through the shared row label.
comments = long_comments.to_frame('comments')
comments = comments.join(comment_lists[['video_id']], how='left')
comments['comments'] = comments['comments'].astype('str')


def _comment_polarity(text):
    """Return TextBlob's sentiment polarity for a single comment string."""
    return TextBlob(text).sentiment.polarity


polarities = comments['comments'].apply(_comment_polarity)
comments['polarity'] = polarities

# Average the per-comment scores within each video and save them.
per_video_polarity = comments.groupby('video_id')['polarity'].mean()
avg_polarity = per_video_polarity.reset_index()
avg_polarity.to_csv('derived_data/comment_sentiments.csv', index=False)

### Sentiment Analysis of Descriptions

In [4]:
# Score each video's description as a single piece of text.
descriptions = raw[['video_id', 'description']].copy()

# Cast to str so TextBlob always receives text (a missing value becomes 'nan').
descriptions['description'] = descriptions['description'].astype(str)

description_scores = descriptions['description'].apply(
    lambda text: TextBlob(text).sentiment.polarity
)
descriptions['polarity'] = description_scores

# Only the id and its score are exported.
sentiment_columns = ['video_id', 'polarity']
descriptions[sentiment_columns].to_csv(
    'derived_data/description_sentiments.csv', index=False)

### Sentiment Analysis of Captions

In [5]:
# Score every cleaned caption file as one document: video_id -> polarity.
polarities = {}
captions_dir = 'derived_data/captions_clean'

# sorted() makes the row order of the output CSV reproducible across runs
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(captions_dir)):
    # skip hidden/dot files
    if file.startswith('.'):
        continue

    # the video id is everything before the first '.' in the file name
    video_id = file.split('.')[0]

    # the context manager closes each handle as soon as the file is read
    with open(os.path.join(captions_dir, file), 'r') as caption_file:
        captions = [line.replace('\n', '') for line in caption_file]

    # join all caption lines into one text before scoring
    blob = TextBlob(" ".join(captions))
    polarities[video_id] = blob.sentiment.polarity

polarity_df = pd.DataFrame(list(polarities.items()), columns=['video_id', 'polarity'])
polarity_df.to_csv('derived_data/caption_sentiments.csv', index=False)

## Misc Processing

### Make a category id <-> category crosswalk

In [6]:
# Load the category listing into its own name. Rebinding `raw` here would
# replace the video-info DataFrame that the sentiment cells read from, so
# those cells could no longer be re-run after this one.
with open('results/category_info.json', 'r') as f:
    category_info = json.load(f)

# category id -> category title, taken from each item's snippet
result = {item['id']: item['snippet']['title']
          for item in category_info.get('items', [])}

# Naming both columns up front keeps the CSV header complete even when the
# listing has no items.
crosswalk = pd.DataFrame(list(result.items()),
                         columns=['category_id', 'category_name'])

crosswalk.to_csv('derived_data/category_crosswalk.csv', index=False)

### Make an adjacency list 

Combine the BFS tree searches into one big ol' adjacency list

In [7]:
# dictionary of out-edges: video_id -> set of recommended video ids
result = {}

# populate the dictionary from every search folder under results/
# (sorted so the output file is reproducible across runs)
for folder in sorted(os.listdir('results')):
    search_dir = os.path.join('results', folder)
    if not os.path.isdir(search_dir) or folder == "captions":
        continue

    with open(os.path.join(search_dir, 'search_info.json'), 'r') as f:
        search_info = json.load(f)

    for video_id, info in search_info.items():
        # update() merges in place. set.union() returns a new set and leaves
        # the stored one untouched, so calling it without using the result
        # would drop the edges found by every search after the first.
        result.setdefault(video_id, set()).update(info['recommendations'])

# save as adjacency list: one line per vertex, "<video_id> <rec> <rec> ..."
# The with-block guarantees the data is flushed and the handle closed before
# any later cell reads the file back.
with open('derived_data/adjacency_list.txt', 'w') as f:
    for video_id in result:
        # sets have no stable iteration order; sort for a deterministic file
        line = "{} {}".format(video_id, " ".join(sorted(result[video_id])))
        f.write(line + "\n")

### Make search info dataframe

The json search_info format was nice for scraping, not nice for analyzing. Pack it into one csv.

In [8]:
# Expected leading columns of the combined table, in output order.
columns = ['video_id', 'recommendations', 'depth', 'search', 'query',
           'search_splits', 'search_depth', 'root_video']

# Collect one frame per search and concatenate once at the end.
# DataFrame.append in a loop copies the whole table on every iteration and
# no longer exists in pandas 2.x.
frames = []

for folder in sorted(os.listdir('results')):
    search_dir = os.path.join('results', folder)
    if not os.path.isdir(search_dir) or folder == "captions":
        continue

    with open(os.path.join(search_dir, 'params.json'), 'r') as f:
        params = json.load(f)

    filepath = os.path.join(search_dir, 'search_info.json')
    search_df = pd.read_json(filepath, orient='index').reset_index()
    search_df.rename(index=str, columns={'index': 'video_id'}, inplace=True)

    # NOTE(review): this assumes folder names look like "<query>_<root_video>"
    # with no other underscores; a query or video id containing "_" would be
    # truncated here -- confirm against the scraper's naming scheme.
    name_parts = folder.split("_")

    search_df['search'] = folder
    search_df['query'] = name_parts[0]
    search_df['search_splits'] = params['n_splits']
    search_df['search_depth'] = params['depth']
    search_df['root_video'] = name_parts[1]

    frames.append(search_df)

if frames:
    result = pd.concat(frames, ignore_index=True, sort=False)
    # keep the declared columns first, followed by any extra columns that
    # the search_info files happen to carry
    extra_columns = [col for col in result.columns if col not in columns]
    result = result.reindex(columns=columns + extra_columns)
else:
    # no search folders found: still write a header-only CSV
    result = pd.DataFrame(columns=columns)

result.to_csv('derived_data/search_info.csv', index=False)

## Graph Stuff

### In- and out-degrees

In [9]:
# Count in- and out-edges per vertex from the saved adjacency list, where
# each line is "<video_id> <recommended_id> <recommended_id> ...".
in_degrees = {}
out_degrees = {}

# the with-block closes the handle once the file has been read
with open('derived_data/adjacency_list.txt', 'r') as f:
    for line in f.read().splitlines():
        if not line:
            continue  # a blank line carries no vertex

        tokens = line.split(" ")
        source = tokens[0]

        # A vertex with no recommendations is stored as "<id> ", which splits
        # into ['<id>', '']. Drop empty tokens so it counts as out-degree 0
        # instead of 1.
        targets = [video_id for video_id in tokens[1:] if video_id]

        out_degrees[source] = len(targets)
        for video_id in targets:
            in_degrees[video_id] = in_degrees.get(video_id, 0) + 1

# Naming the column at construction keeps it present even if a dict is empty.
in_deg = pd.DataFrame.from_dict(in_degrees, orient="index", columns=['in_deg'])
out_deg = pd.DataFrame.from_dict(out_degrees, orient="index", columns=['out_deg'])

# The left join keeps only vertices that have at least one in-edge; out_deg
# is NaN for vertices that never appear as the first token of a line.
# NOTE(review): vertices with out-edges but no in-edges are dropped from the
# output here -- confirm that is intended.
full = in_deg.join(out_deg, how='left').reset_index()\
             .rename(index=str, columns={'index': 'video_id'})

full.to_csv('derived_data/vertex_degrees.csv', index=False)