In [None]:
import os
import pandas
import json
import numpy as np
import pandas as pd
from functools import reduce

import networkx as nx
from matplotlib import pyplot as plt
from collections import Counter

In [None]:
# load the per-video features, the channel leaning classification, and channel metadata
video_info = pd.read_csv('../../data/derived_data/analysis/video_info.csv')
channel_leanings = pd.read_csv('../../data/derived_data/analysis/channel_classification.csv')
channel_info = pd.read_csv('../../data/derived_data/analysis/channel_info.csv')

# attach the leaning columns to every video row; a left join keeps videos
# whose (channel_name, channel_id) pair has no match in channel_leanings
video_info = pd.merge(video_info, channel_leanings,
                      how='left', on=['channel_name', 'channel_id'])

# count non-null video_id rows per channel and keep the n channels with the most rows
# NOTE(review): the original comment here said "top 100" while n = 200 -- confirm the intended cutoff
n = 200
counts = (video_info
         .groupby('channel_id')['video_id']
         .count()
         .rename('n')
         .reset_index())

top_n = counts.sort_values(by='n', ascending=False).head(n)

In [None]:
# build a weighted digraph from the multi-edge adjacency list:
# each line is split on commas into a parent id followed by its child ids,
# and the number of times a child repeats on the line becomes the edge weight
G = nx.DiGraph()
edges = []
adjacency_path = '../../data/derived_data/analysis/channel_adjacency.txt'
with open(adjacency_path, 'r') as f:
    for line in f:
        parent_node, *children = line.rstrip().split(',')
        edges.extend([parent_node, child, weight]
                     for child, weight in Counter(children).items())

G.add_weighted_edges_from(edges)

# tag nodes with the leaning of their channel (lookup keyed by channel_id)
leaning = dict(zip(channel_leanings['channel_id'], channel_leanings['leaning']))
nx.set_node_attributes(G, values=leaning, name='leaning')

# persist the full graph as GraphML
nx.write_graphml(G, '../../data/derived_data/analysis/channel_graph.graphml')

# persist the subgraph induced by the channels listed in channel_leanings
G_lean = G.subgraph(channel_leanings['channel_id'].values)
nx.write_graphml(G_lean, '../../data/derived_data/analysis/channel_subgraph.graphml')

## Visualizing the Top-$n$ Channels

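First, plot the adjacency matrix, row-normalized so that each row sums to 1 and $A_{ij} = \Pr(e(i,j) \mid i)$, i.e. the share of channel $i$'s outgoing edge weight that goes to channel $j$.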

In [None]:
# extract the subgraph induced by the top-n channels selected earlier
G_top = G.subgraph(top_n.channel_id.values)

# dense array of edge weights, A_top[i, j] = weight of edge i -> j.
# to_numpy_array always yields a plain ndarray, whereas
# adjacency_matrix(...).todense() yields np.matrix or ndarray depending on the
# networkx/scipy version, which changes how the division below broadcasts.
A_top = nx.to_numpy_array(G_top, weight='weight')

# normalize so each row sums to 1. keepdims=True keeps the row totals as a
# column vector so the division is row-wise; a flat (n,) vector would instead
# divide column j by row j's total.
row_sums = A_top.sum(axis=1, keepdims=True)
# rows without outgoing edges inside the subgraph stay all-zero instead of 0/0 = NaN
A_top = np.divide(A_top, row_sums, out=np.zeros_like(A_top), where=row_sums > 0)

# plot it (rows are parent channels, columns are child channels)
fig, ax = plt.subplots(figsize=(7, 7))
im = ax.imshow(A_top)
ax.set(xlabel='child channel (j)', ylabel='parent channel (i)',
       title='Row-normalized adjacency matrix of the top channels')
fig.colorbar(im, ax=ax, shrink=0.8)
plt.show()

## Channel Centralities

Compute eigenvector, in-degree, out-degree, and PageRank centrality for each channel.

In [None]:
# centrality scores for every node of G; each result is used below as a {node: score} mapping
# NOTE(review): all four measures are called with networkx defaults -- confirm
# whether edge weights should be passed explicitly to each of them
evec = nx.eigenvector_centrality(G)
indeg = nx.in_degree_centrality(G)
outdeg = nx.out_degree_centrality(G)
pr = nx.pagerank(G)

# one single-column frame per measure, indexed by node id
evec_df = pd.DataFrame.from_dict(evec, orient='index', columns=['evec'])
indeg_df = pd.DataFrame.from_dict(indeg, orient='index', columns=['indeg'])
outdeg_df = pd.DataFrame.from_dict(outdeg, orient='index', columns=['outdeg'])
# `pr` is rebound here from the score dict to its DataFrame
pr = pd.DataFrame.from_dict(pr, orient='index', columns=['pr'])

# outer-join the frames on their index, one measure at a time
merged = evec_df
for measure_df in (indeg_df, outdeg_df, pr):
    merged = merged.merge(measure_df, how='outer',
                          left_index=True, right_index=True)

# move the node ids out of the index into a channel_id column
centralities = (merged
               .reset_index()
               .rename(columns={'index': 'channel_id'}))

# write to csv
centralities.to_csv('../../data/derived_data/analysis/channel_centralities.csv',
                    index=False)