In [11]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as ss
import numpy as np
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import normalize
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
from sklearn.cluster import DBSCAN, KMeans
from sklearn.manifold import TSNE
from os.path import isfile
import subprocess

import hdbscan

from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import HoverTool, ColumnDataSource, CustomJS, value
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import plasma
from collections import OrderedDict

output_notebook()

In [2]:
data_dir = '/home/ed/github/pod_tweets/follower_ids/'
pod_list = os.listdir(data_dir)
num_pods = len(pod_list)

In [3]:
def calc_common_usrs(data_dir, pod1, pod2):
    pod_list1 = [line.rstrip('\n') for line in open(data_dir + pod1)]
    pod_list1.pop(0)
    pod_list2 = [line.rstrip('\n') for line in open(data_dir + pod2)]
    pod_list2.pop(0)
    num_same = set(pod_list1) & set(pod_list2)
    return len(num_same), len(pod_list1), len(pod_list2)

In [4]:
pod_common_usrs = pd.DataFrame(columns = ['podcast_1', 'podcast_2', 'comm_users'])
pod_popularity = pd.DataFrame(pod_list, columns=['podcast'])
pod_popularity['followers'] = 0
pod_popularity = pod_popularity.set_index('podcast')

for x1 in range(num_pods):
    for x2 in range(x1+1, num_pods):
        print('{}:{} of {}'.format(x1, x2, num_pods), end='\r', flush=True)
        comm_num, pod_x1, pod_x2 = calc_common_usrs(data_dir, pod_list[x1], pod_list[x2])
        pod_common_usrs.loc[len(pod_common_usrs)] = [pod_list[x1], pod_list[x2], comm_num]
        pod_popularity.at[pod_list[x1], 'followers'] = pod_x1
        pod_popularity.at[pod_list[x2], 'followers'] = pod_x2

69:70 of 71

In [5]:
pod_popularity = pod_popularity.sort_values(by='followers', ascending=False)
pod_pop_arr = np.array(pod_popularity.index)

index_map = dict(np.vstack([pod_pop_arr, np.arange(pod_pop_arr.shape[0])]).T)

count_matrix = ss.coo_matrix((pod_common_usrs.comm_users, 
                              (pod_common_usrs.podcast_2.map(index_map),
                               pod_common_usrs.podcast_1.map(index_map))),
                             shape=(pod_pop_arr.shape[0], pod_pop_arr.shape[0]),
                             dtype=np.float64)

conditional_prob_matrix = count_matrix.tocsr()
conditional_prob_matrix = normalize(conditional_prob_matrix, norm='l1', copy=False)

In [7]:
reduce_conProM = TSNE(n_components=2, perplexity=10).fit_transform(conditional_prob_matrix)
reduce_conProM = normalize(reduce_conProM, norm='l2', copy=False)

In [12]:
pod_space_df = pd.DataFrame(reduce_conProM, columns=('x', 'y'))
pod_space_df['pod_name'] = pod_pop_arr
pod_space_df['pod_name'] = pod_space_df.pod_name.str[:-4]
# pod_space_df = pod_space_df.sort_values(by='pod_name', ignore_index=True)
# pod_space_df.to_pickle(resources_dir + 'pod_space_df.pkl')
pod_space_df.head()

Unnamed: 0,x,y,pod_name
0,-0.709126,-0.705082,60Minutes
1,-0.739136,0.673556,48Hours
2,-0.625223,0.780447,1A
3,0.987835,-0.155506,99Invisible
4,0.999101,-0.042399,BehindtheBastards


In [14]:
clustering_martrix = pd.DataFrame(reduce_conProM, columns=['x', 'y'])

clusterer = hdbscan.HDBSCAN(min_samples=15, metric='manhattan', 
                            min_cluster_size=2).fit(clustering_martrix)
cluster_ids = clusterer.labels_
print('num clusters = {}'.format(clusterer.labels_.max()+1))

clustering_martrix['cluster'] = cluster_ids
clustering_martrix['pod_name'] = pod_space_df.pod_name

num clusters = 3


In [16]:
# Construct a color palette and map clusters to colors
palette = ['#777777'] + plasma(cluster_ids.max())
colormap = LinearColorMapper(palette=palette, low=-1, high=cluster_ids.max())
color_dict = {'field': 'cluster', 'transform': colormap}

# Set fill alpha globally
clustering_martrix['fill_alpha'] = np.exp((reduce_conProM.min() - 
                                     reduce_conProM.max()) / 5.0) + 0.05

# Build a column data source
plot_data = ColumnDataSource(clustering_martrix)

# Custom callback for alpha adjustment
jscode="""
    var data = source.data;
    var start = cb_obj.start;
    var end = cb_obj.end;
    alpha = data['fill_alpha']
    for (i = 0; i < alpha.length; i++) {
         alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
    }
    source.trigger('change');
"""

# Create the figure and add tools
bokeh_figure = figure(title='A Map of PodSpace',
                   plot_width = 700,
                   plot_height = 700,
                   tools=('pan, wheel_zoom, box_zoom, box_select, reset'),
                   active_scroll=u'wheel_zoom')

bokeh_figure.add_tools( HoverTool(tooltips = OrderedDict([('pod_name', '@pod_name'),
                                                       ('cluster', '@cluster')])))

# draw the podcasts as circles on the plot
bokeh_figure.circle(u'x', u'y', source=plot_data,
                 fill_color=color_dict, line_color=None, fill_alpha='fill_alpha',
                 size=10, hover_line_color=u'black')

# bokeh_figure.x_range.callback = CustomJS(args=dict(source=plot_data), code=jscode)
# bokeh_figure.y_range.callback = CustomJS(args=dict(source=plot_data), code=jscode)

# configure visual elements of the plot
bokeh_figure.title.text_font_size = value('18pt')
bokeh_figure.title.align = 'center'
bokeh_figure.xaxis.visible = False
bokeh_figure.yaxis.visible = False
bokeh_figure.grid.grid_line_color = None
bokeh_figure.outline_line_color = '#222222'

In [17]:
# display the figure
show(bokeh_figure)