In [1]:
%matplotlib inline
import json
import operator
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import IFrame
from scipy import spatial
from sklearn.cluster import KMeans

In [2]:
# Helper functions

def cosine_similarity(d1,d2):
    """Return the cosine similarity of two vectors.

    scipy reports cosine *distance*; similarity is its complement to one.
    """
    distance = spatial.distance.cosine(d1, d2)
    return 1.0 - distance

def find_similar(df,subs,ops=None,nreturn=5):
    """Rank the rows of ``df`` by cosine similarity to a combination of rows.

    The query vector starts as the row labelled ``subs[0]``; every further
    entry of ``subs`` is added to or subtracted from it according to the
    matching entry of ``ops``.

    Parameters
    ----------
    df : DataFrame whose index labels identify the candidate rows.
    subs : list of row labels; they are lower-cased before lookup.
    ops : list of '+' / '-' strings, one fewer than ``subs``.  ``None``
        (the default) means "no operators", i.e. a single-label query.
    nreturn : maximum number of results to return.

    Returns
    -------
    list of ``(label, similarity)`` tuples, most similar first.  Labels
    that appear in ``subs`` are never returned.

    Raises
    ------
    AssertionError if the number of operators does not match ``subs``.
    ValueError if an operator is neither '+' nor '-'.
    """
    subs = [s.lower() for s in subs]
    # The default ops=None used to crash on len(None); treat it as "no operators".
    ops = [] if ops is None else list(ops)
    assert len(subs)-1 == len(ops), "need exactly one operator between consecutive subs"

    query = df.loc[subs[0]].values
    for op, s in zip(ops, subs[1:]):
        if op == '+':
            query = query + df.loc[s].values
        elif op == '-':
            query = query - df.loc[s].values
        else:
            # Continuing with a half-built query would return a misleading ranking.
            raise ValueError("invalid operator! check your input")

    excluded = set(subs)
    sims = {}
    for s in df.index.values:
        if s in excluded:
            continue
        sims[s] = cosine_similarity(query, df.loc[s].values)
    return sorted(sims.items(), key=operator.itemgetter(1),reverse=True)[:nreturn]

def normalize(x):
    """Scale ``x`` to unit Euclidean (L2) length.

    A vector whose norm is zero cannot be scaled; it is returned as a float
    copy of itself (all zeros) instead of the all-NaN result that 0/0 would
    produce.
    """
    xnorm = np.sqrt((x**2).sum())
    if xnorm == 0:
        # Multiplying by 1.0 yields a new float object, mirroring what x/xnorm returns.
        return x * 1.0
    return x/xnorm

def to_ppmi(df1,normalize_ppmi=True):
    """Convert a co-occurrence count table to positive pointwise mutual information.

    Each cell becomes ``max(0, log2(p(row, col) / (p(row) * p(col))))`` where
    the probabilities are the counts divided by the grand total and the
    marginals are the row / column sums of those probabilities.

    Parameters
    ----------
    df1 : DataFrame of counts with string row and column labels.  It is not
        modified.
    normalize_ppmi : when true (default), every row of the result is passed
        through ``normalize``.

    Returns
    -------
    DataFrame with the same shape as ``df1`` and lower-cased row and column
    labels.  Cells whose PMI is negative or undefined (zero counts, or an
    all-zero row or column) are 0.
    """
    total = df1.values.sum()
    joint = df1 / float(total)
    # Marginals are kept as separate Series so no helper row/column is mixed
    # into the data (and no label can collide with a real one).
    row_marginal = joint.sum(axis=1)
    col_marginal = joint.sum(axis=0)
    ratio = joint.divide(row_marginal, axis=0).divide(col_marginal, axis=1)

    # log2(0) is -inf and 0/0 is NaN; both are expected here and clipped below.
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log2(ratio)
    # `where` keeps strictly positive values only, so NaN cells are zeroed
    # too (a plain `pmi < 0` mask leaves NaN in place).
    pmi = pmi.where(pmi > 0.0, 0.0)

    pmi.index = pmi.index.str.lower()
    pmi.columns = pmi.columns.str.lower()
    if normalize_ppmi:
        pmi = pmi.apply(normalize, axis=1)
    return pmi

def create_json_graph(cmatrix,rowsum,kmean_label,cmin=0.2,lmin=0.5,lmax=15,smin=3,smax=20):
    """Build a JSON-serialisable node/link description of a correlation graph.

    Parameters
    ----------
    cmatrix : DataFrame of pairwise correlations.  Row labels become node
        ids; an entry at row position i / column position j (i != j) becomes
        a link when it is strictly greater than ``cmin``.
    rowsum : sequence of totals, read by position: entry ``i`` sizes the node
        for row ``i`` of ``cmatrix``.  Extra trailing entries are ignored.
    kmean_label : sequence of cluster labels, read by position like ``rowsum``.
    cmin : correlation threshold for drawing a link.
    lmin, lmax : range the link ``value`` is scaled into.
    smin, smax : range the node ``size`` is scaled into (on a log10 scale of
        ``rowsum``).

    Returns
    -------
    dict ``{"nodes": [{'id', 'group', 'size'}, ...],
            "links": [{'source', 'target', 'value'}, ...]}``

    Raises
    ------
    ValueError if ``rowsum`` or ``kmean_label`` has fewer entries than
    ``cmatrix`` has rows.
    """
    corr = np.asarray(cmatrix.values, dtype=float)
    nrows, ncols = corr.shape
    row_ids = list(cmatrix.index)
    col_ids = list(cmatrix.columns)
    if nrows == 0:
        return {"nodes": [], "links": []}

    # Positional access on a plain array avoids label-vs-position ambiguity
    # when rowsum is a Series with a non-integer index.
    totals = np.asarray(rowsum, dtype=float)
    if len(totals) < nrows:
        raise ValueError("rowsum has fewer entries than cmatrix has rows")
    if len(kmean_label) < nrows:
        raise ValueError("kmean_label has fewer entries than cmatrix has rows")

    # Scale over exactly the rows that become nodes (the old code read one
    # position past the last node for the minimum).
    log_totals = np.log10(totals[:nrows])
    rmin = log_totals.min()
    rmax = log_totals.max()
    rspan = rmax - rmin

    nodes_list = []
    for i, node_id in enumerate(row_ids):
        if rspan > 0:
            sprime = (log_totals[i]-rmin) / rspan * (smax-smin) + smin
        else:
            # All nodes have the same total: nothing to scale, use one size.
            sprime = smax
        # Use the argument, not a same-looking global, for the cluster label.
        nodes_list.append({'id': node_id, 'group': int(kmean_label[i]), 'size': float(sprime)})

    # The strongest off-diagonal correlation maps to lmax.  Masking the
    # diagonal by position is robust to self-correlations that are not
    # exactly 1.0 in floating point.
    off_diagonal = corr[~np.eye(nrows, ncols, dtype=bool)]
    off_diagonal = off_diagonal[~np.isnan(off_diagonal)]
    cmax = off_diagonal.max() if off_diagonal.size else cmin

    links_list = []
    for i in range(nrows):
        for j in range(ncols):
            if i == j:
                continue
            c = corr[i, j]
            if c > cmin:
                # c > cmin implies cmax > cmin, so the divisor is positive.
                # scale correlation measure between lmax and lmin (for link width)
                cprime = (c-cmin) / (cmax-cmin) * (lmax-lmin) + lmin
                links_list.append({'source': row_ids[i], 'target': col_ids[j], 'value': float(cprime)})

    return {"nodes": nodes_list, "links": links_list}

In [3]:
!head sbreddit_overlap_v2.csv

t1_subreddit,t2_subreddit,NumOverlaps
terriblefacebookmemes,aww,256
PoliticalHumor,self,256
whowouldwin,trashy,256
Damnthatsinteresting,StarWars,256
ANormalDayInRussia,The_Donald,256
de,television,256
photography,aww,768
whowouldwin,technology,1280
holdthemoan,de,1


In [None]:
# Raw pairwise overlap counts, and the cached pivot built from them.
fname='sbreddit_overlap_v2.csv'
pivoted_fname = "subreddit_overlap_pivoted.csv"

# Build the pivoted matrix once; later runs reuse the cached CSV.  Reading it
# back from disk in both cases keeps `pivoted` identical on first and later runs.
if not os.path.exists(pivoted_fname):
    df = pd.read_csv(fname)
    pivoted = df.pivot_table(values='NumOverlaps', index='t2_subreddit',columns='t1_subreddit',fill_value=0)
    pivoted.to_csv(pivoted_fname)

pivoted = pd.read_csv(pivoted_fname,index_col=0)

# get rid of unvisited subreddits
noverlap = pivoted.sum(axis=1)
# .copy() makes `trimmed` an independent frame rather than a slice of `pivoted`.
trimmed = pivoted[noverlap>1000].copy()
print("original shape / trimmed shape %s %s" % (pivoted.shape, trimmed.shape))

In [None]:
# create positive pointwise mutual information matrix
# (rows are unit-normalised; spelled out here although it is to_ppmi's default)
ppmi = to_ppmi(trimmed, normalize_ppmi=True)
ppmi.head()

In [None]:
# Explore similar subreddits
# inspired by: https://fivethirtyeight.com/features/dissecting-trumps-most-rabid-online-following/

# Single-label query: no vector arithmetic, so the operator list is empty.
subs, ops = ['cocktails'], []
print("cocktails similarities: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Single-label query: rows most similar to 'astronomy'.
subs, ops = ['astronomy'], []
print("astronomy similarities: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Single-label query: rows most similar to 'babyelephantgifs'.
subs, ops = ['babyelephantgifs'], []
print("baby elephant gifs similarities: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Single-label query: rows most similar to '30rock'.
subs, ops = ['30rock'], []
print("30 rock similarities: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Single-label query: rows most similar to 'austin'.
subs, ops = ['austin'], []
print("austin similarities: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# subreddit 'algebra'

# Query vector is the 'austin' row minus the 'texas' row.
subs, ops = ['austin','texas'], ['-']
print("austin - texas: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Reverse of the previous difference: the 'texas' row minus the 'austin' row.
subs, ops = ['texas','austin'], ['-']
print("texas - austin: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))
    

In [None]:
# Three-term query: operators apply left to right between consecutive labels.
subs, ops = ['austin','texas','oregon'], ['-','+']
print("austin - texas + oregon: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))
    

In [None]:
# Query vector is the 'the_donald' row minus the 'politics' row.
subs, ops = ['the_donald','politics'], ['-']
print("trump - politics: ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Same "candidate - politics" query for two candidate subreddits, with a
# blank line separating the two result lists.
ops = ['-']
queries = [("clinton - politics = ", 'hillaryclinton'),
           ("sanders - politics = ", 'sandersforpresident')]
for n, (label, candidate) in enumerate(queries):
    if n > 0:
        print('')
    subs = [candidate, 'politics']
    print(label)
    for s,cs in find_similar(ppmi,subs,ops):
        print("%s %s" % (s, cs))

In [None]:
# Query vector is the 'personalfinance' row minus the 'frugal' row.
subs, ops = ['personalfinance','frugal'], ['-']
print("personal finance - frugal = ")
for s,cs in find_similar(ppmi,subs,ops):
    print("%s %s" % (s, cs))

In [None]:
# Visualize relationship among top subreddits

# take top ntop subreddits based on overlaps
ntop = 100
# Rank on a separate Series instead of adding a 'rowsum' column to `trimmed`:
# the cell can then be re-run without the column being summed into itself, and
# `trimmed` stays a pure count matrix for any later to_ppmi(trimmed) call.
rowsum = trimmed.sum(axis=1).sort_values(ascending=False)
# Largest totals first; `rowsum` keeps every row of `trimmed`, `topsubs` only the first ntop.
topsubs = trimmed.loc[rowsum.index[:ntop]]
#topsubs.head()

In [None]:
# convert to ppmi
# NOTE: this rebinds `ppmi` to the matrix of the top subreddits only.
ppmi = to_ppmi(topsubs, normalize_ppmi=True)

In [None]:
# determine (rough) kmeans clustering
# A fixed random_state makes the cluster assignment (and therefore the node
# groups written to the JSON graph) reproducible from run to run.
kmeans_label = KMeans(n_clusters=5, random_state=0).fit_predict(ppmi.values)
kmeans_label

In [None]:
# find correlation of subreddits with every other subreddit based on ppmi vector
# (corr() works column-wise, so the matrix is transposed to put subreddits in columns)
cmatrix = ppmi.transpose().corr()
cmatrix.head()

In [None]:
# create graph structure in json format and save
json_graph = create_json_graph(cmatrix,rowsum,kmeans_label)
# The file name carries the number of subreddits in the graph, e.g. cmatrix100.json
with open('cmatrix{}.json'.format(ntop), 'w') as fp:
    fp.write(json.dumps(json_graph))
print(json_graph)

In [None]:
# use D3 to create force-directed graph:
# fdg.html is embedded from the notebook's working directory
IFrame(src='fdg.html', width=1100, height=700)