In [100]:
%%time 

tweet_ids = []
word_ids = []
word_to_id = {}
i=0
with open("askacop.csv", "r") as f:
    for line in f:
        for w in line.rstrip().split(","):
            if w not in word_to_id:
                word_to_id[w] = len(word_to_id)
        tweet_ids.append(i)
        word_ids.append(word_to_id[w])
        i += 1

CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 155 ms


In [101]:
%%time
  
import numpy as np
from scipy.sparse import csr_matrix 

adj = csr_matrix( 
    ( np.ones((len(word_ids),)), (np.array(word_ids),np.array(tweet_ids)) ), 
    shape=(len(word_to_id), i) )

row_sums = adj.sum(axis=1).A1
words = list(range(len(word_to_id)))
for w in word_to_id:
    words[word_to_id[w]] = w
words = np.array(words)

adj = adj[np.where(row_sums>1)]
words = words[np.where(row_sums>1)]
row_sums = adj.sum(axis=1).A1
print(adj.shape)

(689, 15791)
CPU times: user 8.51 ms, sys: 0 ns, total: 8.51 ms
Wall time: 15.9 ms


In [102]:
%%time

from sklearn.decomposition import TruncatedSVD 
embedded_coords = TruncatedSVD(n_components=120).fit_transform(adj[:,:15000]) 

from sklearn.preprocessing import normalize 
embedded_coords_l1 = normalize(embedded_coords, norm='l1')

from scipy.stats import rankdata
embedded_ranks_l1 = np.array([rankdata(c) for c in embedded_coords_l1.T]).T

CPU times: user 2.78 s, sys: 17.3 ms, total: 2.8 s
Wall time: 3.05 s


In [103]:
%%time

from sklearn.cluster import KMeans
clusters = KMeans(n_clusters=20).fit_predict(embedded_ranks_l1)

CPU times: user 3.18 s, sys: 7.71 ms, total: 3.19 s
Wall time: 3.27 s


In [104]:
row_selector = np.where(row_sums>0)
row_selector[0].shape

(689,)

In [105]:
%%time

from sklearn.manifold import TSNE
xycoords = TSNE().fit_transform(embedded_coords_l1[row_selector])

CPU times: user 2.9 s, sys: 222 ms, total: 3.13 s
Wall time: 3.26 s


In [106]:
import bokeh.plotting as bp
from bokeh.models import HoverTool 
bp.output_notebook()

In [107]:

plot_data = bp.ColumnDataSource(data=dict( 
    word = words[row_selector],
    x = xycoords[:,0],
    y = xycoords[:,1],
    r = np.log2(row_sums[row_selector])/10,
))



p = bp.figure(plot_width=900, plot_height=900, title="Words in Tweets Map via t-SNE",
       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
       x_axis_type=None, y_axis_type=None, min_border=1)
p.scatter("x","y",radius="r", source=plot_data)
p.select(dict(type=HoverTool)).tooltips = [("","@word")]
bp.show(p)