# Cluster articles according to TF-IDF

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix, vstack

import nbimporter
from preprocessed_data_reader import ReaderPreprocessedData

Importing Jupyter notebook from preprocessed_data_reader.ipynb
Importing Jupyter notebook from utils_os.ipynb


## Get training data

In [None]:
# Load the preprocessed articles from ../preprocessed. Later cells slice this
# object, take len() of it, and read the "tfidf", "url" and "title" entries of
# each sample.
dataset = ReaderPreprocessedData.read_data("../preprocessed")

## Train KMeans

In [None]:
def grouper(n_elements_in_batch, l):
    """Split a sliceable sequence into consecutive batches.

    Parameters
    ----------
    n_elements_in_batch : int
        Maximum number of elements per batch (used as the ``range`` step).
    l : sequence
        Anything supporting ``len()`` and slicing.

    Returns
    -------
    list
        Slices of ``l`` in order; the last one may be shorter than
        ``n_elements_in_batch``. An empty ``l`` yields an empty list.
    """
    batches = []
    for start in range(0, len(l), n_elements_in_batch):
        stop = start + n_elements_in_batch
        batches.append(l[start:stop])
    return batches

In [None]:
# IDF table indexed by its 'term' column; idf.index is later used as the
# column set of the per-article feature frames.
idf = pd.read_csv("../resources/wiki-30k-10-IDF.csv").set_index('term')

In [None]:
# Build the sparse feature matrix (one row per article, one column per IDF
# term) batch by batch, so only one dense batch frame is alive at a time.
BATCH_SIZE = 100
# Ceiling division: the number of batches grouper() actually produces.
n_batches = (len(dataset) + BATCH_SIZE - 1) // BATCH_SIZE

batch_matrices = []
for i, batch in enumerate(grouper(BATCH_SIZE, dataset)):
    # NOTE(review): the 1.000001 offset presumably keeps the log argument
    # above 1 so present terms stay non-zero - confirm the intent.
    rows = [np.log(sample["tfidf"]["logtfidf"] + 1.000001) for sample in batch]
    # Terms absent from an article come out as NaN after alignment on
    # idf.index and are stored as 0.
    batch_frame = pd.DataFrame(rows, columns=idf.index).fillna(value=0)
    batch_matrices.append(csr_matrix(batch_frame))
    print("{0}/{1}".format(i + 1, n_batches))

if not batch_matrices:
    raise ValueError("dataset is empty: no TF-IDF rows to stack")

# Stack once at the end; stacking inside the loop re-copies the growing
# matrix on every iteration.
res_matrix = vstack(batch_matrices, format="csr")
res_matrix

In [None]:
from sklearn.decomposition import TruncatedSVD # PCA does not support sparse input
from sklearn.manifold import TSNE

def to_tsne(x, random_state=None):
    """Project samples to 2 dimensions: TruncatedSVD to 50, then t-SNE to 2.

    Parameters
    ----------
    x : matrix
        Feature matrix passed to ``TruncatedSVD.fit_transform``; TruncatedSVD
        is used instead of PCA because the input may be sparse.
    random_state : int, RandomState instance or None, optional
        Forwarded to both TruncatedSVD and TSNE. The default ``None`` keeps
        the original unseeded behaviour; pass an int for repeatable output.

    Returns
    -------
    The 2-component embedding returned by ``TSNE.fit_transform``.
    """
    reduced = TruncatedSVD(n_components=50, random_state=random_state).fit_transform(x)
    embedded = TSNE(n_components=2, random_state=random_state).fit_transform(reduced)
    return embedded

In [None]:
# 2-D embedding of every article; this is what KMeans is fitted on and what
# the scatter plot draws.
# NOTE(review): no seed is passed here, so y presumably differs between runs.
y = to_tsne(res_matrix)

In [None]:
# Fit 15 clusters on the 2-D embedding. random_state is fixed so the centroid
# initialisation, and therefore the cluster ids, are repeatable for a given y.
kmeans = KMeans(n_clusters=15, random_state=0).fit(y)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Set up plotly's offline notebook mode.
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected=True)

# Palette of 27 RGB strings: every combination of the levels 0, 100 and 200
# for the red, green and blue channels (blue varies fastest).
colors = [
    "rgb({0},{1},{2})".format(r, g, b)
    for r in (0, 100, 200)
    for g in (0, 100, 200)
    for b in (0, 100, 200)
]

def get_scatter(sv_list, colors, color_index):
    """Build one plotly scatter trace for a group of (sample, vector) pairs.

    Parameters
    ----------
    sv_list : list of (sample, vector)
        Must be non-empty. Each ``sample`` is read through ``sample["title"]``
        (hover text) and ``sample["url"]``; each ``vector`` supplies x as its
        first component and y as its second.
    colors : sequence of str
        Colour palette.
    color_index : int
        Palette position for this trace. It wraps around the palette length,
        so more traces than colours reuse colours instead of raising
        IndexError.

    Returns
    -------
    go.Scatter
        Marker trace named after the third "/"-separated piece of the first
        sample's URL (the host for URLs shaped like ``scheme://host/...``).
    """
    # max(..., 1) keeps the original IndexError for an empty palette instead
    # of turning it into a ZeroDivisionError.
    marker_color = colors[color_index % max(len(colors), 1)]
    first_sample = sv_list[0][0]
    return go.Scatter(
        x=[vector[0] for _, vector in sv_list],
        y=[vector[1] for _, vector in sv_list],
        mode='markers',
        text=[sample["title"] for sample, _ in sv_list],
        marker=dict(
            size=8,
            opacity=0.8,
            color=marker_color,
            showscale=False
        ),
        name=first_sample["url"].split("/")[2]
    )

def scatter_plot(dataset, y, colors, kmeans):
    """Scatter plot of samples by their 2 dimensions, one trace per domain.

    Parameters
    ----------
    dataset : iterable of samples
        Each sample is read through ``sample["url"]`` here and through
        ``sample["title"]`` / ``sample["url"]`` in ``get_scatter``.
    y : iterable of 2-component vectors
        Coordinates paired positionally with ``dataset``.
    colors : sequence of str
        Palette handed to ``get_scatter``.
    kmeans : fitted estimator
        Only ``kmeans.cluster_centers_`` is read; the centres are drawn as
        large black markers in a trace named "cluster".

    Returns
    -------
    None. The figure is handed to ``plot`` with
    ``filename='Sentence encode.html'``.
    """
    # Group (sample, vector) pairs by the third "/"-separated piece of the URL,
    # keeping first-seen order so colour assignment matches trace order.
    by_domain = {}
    for sample, vector in zip(dataset, y):
        domain = sample["url"].split("/")[2]
        by_domain.setdefault(domain, []).append((sample, vector))

    data = [
        get_scatter(sv_list, colors, color_index)
        for color_index, sv_list in enumerate(by_domain.values())
    ]

    centers = kmeans.cluster_centers_
    data.append(go.Scatter(
        x=[cc[0] for cc in centers],
        y=[cc[1] for cc in centers],
        mode='markers',
        marker=dict(
            size=40,
            opacity=0.5,
            color="rgb(0,0,0)",
            showscale=False
        ),
        name="cluster"
    ))

    layout = go.Layout(
        yaxis=dict(zeroline=False),
        xaxis=dict(zeroline=False)
    )
    fig = go.Figure(data=data, layout=layout)
    # NOTE(review): the file name does not mention TF-IDF; it is kept as-is
    # because other tooling may rely on it - confirm before renaming.
    plot(fig, filename='Sentence encode.html')

In [None]:
# Draw one trace per URL domain plus the KMeans centres; the figure is passed
# to plotly's plot() with the file name 'Sentence encode.html'.
scatter_plot(dataset, y, colors, kmeans)

## Test model

Assign each sample to its KMeans cluster, then print a few randomly chosen article titles per cluster as a sanity check.

In [22]:
# Predict every label in one vectorised call instead of one kmeans.predict()
# call per sample, then store each label on its sample.
cluster_labels = kmeans.predict(y)
assert len(cluster_labels) == len(dataset), "one embedding row is expected per sample"
for sample, cluster_label in zip(dataset, cluster_labels):
    sample["cluster-tfidf"] = cluster_label

In [23]:
# Invert the sample -> cluster assignment: map each cluster id to the list of
# dataset positions of the samples assigned to it, in dataset order.
from_cluster_to_article_indices = {}
for article_index, article in enumerate(dataset):
    from_cluster_to_article_indices.setdefault(article["cluster-tfidf"], []).append(article_index)

In [24]:
# Print up to SAMPLES_PER_CLUSTER randomly chosen titles per cluster.
# Sampling is without replacement so the same article is never listed twice;
# the size is capped at the cluster size so small clusters still work.
SAMPLES_PER_CLUSTER = 5
for cluster, articles in from_cluster_to_article_indices.items():
    n_shown = min(SAMPLES_PER_CLUSTER, len(articles))
    random_articles = np.random.choice(articles, size=n_shown, replace=False)
    print("Cluster {0} has {1} elements:".format(cluster, len(articles)))
    for art_ind in random_articles:
        sample = dataset[art_ind]
        print("\t{0}".format(sample["title"]))
    print("--------------")

Cluster 0 has 241 elements:
	Ephemeral Belief?
	The Paradox of the Preface
	Skepticism, Godzilla, and the Artificial Computerized Many-Branching You
	Can There Be Non-Obvious Illusions?
	Dreaming, Belief, and Emotion (by guest blogger Jonathan Ichikawa )
--------------
Cluster 1 has 209 elements:
	Twitter’s user growth goes nowhere and the stock is collapsing
	Understanding The Nuances of Instagram Data Transparency Policies
	Twitter now says 1.4 million people interacted with Russian trolls during 2016 presidential campaign
	5 Ways to Keep Your Information Secure in the Cloud
	Jack Dorsey says Twitter is keeping its 140-character limit, but maybe don’t get too excited
--------------
Cluster 2 has 352 elements:
	We Should Be Talking About Money With Our Partners — Here’s Where To Start
	5 email myths you can stop believing now
	14 Galentine's Day Gifts To Show Your Friends Some Love
	How to Sell Gum at School
	Nitroolefins – The Crying Game
--------------
Cluster 3 has 348 elements:
	J