# Cluster articles according to TFIDF

In [14]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix, vstack

import nbimporter
from preprocessed_data_reader import ReaderPreprocessedData

## Get training data

In [2]:
dataset = ReaderPreprocessedData.read_data("../preprocessed")

## Train KMeans

In [4]:
def grouper(n_elements_in_batch, l):
    return [l[i:i + n_elements_in_batch] for i in range(0, len(l), n_elements_in_batch)]

In [5]:
idf = pd.read_csv("../resources/wiki-30k-10-IDF.csv")
idf = idf.set_index('term')

In [6]:
for i,batch in enumerate(grouper(100, dataset)):
    rows = [sample["tfidf"]["logtfidf"] for sample in batch]
    if i == 0:
        res_matrix = csr_matrix(pd.DataFrame(rows, columns=idf.index).fillna(value=0))
    else:
        delta_matrix = csr_matrix(pd.DataFrame(rows, columns=idf.index).fillna(value=0))
        res_matrix = vstack([res_matrix, delta_matrix])
    print("{0}/{1}".format(i + 1, len(dataset) // 100 + 1))
    
res_matrix

1/27
2/27
3/27
4/27
5/27
6/27
7/27
8/27
9/27
10/27
11/27
12/27
13/27
14/27
15/27
16/27
17/27
18/27
19/27
20/27
21/27
22/27
23/27
24/27
25/27
26/27
27/27


<2628x87709 sparse matrix of type '<class 'numpy.float64'>'
	with 606421 stored elements in Compressed Sparse Row format>

In [41]:
kmeans = KMeans(n_clusters=100).fit(res_matrix)

## Test model

Assign each sample to a cluster

In [42]:
for i,batch in enumerate(grouper(100, dataset)):
    rows = [sample["tfidf"]["logtfidf"] for sample in batch]
    matrix = csr_matrix(pd.DataFrame(rows, columns=idf.index).fillna(value=0))
    predictions = kmeans.predict(matrix)
    for sample, pred in zip(batch, predictions):
        sample["cluster-tfidf"] = pred
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


In [43]:
from_cluster_to_article_indices = {}
for i,sample in enumerate(dataset):
    cluster = sample["cluster-tfidf"]
    if cluster not in from_cluster_to_article_indices:
        from_cluster_to_article_indices[cluster] = [i]
    else:
        from_cluster_to_article_indices[cluster] += [i]

In [44]:
np.random.choice([1,22,3], size=2, replace=False)

array([3, 1])

In [45]:
for cluster, articles in from_cluster_to_article_indices.items():
    random_articles = np.random.choice(articles, size=5, replace=True)
    print("Cluster {0} has {1} elements:".format(cluster, len(articles)))
    for art_ind in random_articles:
        sample = dataset[art_ind]
        print("\t{0}".format(sample["title"]))
    print("--------------")

Cluster 0 has 1 elements:
	A Week In North Carolina On A $53,300 Salary
	A Week In North Carolina On A $53,300 Salary
	A Week In North Carolina On A $53,300 Salary
	A Week In North Carolina On A $53,300 Salary
	A Week In North Carolina On A $53,300 Salary
--------------
Cluster 1 has 44 elements:
	I Asked 400 Undergrads to Perform 90 Minutes of Kindness for No Reward. Here's What Happened
	Bihar PSC SYLLABUS
	Current Affairs September 2018
	Current Affairs February 2018
	I Asked 400 Undergrads to Perform 90 Minutes of Kindness for No Reward. Here's What Happened
--------------
Cluster 2 has 2 elements:
	Selbstliebe: Ich hatte mit über 100 Männern Sex & das ist okay
	Selbstliebe: Ich hatte mit über 100 Männern Sex & das ist okay
	„Ich hatte ständig Stoppeln“: Warum sich Frauen wirklich die Haare entfernen
	„Ich hatte ständig Stoppeln“: Warum sich Frauen wirklich die Haare entfernen
	Selbstliebe: Ich hatte mit über 100 Männern Sex & das ist okay
--------------
Cluster 3 has 1 elements:
	