In [158]:
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import HashingVectorizer
from os import listdir
from sklearn.neighbors import DistanceMetric
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import KMeans

In [2]:
path = "./Data/"

In [3]:
def make_corpus_for_one(data):
    bodies = []
    for line in data:
        parts_span = line.split('"<span')
        parts_mark_title = parts_span[0].split()
        mark = int(parts_mark_title[0][0])
        if mark in {1, 5}:
            continue  # seems that such reviews are not really informative
        body_html = "<span" + parts_span[1][:-1]
        body_text = bs(body_html).text
        bodies.append(body_text)
    return bodies  # take only bodies

In [103]:
def make_corpus(path, n_max_files=2, n_features=1000, n_max_elements=5000):
    bookname = []
    X = []
    y = []
    hv = HashingVectorizer(n_features=n_features)
    for filename in [f for f in listdir(path) if f.endswith('.csv')][:n_max_files]:
        with open(path + filename, 'r') as file:
            data = file.read().splitlines()[:n_max_elements]
        bodies = make_corpus_for_one(data) # todo so far take only bodies
        X.append(hv.transform(bodies).toarray())
        y.append(np.repeat(len(bookname), len(bodies)))
        bookname.append(filename.rstrip('.csv'))
    return np.concatenate(X), np.concatenate(y), bookname

In [177]:
X, y, bookname = make_corpus(path, n_max_files=2, n_features=1000, n_max_elements=5000)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


### Chosen clustering algorithms and metrics:

In [182]:
random_state = 132
knn = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)

euclidean_distances = metrics.pairwise.pairwise_distances(X, metric='euclidean')
manhattan_distances = metrics.pairwise.pairwise_distances(X, metric='manhattan')
l1_distances  = metrics.pairwise.pairwise_distances(X, metric='l1')

In [181]:
db_euclidean = DBSCAN(metric="precomputed").fit(euclidean_distances)
db_manhattan = DBSCAN(metric="precomputed").fit(manhattan_distances)
db_l1 = DBSCAN(metric="precomputed").fit(l1_distances)

### Test perfomance

In [192]:
def check_clustering_perfomance(labels_true, labels):
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))

In [193]:
labels_true = y

In [194]:
labels_knn = knn
labels_db_euclidean = db_euclidean.labels_
labels_db_manhattan = db_manhattan.labels_
labels_db_l1 = db_l1.labels_

### Perfomance is really bad and strange

In [195]:
check_clustering_perfomance(labels_true, labels_knn)

Estimated number of clusters: 2
Homogeneity: 0.005
Completeness: 0.005
V-measure: 0.005
Adjusted Rand Index: -0.004
Adjusted Mutual Information: 0.005


In [196]:
check_clustering_perfomance(labels_true, labels_db_euclidean)

Estimated number of clusters: 2
Homogeneity: 0.033
Completeness: 0.158
V-measure: 0.054
Adjusted Rand Index: 0.043
Adjusted Mutual Information: 0.032


In [197]:
check_clustering_perfomance(labels_true, labels_db_l1)

Estimated number of clusters: 2
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000


In [198]:
check_clustering_perfomance(labels_true, labels_db_manhattan)

Estimated number of clusters: 2
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
