In [131]:
# http://data-mining.philippe-fournier-viger.com/introduction-clustering-k-means-java-code/
# http://brandonrose.org/clustering
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
# https://www.codeproject.com/Articles/439890/Text-Documents-Clustering-using-K-Means-Algorithm

import pandas as pd
import numpy as np
import nltk
import re

from nltk.stem import PorterStemmer
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer

In [133]:
# Load the discussion-archive export. Despite the .csv suffix the file is
# tab-separated.
df = pd.read_csv('data/dbpedia-discussion-archive-dataset.csv', sep="\t")

# Single Porter stemmer instance shared by tokenize_and_stem below.
stemmer = PorterStemmer()

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the list of stemmed tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens that contain no
    ASCII letter are discarded; the rest are stemmed with the module-level
    ``stemmer`` and returned in their original order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Skip tokens with no ASCII letter at all (e.g. punctuation, bare numbers).
            if not re.search('[a-zA-Z]', token):
                continue
            stems.append(stemmer.stem(token))
    return stems

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=5, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['subject'])

CPU times: user 739 ms, sys: 7.41 ms, total: 747 ms
Wall time: 749 ms


In [134]:
# Observed in the vocabulary: 's, utf-8 and vs. are being kept as features.
# TODO: decide whether dbpedia-* tokens should be filtered out or split apart.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new method when it exists and fall
# back to the old one so the cell runs on either side of that change.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # The new method returns an ndarray; convert so the display stays a list.
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
feature_names

[u"'s",
 u'abstract',
 u'access',
 u'add',
 u'announc',
 u'announc openlink',
 u'announc openlink virtuoso',
 u'answer',
 u'api',
 u'arab',
 u'area',
 u'articl',
 u'avail',
 u'bad',
 u'base',
 u'basekb',
 u'best',
 u'broken',
 u'browser',
 u'bug',
 u'build',
 u'case',
 u'categori',
 u'chang',
 u'chapter',
 u'charact',
 u'citi',
 u'class',
 u'code',
 u'commun',
 u'concept',
 u'connect',
 u'contain',
 u'content',
 u'contribut',
 u'coordin',
 u'count',
 u'countri',
 u'creat',
 u'data',
 u'data dbpedia',
 u'data qualiti',
 u'data set',
 u'databas',
 u'dataset',
 u'date',
 u'dbpedia',
 u'dbpedia categori',
 u'dbpedia content',
 u'dbpedia data',
 u'dbpedia dataset',
 u'dbpedia dump',
 u'dbpedia endpoint',
 u'dbpedia extract',
 u'dbpedia extract framework',
 u'dbpedia freebas',
 u'dbpedia live',
 u'dbpedia local',
 u'dbpedia lookup',
 u'dbpedia lookup servic',
 u'dbpedia map',
 u'dbpedia ontolog',
 u'dbpedia queri',
 u'dbpedia releas',
 u'dbpedia resourc',
 u'dbpedia sparql',
 u'dbpedia sparq

In [135]:
# TODO: work out how to check convergence to a local optimum.

# Pairwise cosine distance between the rows of tfidf_matrix, computed as
# 1 - cosine similarity.
# NOTE(review): dist is not read by any cell visible in this notebook export;
# confirm it is still needed.
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [136]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=100)

%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 2.1 s, sys: 12.1 ms, total: 2.11 s
Wall time: 2.12 s


In [137]:
# `clusters` holds one label per row, in the order the rows had when
# tfidf_matrix was built from df (the order read from the file, whose default
# index is ascending). The export cell further down rebinds df to a copy sorted
# by cluster, so a plain positional assignment would attach labels to the wrong
# subjects if this cell were re-run after that one.
# Keying the labels by the index values in ascending order makes pandas align
# on the index instead, which gives the same result as positional assignment on
# a fresh run and stays correct after df has been re-ordered.
df['cluster'] = pd.Series(clusters, index=df.index.sort_values())

# Number of subjects assigned to each cluster, largest cluster first.
df['cluster'].value_counts()

7     231
12     37
17     32
9      29
18     27
6      26
4      24
13     23
22     22
21     22
15     22
58     21
24     19
51     19
57     19
92     19
30     18
8      18
75     18
82     17
65     17
48     16
11     16
71     15
2      15
39     15
35     15
20     14
26     13
98     13
     ... 
55      8
23      8
78      7
91      7
3       7
81      7
79      7
25      7
42      7
76      7
29      7
94      6
1       6
97      6
86      6
87      6
60      6
89      6
96      6
95      6
59      5
5       5
77      5
93      5
61      5
66      5
33      5
54      4
99      4
46      2
Name: cluster, dtype: int64

In [141]:
# Order the rows by cluster id (ascending) so each cluster's subjects are
# contiguous, then write just the subject/cluster pairs without the index.
df = df.sort_values(by='cluster', ascending=True)
df.loc[:, ['subject', 'cluster']].to_csv(
    'data/dbpedia-discussion-archive-cluster-results.csv',
    index=False,
)