In [87]:
# http://brandonrose.org/clustering
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

import pandas as pd
import numpy as np
import nltk
import re

from nltk.stem import PorterStemmer
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
# Load the DBpedia discussion archive; the file is read as tab-separated
# (sep="\t") even though it carries a .csv extension.
df = pd.read_csv('data/dbpedia-discussion-archive-dataset.csv', sep="\t")

# Single shared Porter stemmer instance, used by tokenize_and_stem.
# (A stray `stemmer.stem('DBpedia')` scratch call was removed here: its result
# was discarded and never displayed.)
stemmer = PorterStemmer()

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the list of stemmed tokens.

    The text is split into sentences and then into words with NLTK. Tokens
    that contain no ASCII letter (e.g. pure numbers or bare punctuation) are
    dropped; every remaining token is stemmed with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        Stems of the retained tokens, in their original order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Keep only tokens with at least one ASCII letter.
            if re.search('[a-zA-Z]', word) is not None:
                stems.append(stemmer.stem(word))
    return stems

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=5, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['subject'])

CPU times: user 802 ms, sys: 48.4 ms, total: 851 ms
Wall time: 1.01 s


In [89]:
# Observation: tokens such as "'s", "utf-8" and "vs." pass the tokenizer's
# letter filter and end up in the vocabulary.
# TODO: decide whether dbpedia-* terms should be filtered out or split into
# separate tokens.
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() -- confirm against the installed version.
tfidf_vectorizer.get_feature_names()

[u"'s",
 u'abstract',
 u'access',
 u'add',
 u'announc',
 u'announc openlink',
 u'announc openlink virtuoso',
 u'answer',
 u'api',
 u'arab',
 u'area',
 u'articl',
 u'avail',
 u'bad',
 u'base',
 u'basekb',
 u'best',
 u'broken',
 u'browser',
 u'bug',
 u'build',
 u'case',
 u'categori',
 u'chang',
 u'chapter',
 u'charact',
 u'citi',
 u'class',
 u'code',
 u'commun',
 u'concept',
 u'connect',
 u'contain',
 u'content',
 u'contribut',
 u'coordin',
 u'count',
 u'countri',
 u'creat',
 u'data',
 u'data dbpedia',
 u'data qualiti',
 u'data set',
 u'databas',
 u'dataset',
 u'date',
 u'dbpedia',
 u'dbpedia categori',
 u'dbpedia content',
 u'dbpedia data',
 u'dbpedia dataset',
 u'dbpedia dump',
 u'dbpedia endpoint',
 u'dbpedia extract',
 u'dbpedia extract framework',
 u'dbpedia freebas',
 u'dbpedia live',
 u'dbpedia local',
 u'dbpedia lookup',
 u'dbpedia lookup servic',
 u'dbpedia map',
 u'dbpedia ontolog',
 u'dbpedia queri',
 u'dbpedia releas',
 u'dbpedia resourc',
 u'dbpedia sparql',
 u'dbpedia sparq

In [90]:
# Open question: how to check convergence to a local optimum?

from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between all rows of the TF-IDF matrix
# (1 - cosine similarity).
dist = 1 - cosine_similarity(tfidf_matrix)

In [116]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=100)

%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 2.25 s, sys: 21.5 ms, total: 2.27 s
Wall time: 2.31 s


In [117]:
# Attach the k-means label of each row to the frame as a new 'clusters' column
# (this mutates df in place).
df['clusters'] = clusters
# Number of rows assigned to each cluster, largest cluster first.
df['clusters'].value_counts()

9     243
12     37
69     29
30     27
49     27
82     24
70     23
32     23
15     22
23     22
58     22
16     21
62     21
17     21
4      20
97     20
54     20
8      18
28     17
99     17
40     16
31     16
27     15
87     15
74     14
65     14
85     14
41     13
94     13
39     12
     ... 
38      8
44      8
83      8
93      8
48      8
29      7
73      7
78      7
98      7
3       7
53      7
67      7
2       6
11      6
0       6
33      6
43      6
63      6
92      5
90      5
96      5
71      5
79      5
51      4
59      4
14      3
13      3
20      3
21      3
84      2
Name: clusters, dtype: int64

In [120]:
# Inspect the subject lines of every message assigned to cluster id 84.
df.loc[df['clusters'] == 84, 'subject'].tolist()

['semantic in URIs, was:dbpedia-links: Recommendation for predicate rdrel:manifestationOfWork" ?"',
 'semantic in URIs, was: dbpedia-links: Recommendation for predicate rdrel:manifestationOfWork" ?"']