In [None]:
import snowballstemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.cluster import AffinityPropagation


class LemmatizedTfidfVectorizer(TfidfVectorizer):
    """
    TF-IDF vectorizer that reduces every token to its Snowball English stem.

    Despite the class name, the reduction is stemming (``stemWord``), not
    dictionary-based lemmatization.

    The constructor is inherited unchanged from ``TfidfVectorizer``, so it
    accepts exactly the same arguments. It is deliberately not overridden
    with ``*args, **kwargs``: scikit-learn inspects the ``__init__``
    signature to implement ``get_params()`` / ``clone()`` / ``repr()`` and
    rejects constructors that take varargs.
    """

    def build_analyzer(self):
        """
        Return a callable that turns one document into a list of stemmed tokens.

        The analyzer built by the parent class runs first; each token it
        yields is then passed through the Snowball English stemmer.
        """
        analyzer = super().build_analyzer()
        # Built here instead of in __init__ so that no state outside the
        # declared constructor parameters is stored on the estimator.
        stemmer = snowballstemmer.stemmer('English')

        def lemmatize(phrase):
            return [stemmer.stemWord(word) for word in analyzer(phrase)]

        return lemmatize

In [None]:
import random

# An eligible query is kept only when the random draw exceeds this value,
# i.e. roughly 15% of the eligible queries are sampled.
SAMPLE_THRESHOLD = .85
# Fixed seed so that Restart & Run All samples the same queries every time.
SAMPLE_SEED = 0

# Private generator: the sample is reproducible without touching the global
# `random` state.
sampler = random.Random(SAMPLE_SEED)
keywords = []
with open('../data/queriesu.txt') as f:
    for line in f:
        # split() without an argument also discards the trailing newline, so
        # the last token of a line is judged on its real length and content.
        tokens = line.split()
        # skip lines that consist only of digits, like phone numbers
        if "".join(tokens).isdigit():
            continue
        # remove single characters and lone numbers
        kept = [tok for tok in tokens if len(tok) > 1 and not tok.isdigit()]
        # keep multi-word queries only; the random draw happens last so it is
        # consumed only for eligible queries
        if len(kept) > 1 and sampler.random() > SAMPLE_THRESHOLD:
            keywords.append(" ".join(kept))
print(len(keywords))
print(keywords[:30])

In [None]:
%%time
vec = LemmatizedTfidfVectorizer(stop_words=ENGLISH_STOP_WORDS.union(
    ['australia','australian','government','of',"www","gov","au","have","any"]))
vectorized = vec.fit_transform(keywords)

In [None]:
%%time
af = AffinityPropagation(max_iter=2, convergence_iter=2).fit(vectorized)
print(af.n_iter_)

In [None]:
%%time
clusters = {}
for keyword, cluster_id in zip(keywords, af.labels_):
    clusters.setdefault(cluster_id, []).append(keyword)
print(len(clusters),"clusters","\n")
i = 0
for id, items in clusters.items():
    if len(items) > 5 and len(items) < 20:
        i += 1
        print('\n'.join(items))
        print()
    else:
        if len(items) > 4:
            print("!!! ", len(items), items[0])
            print()
print("displayed",i,"clusters")