# Search query analysis via clustering (DBSCAN; Affinity Propagation was an earlier alternative)

## TODO
* Load data from the raw data files, removing duplicates and including other data attributes
    * include domain column
    * include popularity measure of views and clicks
* HTML report including hiding/collapsing large clusters, sort by popularity and tags for domains
* Use part-of-speech tagging to find all the verbs/nouns
* Use API search to find pages on the domain, and other domains on the first page of results

In [None]:
import snowballstemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
from sklearn.cluster import AffinityPropagation, DBSCAN
from collections import Counter, OrderedDict
def find_modal_substring(strings):
    """Return a short label built from the most "popular" substring of *strings*.

    Every distinct substring (character n-gram) of each input string is
    counted once per string it occurs in.  Each substring is scored as
    ``occurrences * length`` and, for every distinct score, only the longest
    substring is kept.  Among the five highest scores the longest substring
    wins; it is then widened by at most one character on each side so that it
    ends on word boundaries in the newline-joined inputs.

    Args:
        strings: Iterable of ``str``.  It is materialised into a list because
            it is traversed twice (once for counting, once for the regex).

    Returns:
        The widened match (with one leading "." removed) when the regex search
        succeeds, otherwise the stripped modal n-gram.  An empty string is
        returned when the inputs produce no n-grams at all, i.e. there are no
        strings or every string is shorter than two characters.
    """
    # Imported locally so the function does not depend on another cell having
    # imported ``re`` first.
    import re
    from itertools import chain
    from typing import Iterator, List

    def ngram(seq: str, n: int) -> Iterator[str]:
        """Yield every length-``n`` slice of ``seq`` from left to right."""
        return (seq[i: i + n] for i in range(len(seq) - n + 1))

    def allngram(seq: str, minn=1, maxn=None) -> List[str]:
        """Return the distinct n-grams of ``seq`` in a reproducible order.

        Lengths run from ``minn`` up to, but not including, ``maxn`` (or
        ``len(seq)`` when ``maxn`` is falsy), so the full string itself is not
        generated by default.
        """
        lengths = range(minn, maxn) if maxn else range(minn, len(seq))
        ngrams = (ngram(seq, n) for n in lengths)
        # dict.fromkeys de-duplicates while keeping generation order; a set
        # would make tie-breaking below depend on the interpreter's hash seed.
        return list(dict.fromkeys(chain.from_iterable(ngrams)))

    strings = list(strings)
    counts = Counter(chain.from_iterable(map(allngram, strings)))
    if not counts:
        # Nothing to rank; callers can treat "" as "no usable label".
        return ""

    # score -> longest substring with that score (first seen wins on ties)
    large_counts = {}
    for sstr, occurrences in counts.items():
        key = occurrences * len(sstr)
        if len(sstr) > len(large_counts.get(key, "")):
            large_counts[key] = sstr

    # Scores are unique dict keys, so this orders purely by score, descending.
    top_candidates = [
        sstr for _, sstr in sorted(large_counts.items(), reverse=True)[:5]
    ]
    modal_ngram = max(top_candidates, key=len).strip()

    # Allow one extra character on either side so a substring that stops just
    # short of a word edge is completed to the word boundary.
    modal_words_search = re.search(
        r"\b.?" + re.escape(modal_ngram) + r".?\b", '\n'.join(strings))
    modal_words = modal_words_search.group(0).strip() if modal_words_search else None
    if modal_words and modal_words.startswith("."):
        modal_words = modal_words[1:]
    return modal_words or modal_ngram
class LemmatizedTfidfVectorizer(TfidfVectorizer):
    """
    TF-IDF vectorizer whose analyzer stems every token.

    Despite the class name, tokens are reduced with the Snowball English
    stemmer from ``snowballstemmer``, not with a lemmatizer.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``TfidfVectorizer`` and create the stemmer."""
        super().__init__(*args, **kwargs)
        self.stemmer = snowballstemmer.stemmer('English')

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens.

        The parent class's analyzer is applied first; every token it returns
        is then passed through ``self.stemmer.stemWord``.
        """
        analyzer = super().build_analyzer()

        def lemmatize(phrase):
            return [self.stemmer.stemWord(word) for word in analyzer(phrase)]

        return lemmatize

In [None]:
import random
import re
# Build the set of distinct, normalised search queries from the raw file.
multi_space = re.compile(r"\s\s+")
keywords = set()
with open('../data/queriesu.txt') as f:
    for line in f:
        # Drop the line terminator before splitting; otherwise the last token
        # is e.g. "5\n", which is two characters long and not a digit string,
        # so it would slip through the filter below.
        tokens = line.strip().split(" ")
        # remove single characters and lone numbers
        line = " ".join(x if len(x) > 1 and not x.isdigit() else "" for x in tokens).strip()
        line = multi_space.sub(" ", line)
        # keep only letters, digits, spaces and dots
        line = "".join(e for e in line if e.isalnum() or e == " " or e == ".")
        if line.startswith("."):
            line = line[1:]
        # Dropping characters can leave doubled or dangling spaces, which would
        # store the same query more than once in the set.
        line = multi_space.sub(" ", line).strip()
        # discard number-only lines such as phone numbers
        if line.replace(" ", "").isdigit():
            continue
        if len(line) > 4:
            keywords.add(line)
print(len(keywords))
print(list(keywords)[:30])

In [None]:
%%time
# Extend the imported English stop-word list with words specific to this
# query set, then turn every keyword into a TF-IDF vector of stemmed tokens.
stop_words = ENGLISH_STOP_WORDS | {
    'australia',
    'australian',
    'government',
    'of',
    "www",
    "gov",
    "au",
    "have",
    "any",
}
vec = LemmatizedTfidfVectorizer(stop_words=stop_words)
vectorized = vec.fit_transform(keywords)

In [None]:
%%time
# Cluster the TF-IDF vectors with DBSCAN (neighbourhood radius 0.5, at least
# two samples per neighbourhood).  Alternatives that were tried are kept below
# for reference:
#af = AffinityPropagation(max_iter=2, convergence_iter=2).fit(vectorized)
# import hdbscan
# clusterer = hdbscan.HDBSCAN()
# clustering = clusterer.fit(vectorized)
clustering = DBSCAN(min_samples=2, eps=0.5)
clustering.fit(vectorized)  # fit() returns the estimator itself

In [None]:
%%time
# Stop words removed from cluster members before a label is derived.  The
# empty list is a hook for adding label-specific words.
stop_words = ENGLISH_STOP_WORDS.union(
    [])

# Group keywords by the cluster label assigned to each of them.  This relies on
# `keywords` iterating in the same order as when it was vectorised.
clusters = {}
for keyword, cluster_id in zip(keywords, clustering.labels_):
    clusters.setdefault(cluster_id, []).append(keyword)
print(len(clusters),"clusters","\n")

labelled_clusters = {}
i = 0  # number of clusters that made it into the report
for cluster_id, items in clusters.items():
    # scikit-learn's DBSCAN files every noise sample under label -1; those
    # keywords are unrelated to each other and must not be shown as a cluster.
    if cluster_id == -1:
        continue
    # Only medium-sized clusters are reported.
    if not 4 < len(items) < 100:
        continue
    i += 1
    clean_items = [
        " ".join(
            lemmatizer.lemmatize(w)
            for w in item.lower().split(" ")
            if w not in stop_words
        )
        for item in items
    ]
    try:
        title = find_modal_substring(clean_items)
    except ValueError:
        # Raised when nothing usable is left after stop-word removal; fall
        # back to the first member below instead of aborting the whole run.
        title = ""
    if len(title) < 3:
        title = items[0]
    # Two clusters can end up with the same label; merge them rather than
    # letting the later one silently overwrite the earlier one.
    labelled_clusters.setdefault(title, []).extend(items)
labelled_clusters = {
    title: sorted(members) for title, members in labelled_clusters.items()
}
print("displayed",i,"clusters")
ordered_clusters = OrderedDict(sorted(labelled_clusters.items(), key=lambda t: t[0])).items()
#for label, cluster in ordered_clusters:
#     print("# ",label)
#     print('\n'.join(cluster))
#     print()

In [None]:
%%time
import string
from yattag import Doc, indent

# The three helpers returned by yattag's Doc().tagtext(); they are used further
# down to build the generated part of the report.
doc, tag, text = Doc().tagtext()
# Static HTML written verbatim before the generated body: the page head with
# the design-system stylesheet, inline CSS for the three-column ".letter"
# layout, the site banner, and the opening tags of the content area
# (<main>, <section>, container <div> and row <div>), which are left open
# here for `footer` to close.
header="""

<html>
  <head>
    <link rel="stylesheet" href="https://designsystem.gov.au/assets/css/style.css" />
    <style>
    .letter {
      column-count: 3;
  column-gap: 40px;
  column-fill: balance-all;
  column-rule: 1px solid black;
  padding-bottom: 20px;
}
.letter ul {
margin-top: 0 !important;
}
.letter li {
    break-inside: avoid-column;
    -webkit-column-break-inside: avoid;
}
h3 {
  -webkit-column-span: all; /* Chrome, Safari, Opera */
  column-span: all;
}
    </style>
  </head>
  <body class="au-grid au-body">
  <header class="au-header au-header--dark" role="banner">
    <div class="container">
        <div class="row">
            <div class="col-md-9">
                <a class="au-header__brand" href="#">
                    <img
                            class="au-header__brand-image"
                            alt="Australian Government"
                            src="https://designsystem.gov.au/assets/img/header-logo-agov.png"
                    />
                    <div class="au-header__text">
                        <h1 class="au-header__heading">Observatory</h1>
                        <div class="au-header__subline">
                            To quantify interactions with every government service
                        </div>
                    </div>
                </a>
            </div>
        </div>
    </div>
    <br/>
</header>
  	<main id="content"  class="au-body">
		<!--CONTENT-->
		<section>
			<div class="container-fluid">
				<div class="row">
"""
# Static HTML written verbatim after the generated body.  It first closes the
# row <div>, the container <div> and the <section> that `header` leaves open,
# then renders the site footer and closes <main>, <body> and <html>.
# The copyright sign is written as the &copy; entity so the literal stays
# pure ASCII and renders correctly whatever encoding the file is saved in.
footer="""
</div></div></section>
<footer
        class="au-footer footer au-body au-body--dark au-footer--dark "
        role="contentinfo"
>
    <div class="container-fluid">
        <nav class="au-footer__navigation " aria-label="footer">
            <div class="col-md-offset-1 col-md-8 col-md-push-3">

                <ul class="au-link-list  au-link-list--inline">
                    <li><a href="/support">About the Observatory</a></li>
                    <li>
                        <a href="https://www.dta.gov.au/privacy-statement">Privacy</a>
                    </li>
                    <li><a href="/support">Training and Support</a></li>
                    <li><a href="/support">Contact</a></li>
                </ul>
                <div class="au-footer__end">
                    <div class="footer__content footer__legal">
                        <p>
                            &copy; Commonwealth of Australia. With the exception of the
                            Commonwealth Coat of Arms and where otherwise noted, this work
                            is licensed under the
                            <a
                                    href="https://github.com/govau/design-system-components/blob/master/LICENSE"
                                    rel="external"
                            >MIT license</a
                            >
                        </p>
                    </div>
                </div>
            </div>
            <div class="col-md-3 col-md-pull-9 footer__logo">
                <p class="footer__affiliate">
              <span>An initiative of the </span
              ><span>Digital Transformation Agency </span
                ><span class="footer__affiliate-link"
                ><a
                        class="au-cta-link  au-cta-link--dark"
                        href="https://www.dta.gov.au/our-projects"
                >More projects</a
                ></span
                >
                </p>
            </div>
        </nav>
    </div>
</footer>
</main>
</body></html>
"""
from itertools import groupby

# Bucket consecutive (title, members) pairs by the first character of the
# title.  `ordered_clusters` is sorted by title, so each character forms a
# single run.
sections = [
    (letter, list(entries))
    for letter, entries in groupby(ordered_clusters, key=lambda entry: entry[0][0])
]

with tag('div', klass="col-md-12"):
    doc.stag('br')
    with tag('h1'):
        text("Search Queries")
    with tag('section', klass='au-callout'):
        text('Browse search queries that lead to government websites grouped by letter and a common keyword')
    # A <div> rather than a <p>: a paragraph may not contain a heading.
    with tag('div', id='letters'):
        with tag("h2"):
            text("Jump to letter")
        # Link only to sections that actually exist in this report.
        for letter in dict.fromkeys(letter for letter, _ in sections):
            with tag('a', href="#"+letter):
                text(letter)
    for letter, entries in sections:
        doc.stag('hr')
        # Empty anchor that the "Jump to letter" links point at.
        with tag('a', id=letter):
            text("")
        with tag('h2'):
            text(letter)
        for title, cluster in entries:
            with tag('div', klass="letter"):
                with tag('h3'):
                    text(title)
                with tag("ul"):
                    for item in cluster:
                        with tag("li"):
                            text(item)
        # Number of cluster titles listed under this character.
        with tag('small'):
            text("(%s keywords starting with '%s')"%(len(entries),letter))
    with tag('div'):
        text('')
# The page declares no charset, so write pure ASCII and let any other character
# become a numeric character reference, which browsers render correctly no
# matter which encoding they guess.
with open('search-queries.html', 'wt', encoding='ascii', errors='xmlcharrefreplace') as out:
    out.write(header)
    out.write(indent(doc.getvalue()))
    out.write(footer)
    print("exported search-queries.html")