In [None]:
# Search query analysis by URL

Find the typical search queries for each URL (page) in a Google Search Console API report.

In [None]:
from collections import Counter, OrderedDict
import re
import snowballstemmer #  pip3 install snowballstemmer
# Shared Snowball stemmer for English; stem_dedup() uses it to reduce each word to its stem.
stemmer = snowballstemmer.stemmer('English')
# Matches "australia" and "australian" so the country name can be ignored
# when two queries are compared.
australian_re = re.compile("australian?")


def _query_word_set(text):
    """Return the set of words in ``text`` after removing the country name."""
    # split() without an argument collapses runs of whitespace, so removing a
    # word from the middle of a query cannot leave an empty-string token that
    # would make otherwise identical word sets compare unequal.
    return set(australian_re.sub('', text).split())


def check_unordered_string(string, strings):
    """Tell whether ``string`` is new compared with the already-kept ``strings``.

    Two queries count as the same when they contain the same words in any
    order, ignoring "australia"/"australian".

    Args:
        string: candidate query text.
        strings: collection of query texts that have already been kept.

    Returns:
        False if ``string`` is in ``strings`` or has the same word set as any
        of them; True otherwise.
    """
    if string in strings:
        return False
    string_set = _query_word_set(string)
    return all(string_set != _query_word_set(other_string)
               for other_string in strings)

def stem_dedup(strings):
    """Drop queries that are near-duplicates of one already kept.

    A query is kept only if its stemmed form (each space-separated word run
    through the shared ``stemmer``) has not been seen before and
    ``check_unordered_string`` reports it as new relative to the queries kept
    so far.

    Args:
        strings: iterable of query texts.

    Returns:
        A ``dict`` values view holding the kept queries in their original
        (unstemmed) form.
    """
    kept_by_stem = {}
    for original in strings:
        stemmed_words = [stemmer.stemWord(token) for token in original.split(' ')]
        stem_key = ' '.join(stemmed_words)
        if stem_key in kept_by_stem:
            # Same stemmed text as an earlier query.
            continue
        if not check_unordered_string(original, kept_by_stem.values()):
            # Same words as an earlier query, just in a different order.
            continue
        kept_by_stem[stem_key] = original
    return kept_by_stem.values()
def find_modal_substring(strings):
    """Pick a frequently occurring substring from a collection of strings.

    Procedure:
      1. Collect the character n-grams of every string and count, for each
         n-gram, how many strings contain it.
      2. Score each n-gram as ``count * length`` and keep, per distinct
         score, the longest n-gram (ties resolved alphabetically so that the
         choice does not depend on set iteration order).
      3. From the five highest scores take the longest kept n-gram.
      4. Search the newline-joined input for that n-gram widened by at most
         one character on each side up to a word boundary, and return the
         match if there is one.

    Args:
        strings: iterable of ``str``. It is materialised into a list because
            it is traversed twice. The widening search in step 4 follows the
            iteration order of ``strings``, so pass a list (not a set) for a
            fully reproducible result.

    Returns:
        The widened match when one is found, otherwise the bare n-gram.
        Returns ``''`` when no n-grams can be formed (empty input, or every
        string has at most one character).
    """
    from itertools import chain
    from typing import Iterator, Optional, Set

    def ngram(seq: str, n: int) -> Iterator[str]:
        """Yield every slice of ``seq`` that is ``n`` characters long."""
        return (seq[i: i + n] for i in range(len(seq) - n + 1))

    def allngram(seq: str, minn: int = 1, maxn: Optional[int] = None) -> Set[str]:
        """Return the distinct n-grams of ``seq`` for lengths ``minn`` up to, but excluding, the upper bound."""
        # NOTE(review): the upper bound is exclusive, so the whole string is
        # never one of its own n-grams and a one-character string yields
        # nothing. Kept unchanged; confirm this is intended.
        lengths = range(minn, maxn) if maxn else range(minn, len(seq))
        return set(chain.from_iterable(ngram(seq, n) for n in lengths))

    strings = list(strings)
    counts = Counter(chain.from_iterable(allngram(text) for text in strings))
    if not counts:
        # Nothing to choose from; max() below would raise ValueError.
        return ''

    # score -> longest n-gram having that score
    large_counts = {}
    for sstr in sorted(counts):
        key = counts[sstr] * len(sstr)
        if len(sstr) > len(large_counts.get(key, "")):
            large_counts[key] = sstr

    top_scores = sorted(large_counts, reverse=True)[:5]
    modal_ngram = max((large_counts[score] for score in top_scores), key=len).strip()

    modal_words_search = re.search(
        r"\b.?" + re.escape(modal_ngram) + r".?\b", '\n'.join(strings))
    modal_words = modal_words_search.group(0).strip() if modal_words_search else None
    if modal_words and modal_words.startswith("."):
        modal_words = modal_words[1:]
    return modal_words or modal_ngram

In [None]:
%%time

from tablib import Dataset
# query	page	clicks	impressions	click_thru_ratio	search_result_position
# Read the Search Console export and index the rows that relate to visas or
# migration. Uses the query, page, clicks and impressions columns.
with open('data_searchqueries_australiagovau_websearch_20190916_20190923.csv') as csv_file:
    imported_data = Dataset().load(csv_file.read())

queries = set()          # every retained query text
queries_by_page = {}     # page URL -> set of retained query texts
query_clicks = {}        # query text -> clicks
query_impressions = {}   # query text -> impressions
for row in imported_data.dict:
    query = row['query'].strip()
    if not query:
        continue
    page = row['page']
    if 'visa' in query or 'migration' in query or 'visa' in page:
        queries.add(query)
        queries_by_page.setdefault(page, set()).add(query)
        # Key by the stripped text: that is what queries_by_page stores and
        # what the report cells use for lookups, so keying by the raw value
        # could raise KeyError for queries with surrounding whitespace.
        # NOTE(review): a query that appears on several pages keeps only the
        # values of its last row here -- confirm that is acceptable.
        query_clicks[query] = float(row['clicks'])
        query_impressions[query] = float(row['impressions'])

In [None]:
# Print, for every page with at most 75 distinct queries and more than 10
# clicks in total, the queries that individually matter: more than 10 clicks,
# or more than 5% of the page's impressions.
#
# The loop variable is deliberately not called `queries`, so the global set of
# all queries built by the loading cell is left intact.
for page, page_queries in queries_by_page.items():
    # `> 75` is skipped here (rather than `>= 75`) so that a page with exactly
    # 75 queries is reported; the result-building loop further down only
    # handles pages with more than 75.
    if len(page_queries) > 75:
        continue
    total_clicks = sum(query_clicks[query] for query in page_queries)
    total_impressions = sum(query_impressions[query] for query in page_queries)
    if total_clicks <= 10:
        continue

    print(page)
    print(len(page_queries))
    for query in page_queries:
        if query_clicks[query] > 10:
            print(query)
            print ('clicks: ', query_clicks[query])
        elif query_impressions[query] > total_impressions * 0.05:
            print(query)
            print ('impressions: ', query_impressions[query])
    print(' ')
    print(' ')

In [None]:
import tablib
# Build a (page, query, hits, impressions) table for pages with more than 75
# distinct queries and more than 10 clicks in total.
result = tablib.Dataset()
result.headers = ['page','query','hits','impressions']
# The loop variable is deliberately not called `queries`, so the global set of
# all queries built by the loading cell is left intact.
for page, page_queries in queries_by_page.items():
    if len(page_queries) <= 75:
        continue
    total_clicks = sum(query_clicks[query] for query in page_queries)
    total_impressions = sum(query_impressions[query] for query in page_queries)
    if total_clicks <= 10:
        continue

    print(page)

    # normalised query text -> [clicks, impressions], summed over the raw
    # queries that normalise to it. Keeping the numbers next to the normalised
    # text matters: the rewritten text is not necessarily a key of
    # query_clicks / query_impressions, so looking it up there can raise
    # KeyError or return the figures of a different query.
    clean_stats = {}
    for raw_query in page_queries:
        clicks = query_clicks[raw_query]
        impressions = query_impressions[raw_query]
        # Keep a query with a meaningful share of clicks (over 1% of the page
        # and more than 7) or with over 10% of the page's impressions.
        enough_clicks = clicks > total_clicks * 0.01 and clicks > 7
        enough_impressions = impressions > total_impressions * 0.1
        if not (enough_clicks or enough_impressions):
            continue
        normalised = raw_query.replace('immigrate to', 'migrate to')
        totals = clean_stats.setdefault(normalised, [0.0, 0.0])
        totals[0] += clicks
        totals[1] += impressions

    clean_queries = set(clean_stats)
    dedup_queries = stem_dedup(clean_queries)
    print(dedup_queries)
    for query in dedup_queries:
        clicks, impressions = clean_stats[query]
        result.append([page, query, clicks, impressions])
    print(' ')
    print(' ')

In [None]:
# Print the collected (page, query, hits, impressions) rows as CSV text.
print(result.csv)