In [None]:
import os
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
# Global figure defaults for every chart in this notebook: wide canvas, high DPI.
plt.rcParams.update({
    "figure.figsize": (20, 5),
    "figure.dpi": 200,
})
plt.style.use('ggplot')

In [None]:
# The connection URL is taken from the DEKART_DB_URL environment variable when
# it is set, so real credentials do not have to be written into the notebook.
# The fallback is the previously hard-coded URL, which keeps existing setups
# working unchanged.
DB_URL = os.environ.get(
    'DEKART_DB_URL',
    'postgresql+psycopg2://postgres:dekart@cloudsql/dekart',
)
# pool_recycle=3600: pooled connections older than one hour are replaced
# instead of being reused.
engine = create_engine(DB_URL, pool_recycle=3600)
conn = engine.connect()

In [None]:
# Read the SQL text first (path is relative to the kernel's working directory),
# then run it once the file has been closed.
with open('notebooks/reports-queries.sql', 'r') as f:
    sql_template = f.read()
# str.format fills any {name} placeholders in the SQL from the notebook's
# current variables.
queries = pd.read_sql(sql_template.format(**locals()), conn)

In [None]:
# Derived reporting dimensions, all computed with vectorised accessors.

# 'week': date on which the weekly period containing the report's creation
# timestamp starts. Using .dt.start_time instead of a per-element apply also
# copes with missing timestamps (NaT) instead of raising.
queries['week'] = queries['reports_created_at'].dt.to_period('W').dt.start_time.dt.date
# 'day': calendar date of the creation timestamp.
queries['day'] = queries['reports_created_at'].dt.date
# 'domain': the part of the author's email after the first '@'. The .str
# accessor yields NaN for a missing email or one without '@' (the row-wise
# apply raised AttributeError / IndexError) and also works on an empty frame.
queries['domain'] = queries['author_email'].str.split('@').str[1]

In [None]:
def query_source(query):
    """Guess which documentation page a query's SQL was copied from.

    Args:
        query: Mapping-like row (for example a DataFrame row) that has a
            'query_text' entry.

    Returns:
        str: the site path of the matching page; 'empty' when there is no
        query text (empty string, or a non-string value such as None/NaN);
        'unknown' when none of the known snippets is recognised.
    """
    text = query['query_text']
    # A missing value (None/NaN) would make re.search raise TypeError, so it is
    # classified the same way as an empty query.
    if not isinstance(text, str) or len(text) == 0:
        return 'empty'

    # Fragments that identify the admin-boundaries blog post examples.
    admin_boundaries_patterns = (
        r"only Berlin has boundaries at this level",
        r"public dataset with US ZIP Codes",
        r"boundary.*administrative",
    )
    if any(re.search(pattern, text) for pattern in admin_boundaries_patterns):
        return '/blog/admin-boundaries-in-bigquery-open-datasets/'

    # The playground example is recognised by its table name. A plain substring
    # test is used so that '.' is matched literally rather than as a regex
    # wildcard.
    if "bigquery-public-data.chicago_crime.crime" in text:
        return '/docs/about/playground/'

    return 'unknown'
# Classify every row; axis='columns' hands each row to query_source.
queries['query_source'] = queries.apply(query_source, axis='columns')

In [None]:
# Accounts left out of every chart below (presumably internal/test users -
# confirm with the notebook owner).
excluded_emails = ['bilonenko.v@gmail.com', 'vladimir@ree.technology']
# A single mask replaces the two chained selections: the second selection used
# a mask built on the full frame against the already-filtered frame, which
# pandas has to re-align ("Boolean Series key will be reindexed" UserWarning).
# Rows with a missing email are kept, exactly as before.
filtered_queries = queries[~queries['author_email'].isin(excluded_emails)]

In [None]:
# Distinct query texts that did not match any known snippet, with their author.
unmatched = filtered_queries[filtered_queries['query_source'] == 'unknown']
unmatched.drop_duplicates('query_text')[['query_text', 'author_email']]

In [None]:
# Number of queries per week, stacked by the page the SQL came from.
weekly_by_source = (
    filtered_queries[['id', 'week', 'query_source']]
    .groupby(['week', 'query_source'])
    .count()
    .unstack('query_source')
    .fillna(0)
)
weekly_by_source.plot(kind='bar', stacked=True, title='Queries')

In [None]:
# Same weekly breakdown, restricted to rows whose job_status is 3.
succeeded = filtered_queries[filtered_queries['job_status'] == 3]
weekly_success_by_source = (
    succeeded[['id', 'week', 'query_source']]
    .groupby(['week', 'query_source'])
    .count()
    .unstack('query_source')
    .fillna(0)
)
weekly_success_by_source.plot(kind='bar', stacked=True, title='Successful Queries')

In [None]:
# Number of queries per week, stacked by job_status value.
weekly_by_status = (
    filtered_queries[['id', 'week', 'job_status']]
    .groupby(['week', 'job_status'])
    .count()
    .unstack('job_status')
    .fillna(0)
)
weekly_by_status.plot(kind='bar', stacked=True, title='Job Status')

In [None]:
# Each user's first successful query (job_status 3), counted per week and
# stacked by the page the SQL came from.
first_successful = (
    filtered_queries[filtered_queries['job_status'] == 3]
    # drop_duplicates keeps the first row it meets, so order chronologically
    # first; otherwise "first" depends on whatever order the SQL returned.
    # mergesort is stable, so rows with equal timestamps keep their order.
    .sort_values('reports_created_at', kind='mergesort')
    .drop_duplicates('author_email')
)
(
    first_successful[['author_email', 'week', 'query_source']]
    .groupby(['week', 'query_source'])
    .count()
    .unstack('query_source')
    .fillna(0)
    .plot(kind='bar', stacked=True, title='First User Successful Query')
)

In [None]:
# Each user's first query of any status, counted per week and stacked by the
# page the SQL came from.
first_queries = (
    filtered_queries
    # drop_duplicates keeps the first row it meets, so order chronologically
    # first; otherwise "first" depends on whatever order the SQL returned.
    # mergesort is stable, so rows with equal timestamps keep their order.
    .sort_values('reports_created_at', kind='mergesort')
    .drop_duplicates('author_email')
)
(
    first_queries[['author_email', 'week', 'query_source']]
    .groupby(['week', 'query_source'])
    .count()
    .unstack('query_source')
    .fillna(0)
    .plot(kind='bar', stacked=True, title='First User Query')
)

In [None]:
# Users per week: first collapse to one row per (week, author), then count
# those rows within each week.
per_user_week = (
    filtered_queries[['id', 'author_email', 'week']]
    .groupby(['week', 'author_email'])
    .count()
)
per_user_week.groupby(['week']).count().plot(kind='bar', title='Users per week')

In [None]:
# Cumulative number of users whose first successful query (job_status 3) with
# SQL that matched no known snippet falls in each week.
is_unknown_source = filtered_queries['query_source'] == 'unknown'
is_successful = filtered_queries['job_status'] == 3
(
    # One combined mask: chaining two selections applied a mask built on the
    # full frame to the already-filtered frame, which pandas must re-align
    # ("Boolean Series key will be reindexed" UserWarning).
    filtered_queries[is_unknown_source & is_successful]
    # Order chronologically so drop_duplicates keeps each user's earliest row
    # instead of depending on the order the SQL returned.
    .sort_values('reports_created_at', kind='mergesort')
    .drop_duplicates('author_email')[['id', 'week']]
    .groupby(['week'])
    .count()
    .cumsum()
    .plot(kind='bar', title='Users created unique report')
)

In [None]:
# Cumulative number of users whose first successful query (job_status 3)
# falls in each week.
(
    filtered_queries[filtered_queries['job_status'] == 3]
    # Order chronologically so drop_duplicates keeps each user's earliest row
    # instead of depending on the order the SQL returned.
    .sort_values('reports_created_at', kind='mergesort')
    .drop_duplicates('author_email')[['id', 'week']]
    .groupby(['week'])
    .count()
    .cumsum()
    .plot(kind='bar', title='Users created report')
)

In [None]:
# Detail view of the distinct query texts that matched no known snippet.
detail_columns = ['id', 'author_email', 'query_text', 'job_status', 'day', 'title']
unknown_source_rows = filtered_queries[filtered_queries['query_source'] == 'unknown']
unknown_source_rows.drop_duplicates('query_text')[detail_columns]

In [None]:
# Per-user summary, indexed by author_email.
# 'job_status' holds the number of that user's queries with job_status 3 (the
# value the charts above treat as successful). Counting matching rows replaces
# sum(job_status) / 3, which only gave that number if every status was 0 or 3;
# any other status code inflated it.
users = filtered_queries[['author_email', 'job_status']].groupby(['author_email']).agg({
    'job_status': lambda statuses: (statuses == 3).sum()
})
# Domain is the part of the email (the index) after the first '@'. The
# vectorised accessor also works when there are no users at all, where a
# row-wise apply does not return a column.
users['domain'] = users.index.str.split('@').str[1]

In [None]:
# Users ranked by their job_status score, highest first.
users.sort_values(by='job_status', ascending=False)

In [None]:
# Number of users per email domain, gmail.com excluded, largest domain first.
non_gmail_users = users[users['domain'] != 'gmail.com']
non_gmail_users.groupby(['domain']).count().sort_values('job_status', ascending=False)

In [None]:
# All rows (unfiltered) authored from the ubilabs.net domain.
from_ubilabs = queries['domain'] == 'ubilabs.net'
queries[from_ubilabs]