In [1]:
import pandas as pd
import os.path
from pathlib import Path
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

pd.set_option('display.expand_frame_repr', False)

out_path = '/net/isilonP/public/rw/homes/uni_adm/tmp/log_parsing/2021.01.06-19.33.40/'
reduce_path = os.path.join(out_path, 'reduce')
results_path = os.path.join(out_path, 'results')

Path(results_path).mkdir(exist_ok=True)

# Overview

The data presented here is from parsing the log file from 2018-01-01 to 2020-07-16. There are several filtering steps:

1. User agent is checked to make sure it doesn't look like a bot/crawler:
    * For all requests (API & Browser):
        * comparing against a regex eg `(bot|crawler...)`
        * Removing any uknown `-`
    * For only browser requests:
        * for non-api requests by using [ua-parser](https://github.com/ua-parser) to ensure the browser name is valid
        * Filter on the main browsers (eg Chrome, Firefox, IE, ...)
2. Must be a `GET` request
3. Request must be a success (`200`)

In [2]:
df_bytes = pd.read_csv(os.path.join(reduce_path, 'bytes.csv'), names=[
    'date',
    'bytes',
])
df_bytes = df_bytes.groupby('date').sum().sort_index()
### First log day
print(df_bytes.index.min())
### Last log day
print(df_bytes.index.max())

2018-01-01
2020-07-16


# Queries by user type and namespace

In [3]:
# use nrows=1e6 to limit for testing
df = pd.read_csv(os.path.join(reduce_path, 'parsed.csv'), names=[
    'namespace',
    'user-type',
    'user-agent-family',
    'query'
])

In [4]:
user_agent_family_whitelist = {
    'Chrome',
    'Firefox'
    'IE',
    'Opera',
    'Safari',
    'Edge',
    'Netscape',
    'Mobile Safari',
    'Sogou Explorer',
    'Chrome Mobile',
    'UC Browser',
    'Chromium',
    'Samsung Internet',
    'Chrome Mobile iOS',
    'Vivaldi',
    'Yandex Browser',
    'Chrome Mobile WebView',
    'Apple Mail',
    'Android',
    'Mobile Safari UI/WKWebView',
    'Firefox Mobile',
    'Opera Mobile'
}

In [5]:
def save_csv_for_all_namespaces(dfin):
    namespaces = set(dfin['namespace'])
    user_types = set(dfin['user-type'])
    assert len(user_types) == 1
    user_type = list(user_types)[0]
    for namespace in namespaces:
        csv_path = os.path.join(results_path, f'{user_type}_{namespace}.csv')
        partitioned = dfin[(dfin['user-type']==user_type) & (dfin['namespace']==namespace)]['query'].value_counts()
        if not partitioned.empty:
            reordered = partitioned.reset_index().rename(columns={'index':'query','query':'count'})[['count', 'query']]
            print(f'User type={user_type} Namespace={namespace}')
            print(reordered.head(25).to_string(index=False))
            print('-'*75, '\n'*2)
            reordered.to_csv(csv_path, index=False)

### Browser breakdown

In [6]:
df_browser = df[(df['user-agent-family'].isin(user_agent_family_whitelist) & (df['user-type'] == 'browser'))]
df_browser['user-agent-family'].value_counts()

Chrome                        21860892
Safari                         3049246
Edge                           2864460
Chrome Mobile                  2167489
Opera                          1369151
Mobile Safari                   491886
Sogou Explorer                  458777
Netscape                        351653
UC Browser                      304696
Chrome Mobile WebView           234224
Chromium                         63194
Yandex Browser                   31887
Chrome Mobile iOS                27903
Samsung Internet                 24025
Firefox Mobile                   17938
Vivaldi                          16848
Mobile Safari UI/WKWebView       15813
Android                           9047
Apple Mail                        8447
Opera Mobile                      4057
Name: user-agent-family, dtype: int64

### Top 25 browser queries for each namespace

In [7]:
save_csv_for_all_namespaces(df_browser)

User type=browser Namespace=locations
 count                  query
   221               membrane
   215                      *
   206                nucleus
   194          cell membrane
   175     name:cell membrane
   142   name:golgi apparatus
   135               secreted
   123        plasma membrane
   120           mitochondria
    92              cytoplasm
    89          mitochondrion
    82               lysosome
    78                 id:162
    73          extracellular
    54  endoplasmic reticulum
    41              nucleolus
    41          transmembrane
    41            chloroplast
    41               endosome
    37                  golgi
    36             peroxisome
    35                cytosol
    31                      ,
    30                nuclear
    28           cell surface
--------------------------------------------------------------------------- 


User type=browser Namespace=uniref
 count                                         query
  5004         

### Programmatic library breakdown

In [8]:
df_programmatic = df[df['user-type'] == 'programmatic']
df_programmatic['user-agent-family'].value_counts()

Python Requests      91524425
Python-urllib        28091465
Other                 9090621
libwww-perl           7194876
curl                  1837109
Apache-HttpClient       36606
WordPress                   3
Android                     3
Name: user-agent-family, dtype: int64

### Top 25 programmatic queries for each namespace

In [9]:
save_csv_for_all_namespaces(df_programmatic)

User type=programmatic Namespace=locations
 count             query
    42       name:values
    41     name:analytes
    39       name:ranges
    39    name:indicates
    38     name:extracts
    38  name:metabolites
    36     name:cannabis
    35      name:samples
    33    name:compounds
    31     name:suggests
    30  name:cannabinoid
    29       name:limits
    28     name:requires
    26         name:acid
    25        name:study
    25  name:cannabidiol
    24       name:method
    24   name:bioaerosol
    24       name:sativa
    23         name:mean
    23        name:delta
    22      name:methods
    22        name:novel
    22          name:rat
    22    name:marijuana
--------------------------------------------------------------------------- 


User type=programmatic Namespace=uniref
 count                                              query
   500  uniprot:(lactococcus lactococcus lactis) and i...
   204  uniprot:(faecalibaculum faecalibaculum rodenti...
   185        