In [1]:
import pandas as pd
import os.path
from pathlib import Path
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
# Widen pandas' display limits so the tables and long query strings shown
# below are not truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

# Root directory of the log-parsing run being analysed. It defaults to the
# original run directory but can be redirected without editing the notebook
# by setting the LOG_PARSING_OUT_PATH environment variable.
out_path = os.environ.get(
    'LOG_PARSING_OUT_PATH',
    '/net/isilonP/public/rw/homes/uni_adm/tmp/log_parsing/2021.01.08-13.01.12',
)
# Inputs read by this notebook (bytes.csv, parsed.csv) live under reduce/.
reduce_path = os.path.join(out_path, 'reduce')
# Per-namespace CSVs written by this notebook go under results/.
results_path = os.path.join(out_path, 'results')

# Create the results directory on first run; a no-op if it already exists.
Path(results_path).mkdir(exist_ok=True)

# Overview

The data presented here is from parsing the log file from 2018-01-01 to 2020-07-16. There are several filtering steps:

1. Bots/crawlers are removed by looking at each request's user agent string using [ua-parser](https://github.com/ua-parser)
2. Requests with an unknown user agent string (e.g. `-`) are removed
3. Must be a `GET` request
4. Request must be a success (`200`)
5. Must not be a faceted query, i.e. one that includes `&fil=` in the resource.

In [2]:
# Load the (date, bytes) records, which have no header row, then total the
# bytes for each date and order the result chronologically by its index.
bytes_csv = os.path.join(reduce_path, 'bytes.csv')
df_bytes = (
    pd.read_csv(bytes_csv, names=['date', 'bytes'])
    .groupby('date')
    .sum()
    .sort_index()
)
df_bytes

Unnamed: 0_level_0,bytes
date,Unnamed: 1_level_1
1969-12-31,208856
2018-01-01,40256803031
2018-01-02,62992128088
2018-01-03,93250914809
2018-01-04,80180277604
...,...
2020-07-12,109390915817
2020-07-13,151007642634
2020-07-14,235548710824
2020-07-15,181437110383


# Queries by user type and namespace

In [3]:
# Load every parsed query record; the CSV has no header row, so the column
# names are supplied here.
# Tip: pass nrows=1e6 to read_csv to limit the load while testing.
parsed_csv = os.path.join(reduce_path, 'parsed.csv')
parsed_columns = ['Namespace', 'Application', 'Query']
df = pd.read_csv(parsed_csv, encoding='utf-8', names=parsed_columns)

In [4]:
def save_csv_for_all_namespaces(dfin, user_type, out_dir=None):
    """Print and save the query frequency table of every namespace in ``dfin``.

    For each distinct value of the ``Namespace`` column, the occurrences of
    each ``Query`` are counted. The 100 most frequent queries are printed and
    the full table is written to ``<out_dir>/<user_type>_<namespace>.csv``
    with the columns ``Count`` and ``Query``. Namespaces that have no
    countable queries are skipped.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame with at least the columns ``Namespace`` and ``Query``.
    user_type : str
        Label used in the printed header and as the CSV file-name prefix.
    out_dir : str or path-like, optional
        Directory the CSV files are written to. Defaults to the notebook-level
        ``results_path``.
    """
    if out_dir is None:
        out_dir = results_path
    # A single groupby replaces one full-frame boolean mask per namespace and
    # yields namespaces in order of first appearance, which is deterministic
    # (unlike iterating over a set). Missing namespaces are dropped by groupby.
    for namespace, group in dfin.groupby('Namespace', sort=False):
        counts = group['Query'].value_counts()
        if counts.empty:
            continue
        # Name the index and the values explicitly instead of renaming the
        # columns reset_index() happens to produce: the default names differ
        # between pandas 1.x and pandas 2.x.
        reordered = counts.rename_axis('Query').reset_index(name='Count')[['Count', 'Query']]
        print(f'User type={user_type} Namespace={namespace}')
        print(reordered.head(100).to_string(index=False))
        print('-'*100, '\n'*2)
        csv_path = os.path.join(out_dir, f'{user_type}_{namespace}.csv')
        reordered.to_csv(csv_path, index=False)

### Application breakdown

In [5]:
# Number of queries per application, most frequent first. The index name is
# cleared and the column name set explicitly so the table has the same layout
# whichever pandas version is installed (pandas >= 2.0 names the
# value_counts() result 'count' rather than after the source column).
app_counts = df['Application'].value_counts().rename_axis(None)
df_counts = app_counts.to_frame(name='Application')
# Share of all counted queries contributed by each application, in percent.
df_counts['%'] = 100 * app_counts / app_counts.sum()
df_counts

Unnamed: 0,Application,%
Python Requests,99861490,52.65892
Chrome,21587862,11.3837
Apache-HttpClient,17345401,9.146569
Wget,10590939,5.584809
Firefox,8431076,4.445871
libwww-perl,7771644,4.09814
IE,4758045,2.50901
Bytespider,3257142,1.717555
Safari,3049134,1.607868
Edge,2854941,1.505466


Out of curiosity, I had a look at the Thunderbird and Facebook queries:

In [6]:
# Distinct queries issued with a Thunderbird user agent.
set(df.loc[df['Application'] == 'Thunderbird', 'Query'])

{'interpro ipr014000 or interpro ipr014002 or interpro ipr014003', 'tegument'}

In [7]:
# Distinct queries issued with a Facebook user agent.
set(df.loc[df['Application'] == 'Facebook', 'Query'])

{'*',
 '1.-.-.- taxonomy:acromyrmex echinatior (panamanian leafcutter ant) (acromyrmex octospinosus echinatior) [103372]',
 '10-kda',
 '1093da',
 '1avw',
 '1ema',
 '1oel',
 '1p 34.2 duplication',
 '1p34.2 duplication',
 '1q21',
 '2.4.1.19',
 '3 hydroxyanthranilate 3 4 dioxygenase',
 '3.2.1.41',
 '3faw',
 '3faw pullulanase',
 '6.4.1.1.',
 '94 aa',
 'aa 500',
 'aa 500 features',
 'aah10943',
 'ab010145',
 'abdominal aortic aneurysm',
 'acad9',
 'accession:p04637',
 'accession:p38398',
 'accession:p84243',
 'accession:q9nzc9',
 'actin',
 'adn',
 'adrenocorticotropic',
 'af013254',
 'af043303',
 'af052018 or af052017 or af052016 or af052015 or af052014 or af019908 or af134518 or af134517 or af134516 or af134515 or af134514 or af133430',
 'af304460',
 'akt',
 'alanina',
 'alanine',
 'albumin',
 'albumin rat',
 'albúmina',
 'alcohol dehydrogenase',
 'ali',
 'alk5',
 'amylase',
 'ancestor:1643685',
 'ancestor:4896',
 'ancestor:7208',
 'ancestor:8711',
 'ancestor:9844',
 'annotation:(type:ca_b

### Partition the queries
Restrict the analysis to queries submitted by the applications listed below: they constitute the vast majority of the traffic, and the restriction helps filter out any remaining user agents that are actually bots:

In [8]:
# User agents treated as scripted / programmatic access.
programmatic_apps = {
    'Apache-HttpClient',
    'Java',
    'Python Requests',
    'Wget',
    'curl',
    'libwww-perl',
}
# User agents treated as interactive, browser-based access.
browser_apps = {
    'Chrome',
    'Chrome Mobile',
    'Chrome Mobile iOS',
    'Chromium',
    'Edge',
    'Firefox',
    'IE',
    'Maxthon',
    'Mobile Safari',
    'Netscape',
    'Opera',
    'QQ Browser',
    'Safari',
    'Samsung Internet',
    'Sogou Explorer',
    'Thunderbird',
    'UC Browser',
}
# An application must not be classified as both programmatic and browser.
assert programmatic_apps.isdisjoint(browser_apps)

# Split the query records by the kind of application that submitted them.
is_browser = df['Application'].isin(browser_apps)
is_programmatic = df['Application'].isin(programmatic_apps)
df_browser = df[is_browser]
df_programmatic = df[is_programmatic]

Percentage of browser queries considered, i.e. the share of all non-programmatic queries that come from the browsers listed above:

In [9]:
# Every query that was not classified as programmatic is the denominator.
non_programmatic_total = len(df) - len(df_programmatic)
100 * len(df_browser) / non_programmatic_total

92.8667868200869

Percentage of programmatic queries considered, i.e. the share of all non-browser queries that come from the programmatic clients listed above:

In [10]:
# Every query that was not classified as browser is the denominator.
non_browser_total = len(df) - len(df_browser)
100 * len(df_programmatic) / non_browser_total

97.4058841051774

### Top 100 browser queries for each namespace

In [11]:
# Print and save the per-namespace query counts for browser traffic.
save_csv_for_all_namespaces(dfin=df_browser, user_type='browser')

User type=browser Namespace=locations
 Count                                 Query
  1170                    name:cell membrane
   530                  name:golgi apparatus
   338                              membrane
   305                                     *
   273                               nucleus
   268                         cell membrane
   178                          mitochondria
   164                              secreted
   156                       plasma membrane
   131                             cytoplasm
   129                         mitochondrion
   108                              lysosome
   106                                id:162
    94                         extracellular
    77                 endoplasmic reticulum
    66                              endosome
    57                         transmembrane
    57                           chloroplast
    57                               nuclear
    56                            peroxisome
    55           

### Programmatic library breakdown

In [12]:
# Number of queries submitted by each programmatic client.
programmatic_applications = df_programmatic['Application']
programmatic_applications.value_counts()

Python Requests      99861490
Apache-HttpClient    17345401
Wget                 10590939
libwww-perl           7771644
curl                  2513980
Java                      791
Name: Application, dtype: int64

### Top 100 programmatic queries for each namespace

In [13]:
# Print and save the per-namespace query counts for programmatic traffic.
save_csv_for_all_namespaces(dfin=df_programmatic, user_type='programmatic')

User type=programmatic Namespace=locations
 Count                         Query
    37                 name:cannabis
    36                 name:analytes
    36                   name:values
    34                name:indicates
    34                   name:ranges
    33                 name:extracts
    32              name:metabolites
    31                  name:samples
    31              name:cannabinoid
    30                name:compounds
    28                   name:limits
    27                     name:acid
    25                   name:method
    25              name:cannabidiol
    25                    name:study
    25                 name:suggests
    25                   name:sativa
    25                     name:mean
    24               name:bioaerosol
    23                 name:requires
    23                  name:methods
    23                name:marijuana
    23                    name:delta
    23                    name:novel
    22                      name