In [1]:
import pandas as pd
import os.path
from pathlib import Path
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
# Raise pandas' display limits so the wide query tables printed below are
# shown with fewer truncated rows, columns and cell values.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth',100)

# Root directory of one log-parsing run. NOTE(review): this is an absolute,
# environment-specific path; the notebook only runs where it is mounted.
out_path = '/net/isilonP/public/rw/homes/uni_adm/tmp/log_parsing/2021.01.08-13.01.12'
# Input directory: the cells below read bytes.csv and parsed.csv from here.
reduce_path = os.path.join(out_path, 'reduce')
# Output directory: the per-namespace CSV files are written here.
results_path = os.path.join(out_path, 'results')

# Create the results directory if it is missing. Parent directories are not
# created (mkdir is called without parents=True), so out_path must exist.
Path(results_path).mkdir(exist_ok=True)

# Overview

The data presented here is from parsing the log file from 2018-01-01 to 2020-07-16. There are several filtering steps:

1. Bots/crawlers are removed by looking at each request's user agent string using [ua-parser](https://github.com/ua-parser)
2. Requests with an unknown user agent string (e.g. `-`) are removed
3. Must be a `GET` request
4. Request must be a success (`200`)
5. Must not be a faceted query, i.e. one that includes `&fil=` in the resource.

In [2]:
# Total bytes served per day: read the (date, bytes) records, add up all
# records that share a date, and order the result chronologically.
# NOTE(review): the displayed result contains a 1969-12-31 row, which lies
# outside the stated 2018-2020 range -- presumably records whose timestamp
# could not be parsed; confirm against the reduce step.
df_bytes = (
    pd.read_csv(
        os.path.join(reduce_path, 'bytes.csv'),
        names=['date', 'bytes'],
    )
    .groupby('date')
    .sum()
    .sort_index()
)
df_bytes

Unnamed: 0_level_0,bytes
date,Unnamed: 1_level_1
1969-12-31,208856
2018-01-01,40256803031
2018-01-02,62992128088
2018-01-03,93250914809
2018-01-04,80180277604
...,...
2020-07-12,109390915817
2020-07-13,151007642634
2020-07-14,235548710824
2020-07-15,181437110383


# Queries by user type and namespace

In [3]:
# One row per retained request. The file has no header row, so the column
# names are supplied here.
# Tip: pass nrows=1e6 to read_csv to limit the load while testing.
df = pd.read_csv(
    os.path.join(reduce_path, 'parsed.csv'),
    names=['Namespace', 'Application', 'Query'],
    encoding='utf-8',
)

In [4]:
def save_csv_for_all_namespaces(dfin, user_type, out_dir=None):
    """Print and save a query-frequency table for every namespace in ``dfin``.

    For each distinct, non-null value of ``dfin['Namespace']`` the queries of
    that namespace are counted. The 100 most frequent are printed, and the
    complete table is written to ``<out_dir>/<user_type>_<namespace>.csv``
    with the columns ``Count`` and ``Query`` (most frequent first).
    Namespaces without any countable query are skipped.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame with at least the columns ``Namespace`` and ``Query``.
    user_type : str
        Label shown in the printed header and used as CSV file name prefix.
    out_dir : str or path-like, optional
        Directory the CSV files are written to. Defaults to the notebook-level
        ``results_path``; the directory must already exist.

    Returns
    -------
    None
    """
    if out_dir is None:
        out_dir = results_path

    # unique() preserves first-appearance order, so the print/write order is
    # reproducible between runs (iterating a set of strings is not).
    for namespace in dfin['Namespace'].dropna().unique():
        counts = dfin.loc[dfin['Namespace'] == namespace, 'Query'].value_counts()
        if counts.empty:
            continue

        # Name the index and the value column explicitly instead of relying on
        # the names value_counts()/reset_index() happen to produce: those
        # changed in pandas 2.0 and made the former rename-based code raise
        # KeyError on ['Count', 'Query'].
        reordered = (
            counts
            .rename_axis('Query')
            .reset_index(name='Count')[['Count', 'Query']]
        )

        print(f'User type={user_type} Namespace={namespace}')
        print(reordered.head(100).to_string(index=False))
        print('-'*100, '\n'*2)

        csv_path = os.path.join(out_dir, f'{user_type}_{namespace}.csv')
        reordered.to_csv(csv_path, index=False, encoding='utf-8')

### Application breakdown

In [5]:
# Requests per application, most frequent first. The column is named
# explicitly: since pandas 2.0, value_counts() names its result 'count', so a
# bare to_frame() no longer yields an 'Application' column and the lookups
# below would raise KeyError.
df_counts = df['Application'].value_counts().to_frame(name='Application')
# Share of all requests per application (vectorised sum, not the builtin).
df_counts['Percentage'] = df_counts['Application'] / df_counts['Application'].sum()
# Render the share as a percentage with two decimals; the Styler is the cell output.
df_counts.style.format({
    'Percentage': '{:,.2%}'.format,
})

Unnamed: 0,Application,Percentage
IE,219875,21.99%
Apache-HttpClient,173798,17.38%
Firefox,156389,15.64%
Chrome,153537,15.35%
Opera,104153,10.42%
libwww-perl,35398,3.54%
QQ Browser,34938,3.49%
Netscape,34733,3.47%
Maxthon,34467,3.45%
Wget,19241,1.92%


Out of curiosity, I had a look at the Thunderbird and Facebook queries:

In [6]:
# Distinct query strings sent by the 'Thunderbird' application.
set(df.loc[df['Application'] == 'Thunderbird', 'Query'])

set()

In [7]:
# Distinct query strings sent by the 'Facebook' application.
set(df.loc[df['Application'] == 'Facebook', 'Query'])

{'taxonomy:canis lupus familiaris (dog) (canis familiaris) [9615] 0000139 goa:(metal ion binding [46872]) go:0000139'}

### Partition the queries
Restrict the analysis to queries submitted by the applications listed below, as they constitute the vast majority of requests; this also helps filter out any remaining user agents that are actually bots:

In [8]:
# Applications whose requests are treated as programmatic (scripted) access.
programmatic_apps = {
    'Python Requests', 'Wget', 'Apache-HttpClient',
    'libwww-perl', 'curl', 'Java',
}
# Applications whose requests are treated as interactive browser access.
browser_apps = {
    'Chrome', 'IE', 'Firefox', 'Opera', 'Safari',
    'QQ Browser', 'Edge', 'Netscape', 'Mobile Safari', 'Sogou Explorer',
    'Chrome Mobile', 'UC Browser', 'Chromium', 'Samsung Internet',
    'Chrome Mobile iOS',
}
# No application may be listed in both groups.
assert programmatic_apps.isdisjoint(browser_apps)

# Rows of df belonging to each group; applications in neither set are dropped.
df_browser = df.loc[df['Application'].isin(browser_apps)]
df_programmatic = df.loc[df['Application'].isin(programmatic_apps)]

Percentage of all non-programmatic queries that come from the selected browser applications:

In [9]:
# Rows in df_browser as a percentage of all rows of df that are not in
# df_programmatic, i.e. how much of the non-programmatic traffic the
# browser_apps list covers.
100*len(df_browser)/(len(df) - len(df_programmatic))

99.95195211138615

Percentage of all non-browser queries that come from the selected programmatic applications:

In [10]:
# Rows in df_programmatic as a percentage of all rows of df that are not in
# df_browser, i.e. how much of the non-browser traffic the programmatic_apps
# list covers.
100*len(df_programmatic)/(len(df) - len(df_browser))

99.85340984097802

### Top 100 browser queries for each namespace

In [11]:
# Print the top queries per namespace for browser traffic and write the
# full per-namespace tables to browser_<namespace>.csv.
save_csv_for_all_namespaces(dfin=df_browser, user_type='browser')

User type=browser Namespace=proteomes
 Count                                                                                                Query
    31                                                                                                    *
    19                                                                                         homo sapiens
    14                                                                                        taxonomy:9606
    12                                                                                                human
    11                                                                                                mouse
     7                                                                                                 homo
     6                                                                                           sus scrofa
     5                                                                                     escheri

User type=browser Namespace=keywords
 Count                                  Query
    16                                 signal
    13                   alternative splicing
    12                         phosphoprotein
     9                           glycoprotein
     9                            acetylation
     8                          transcription
     7                               secreted
     6                               immunity
     6                          mitochondrion
     6                         disulfide bond
     6                             cell cycle
     5                              apoptosis
     5                       lipid metabolism
     4                         oxidoreductase
     3                            transferase
     3                  developmental protein
     3                        ubl conjugation
     3                           polymorphism
     3                              cytoplasm
     3        serine/threonine-protein kina

User type=browser Namespace=uniprot
  Count                                                Query
 108306                             organism:9606 and q96f92
 107639                             organism:9606 and p0c5w0
 107639                             organism:9606 and q96l16
 107494                           organism:9606 and q96l92-4
 107301                           organism:9606 and a4qph2-3
  54314  organism:9606 and reviewed uniprotunreviewed unprot
  54058                             organism:9606 and q9h761
    577                                         reviewed:yes
    136                                              insulin
    106                                           hemoglobin
     94                                                  p53
     90                                                 egfr
     69                                                 bmr1
     64                                                 tp53
     63                                          

### Programmatic library breakdown

In [12]:
# Number of programmatic requests per client application (value_counts
# orders the result from most to least frequent).
df_programmatic['Application'].value_counts()

Apache-HttpClient    173798
libwww-perl           35398
Wget                  19241
Python Requests       17855
curl                    290
Java                      3
Name: Application, dtype: int64

### Top 100 programmatic queries for each namespace

In [13]:
# Print the top queries per namespace for programmatic traffic and write the
# full per-namespace tables to programmatic_<namespace>.csv.
save_csv_for_all_namespaces(dfin=df_programmatic, user_type='programmatic')

User type=programmatic Namespace=proteomes
 Count                        Query
     1  reference:yes taxonomy:4751
     1  reference:yes taxonomy:9913
---------------------------------------------------------------------------------------------------- 


User type=programmatic Namespace=uniref
 Count                                  Query
     2     member:a0a1f1uu07 and identity:0.9
     2              id:uniref90_upi00052f1b7c
     2                     id:uniref50_q6k043
     2         member:s2xin7 and identity:0.9
     1                 uniref90_upi0009e405b1
     1                 uniref90_upi000bae784a
     1     member:a0a1n4cv59 and identity:0.9
     1     member:a0a1u0z855 and identity:0.9
     1                    uniref90_a0a1j4j8x9
     1     member:a0a1j6dvg6 and identity:0.9
     1     member:a0a0y1z827 and identity:0.9
     1     member:a0a1u0x935 and identity:0.9
     1                 uniref90_upi0009479f43
     1  member:upi000981d44d and identity:0.9
     1     memb