# HTTP Response and Redirection Analysis
- This notebook demonstrates the basic use of the `http_responses` table and the HTTP `Location` header by finding the most commonly occurring redirections.
- Partly based on https://github.com/mozilla/openwpm-crawler/blob/master/analysis/Sample%20Analysis.ipynb


In [1]:
import re
import json
import sqlite3
import pandas as pd
from collections import defaultdict

In [2]:
# Import some analysis utilities from https://github.com/englehardt/crawl_utils.
# The directory is appended to sys.path before the two imports below run, so
# this line must stay ahead of them; presumably a checkout of crawl_utils is
# expected at ./crawl_utils/ relative to the working directory (TODO confirm).
import sys
sys.path.append('./crawl_utils/')
import domain_utils as du  # used below via du.get_ps_plus_1
import analysis_utils as au

In [3]:
# Path of the SQLite database analyzed below: the sample file for the
# 2018-06 stateless crawl. It is a relative path, so the file is looked up
# from the current working directory.
DB = 'sample_2018-06_1m_stateless_census_crawl.sqlite'

### Load the responses

In [4]:
# Load the data
con = sqlite3.connect(DB)
# sqlite3.Row lets rows be read by column name (e.g. row["visit_id"]);
# it is set before the cursor is created so the cursor picks it up.
con.row_factory = sqlite3.Row
# This cursor is reused by the redirection query further down.
cur = con.cursor()
# Pull every column of the http_responses table into a DataFrame.
resps = pd.read_sql_query("SELECT * FROM http_responses", con)

In [5]:
# Summarize the dataset: each distinct visit_id is counted as one site.
total_sites = resps['visit_id'].nunique()
# A parenthesized, single-argument print produces the same text under
# Python 2 and Python 3 (the bare print statement is a syntax error on 3).
print("Database contains %s HTTP responses on %d sites" % (len(resps), total_sites))

Database contains 153693 HTTP responses on 967 sites


In [6]:
# Peek at the first rows to see which columns are available.
resps.head()

Unnamed: 0,id,crawl_id,visit_id,url,method,referrer,response_status,response_status_text,is_cached,headers,channel_id,location,time_stamp,content_hash
0,1,9,9,http://amazon.com/,GET,,301,Moved Permanently,0,"[[""Server"",""Server""],[""Date"",""Wed, 27 Jun 2018...",{7503bbce-c0a7-4cfd-aa9a-6b023c81184d},https://amazon.com/,2018-06-27T14:19:39.650Z,
1,2,7,7,http://google.co.in/,GET,,301,Moved Permanently,0,"[[""Location"",""http://www.google.co.in/""],[""Con...",{2e5da2ee-aa63-4295-8b01-bdd1e02282cb},http://www.google.co.in/,2018-06-27T14:19:39.660Z,
2,3,9,9,https://amazon.com/,GET,,301,Moved Permanently,0,"[[""Server"",""Server""],[""Date"",""Wed, 27 Jun 2018...",{825cc55c-1331-4bde-802b-16005b100e56},https://www.amazon.com/,2018-06-27T14:19:39.676Z,
3,4,1,1,http://google.com/,GET,,301,Moved Permanently,0,"[[""Location"",""http://www.google.com/""],[""Conte...",{ab3afb0f-9661-423e-8807-430687139415},http://www.google.com/,2018-06-27T14:19:39.682Z,
4,5,2,2,http://youtube.com/,GET,,301,Moved Permanently,0,"[[""Content-Length"",""0""],[""Location"",""https://y...",{364f2ffa-936f-458b-a895-530ea15681b7},https://youtube.com/,2018-06-27T14:19:39.684Z,


### Add additional columns to help with analysis

In [7]:
# Derive the public suffix + 1 (PS+1) for each URL-bearing column and store it
# in a companion column next to the original.
for source_col, ps1_col in (('url', 'url_ps1'), ('location', 'location_ps1')):
    resps[ps1_col] = resps[source_col].apply(du.get_ps_plus_1)

### Find redirections between different domains

In [8]:
from tqdm import tqdm

# Join with the `site_visits` table so the site URL sits next to each response.
# (site_url is selected for reference; the counting below does not need it.)
query = """SELECT sv.site_url, sv.visit_id,
    res.url, res.location
    FROM http_responses as res LEFT JOIN site_visits as sv
    ON sv.visit_id = res.visit_id
    """
# (source PS+1, destination PS+1) => frequency (num. of distinct visit_ids, i.e. sites)
redirections = defaultdict(int)
# visit_id => set of (source PS+1, destination PS+1) pairs already counted for that visit
redirections_by_visit_id = defaultdict(set)

for row in tqdm(cur.execute(query)):
    url_ps1 = du.get_ps_plus_1(row["url"])
    location_ps1 = du.get_ps_plus_1(row["location"])
    # Only count redirections between different PS+1's; rows whose location
    # yields a falsy PS+1 are skipped as well.
    if not location_ps1 or location_ps1 == url_ps1:
        continue
    pair = (url_ps1, location_ps1)
    seen_on_visit = redirections_by_visit_id[row["visit_id"]]
    # Count each (source, destination) pair at most once per visit, so the
    # totals reflect how many sites showed the redirection, not how often.
    if pair not in seen_on_visit:
        seen_on_visit.add(pair)
        redirections[pair] += 1

153693it [00:06, 22331.00it/s]


### Most frequent redirections between domains

In [9]:
# Only report redirections that were observed on more than this many sites.
MIN_SITES = 10

# Order by count, then by (source, destination), both descending; this is the
# same order as sorting (count, pair) tuples in reverse. `.items()` (instead of
# the Python 2-only `.iteritems()`) and the single-argument print keep this
# cell runnable under both Python 2 and Python 3 with identical output.
ranked = sorted(redirections.items(), key=lambda item: (item[1], item[0]), reverse=True)
for (source_ps1, destination_ps1), count in ranked:
    if count > MIN_SITES:
        print("%s => %s %s" % (source_ps1, destination_ps1, count))

doubleclick.net => google.com 295
google-analytics.com => doubleclick.net 248
everesttech.net => rubiconproject.com 88
doubleclick.net => rubiconproject.com 87
adsrvr.org => rubiconproject.com 78
mathtag.com => rubiconproject.com 73
rubiconproject.com => yahoo.com 70
rubiconproject.com => doubleclick.net 69
yahoo.com => rubiconproject.com 68
everesttech.net => openx.net 68
everesttech.net => pubmatic.com 67
doubleclick.net => pubmatic.com 67
rlcdn.com => pubmatic.com 64
rfihub.com => pubmatic.com 64
adsrvr.org => pubmatic.com 64
doubleclick.net => openx.net 63
everesttech.net => demdex.net 60
adsrvr.org => openx.net 59
turn.com => openx.net 57
quantserve.com => pubmatic.com 56
pubmatic.com => adsymptotic.com 55
adsymptotic.com => pubmatic.com 55
adsymptotic.com => adsrvr.org 54
dotomi.com => pubmatic.com 52
adsrvr.org => adsymptotic.com 52
nexac.com => addthis.com 51
mathtag.com => openx.net 49
google.com => doubleclick.net 49
1rx.io => pubmatic.com 46
mathtag.com => pubmatic.com 42
ad