# WebLog Challenge

## Sessionize by IP

### Clean and create temporary folder.  Unzip data.

In [1]:
import shutil
import os
import gzip

scriptpath = os.getcwd()
tempdir = '.tmp'

# Start every run from an empty scratch directory so stale extracts never leak in.
if os.path.exists(tempdir):
    shutil.rmtree(tempdir)
os.makedirs(tempdir)

extractedlog = 'web.log'
sourcepath = os.path.join(scriptpath, '../data/2015_07_22_mktplace_shop_web_log_sample.log.gz')
destpath = os.path.join(scriptpath, tempdir, extractedlog)

# Decompress as a raw byte stream.  The destination must be opened in binary
# mode: gzip yields bytes, and a text-mode handle would translate line endings
# on Windows (Python 2) or reject the bytes outright (Python 3).  copyfileobj
# streams in chunks, so the whole log never has to fit in memory.
with gzip.open(sourcepath, 'rb') as src, open(destpath, 'wb') as dest:
    shutil.copyfileobj(src, dest)

### Build sessions

In [2]:
from analytics.sessionizer import *
from pyspark import SparkContext

# Reuse the SparkContext the kernel already provides (pyspark shell / kernel),
# or create one, so this cell also works after Restart & Run All on a plain
# Python kernel where no `sc` has been injected.
sc = SparkContext.getOrCreate()

# Second argument of calc_sessions_from_file.
# NOTE(review): presumably the session inactivity window in minutes — confirm
# against analytics.sessionizer.
SESSION_WINDOW = 15

sessionizer = Sessionizer(sc)
sessions_rdd = sessionizer.calc_sessions_from_file(destpath, SESSION_WINDOW)

### Display first 10 sessions

In [3]:
# Preview the first 10 sessions.  Left as a bare last expression so the
# notebook renders the returned value as the cell output.
sessions_rdd.take(10)

[{'end': datetime.datetime(2015, 7, 22, 11, 4, 43, 966443, tzinfo=<iso8601.Utc>),
  'id': -3925200981616695027,
  'ip': u'111.93.191.38',
  'requests': [u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/login',
   u'paytm.com:443/shop/authresponse',
   u'paytm.com:443/shop/v1/frequentorders',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/user/address',
   u'paytm.com:443/shop/user/address',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart/checkout'],
  'start': datetime.datetime(2015, 7, 22, 11, 3, 6, 822295, tzinfo=<iso8601.Utc>)},
 {'end': datetime.datetime(2015, 7, 22, 6, 58, 36, 823521, tzinfo=<iso8601.Utc>),
  'id': -8762026098901585080,
  'ip': u'49.201.18.231',
  'requests': [u'paytm.com:443/shop/g/electronics/the-electronic-store/ces-offer-on-pen-drives'],
  'start': datetime.datetime(2015, 7, 22, 6, 58, 36, 823521, tzinfo=<iso8601.Utc>)},
 {'end': datetime.datetim

## Average Session Time

In [4]:
avg = sessionizer.average_session_time(sessions_rdd)

# Single-argument print(...) emits the same text under the Python 2 print
# statement and the Python 3 print function.
print('Average session time(s):' + str(avg))

Average session time(s):100


## Unique Visits per Session

In [7]:
visits_per_session = sessionizer.unique_visits_per_session(sessions_rdd)

# Print only the first 10 entries to keep the cell output short.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for session in visits_per_session[:10]:
    print('Session ID:' + str(session['id'])
          + ', IP:' + session['ip']
          + ', Number of Visits:' + str(session['visits']))

Session ID:-2182577144418974215, IP:52.74.219.71, Number of Visits:9530
Session ID:8773142274920106917, IP:119.81.61.166, Number of Visits:8014
Session ID:-1796594902152953624, IP:52.74.219.71, Number of Visits:5478
Session ID:-2990558728851149191, IP:106.186.23.95, Number of Visits:4656
Session ID:-4783974076483672580, IP:119.81.61.166, Number of Visits:3928
Session ID:-6829445129383477962, IP:119.81.61.166, Number of Visits:3637
Session ID:5912819856795839131, IP:119.81.61.166, Number of Visits:3334
Session ID:3795301233123946046, IP:52.74.219.71, Number of Visits:2907
Session ID:6628773623232956121, IP:119.81.61.166, Number of Visits:2841
Session ID:737435597002797831, IP:119.81.61.166, Number of Visits:2786


## Active Users

In [None]:
most_active_sessions = sessionizer.find_engaged_users(sessions_rdd)

# Print only the first 10 entries to keep the cell output short.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for session in most_active_sessions[:10]:
    print('User:' + session['ip'] + ', Duration(s):' + str(session['length']))