# WebLog Challenge

## Sessionize by IP

### Clean and create temporary folder.  Unzip data.

In [22]:
import shutil
import os
import gzip

scriptpath = os.getcwd()
tempdir = '.tmp'

if os.path.exists(tempdir):
    shutil.rmtree(tempdir)
os.makedirs(tempdir)

extractedlog = 'web.log'
sourcepath = os.path.join(scriptpath, '../data/2015_07_22_mktplace_shop_web_log_sample.log.gz')
destpath = os.path.join(scriptpath, tempdir, extractedlog)
with gzip.open(sourcepath) as src, open(destpath, 'w') as dest:
    content = src.read()
    dest.write(content)

### Build sessions

In [23]:
from analytics.sessionizer import *
from pyspark import SparkContext

sessionizer = Sessionizer(sc)
sessions_rdd = sessionizer.calc_sessions_from_file(destpath, 15)

### Display first 10 sessions

In [25]:
sessions_rdd.take(10)

[{'end': datetime.datetime(2015, 7, 22, 11, 4, 43, 966443, tzinfo=<iso8601.Utc>),
  'id': -3925200981616695027,
  'ip': u'111.93.191.38',
  'requests': [u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/login',
   u'paytm.com:443/shop/authresponse',
   u'paytm.com:443/shop/v1/frequentorders',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/user/address',
   u'paytm.com:443/shop/user/address',
   u'paytm.com:443/shop/cart',
   u'paytm.com:443/shop/cart/checkout'],
  'start': datetime.datetime(2015, 7, 22, 11, 3, 6, 822295, tzinfo=<iso8601.Utc>)},
 {'end': datetime.datetime(2015, 7, 22, 6, 58, 36, 823521, tzinfo=<iso8601.Utc>),
  'id': -8762026098901585080,
  'ip': u'49.201.18.231',
  'requests': [u'paytm.com:443/shop/g/electronics/the-electronic-store/ces-offer-on-pen-drives'],
  'start': datetime.datetime(2015, 7, 22, 6, 58, 36, 823521, tzinfo=<iso8601.Utc>)},
 {'end': datetime.datetim

## Average Session Time

In [35]:
avg = sessionizer.average_session_time(sessions_rdd)

print 'Average session time(s):' + str(avg)

Average session time(s):100


## Unique Visits per Sesions

In [38]:
visits_per_session = sessionizer.unique_visits_per_session(sessions_rdd)

it = visits_per_session.iteritems()
for i in range(10):
    session = it.next()[1]
    print 'Session ID:' + session['id'] + ', IP:' + session['ip'] + ', Number of Visits:' + str(session['visits']) 

1


TypeError: 'int' object has no attribute '__getitem__'

## Active Users

In [33]:
most_active_sessions = sessionizer.find_engaged_users(sessions_rdd)

for session in most_active_sessions[:10]:
    print 'User:' + session['ip'] + ', Duration(s):' + str(session['duration'].total_seconds()) 

User:52.74.219.71, Duration(s):2069.161909
User:119.81.61.166, Duration(s):2068.84845
User:106.186.23.95, Duration(s):2068.756402
User:125.19.44.66, Duration(s):2068.71247
User:125.20.39.66, Duration(s):2068.320635
User:192.8.190.10, Duration(s):2067.234329
User:54.251.151.39, Duration(s):2067.022968
User:180.211.69.209, Duration(s):2066.960642
User:180.179.213.70, Duration(s):2065.63799
User:203.189.176.14, Duration(s):2065.594344
