In [1]:
import re
import pandas as pd
import numpy as np
from StringIO import StringIO
import Tkinter
import tkFileDialog

In [2]:
def sanitizer(dFile):

    pattern = r"(\" )(?=HTTP)"  # remove triple quoted lines

    with open(dFile) as d:
        raw = d.read()

    f_txt = re.sub(pattern, " ", raw) # remove the extra quote

    x = StringIO(f_txt) # Load data into StringIO

    df = pd.read_csv(x, sep="\s+", header=None, na_values="-")  # separate by space.
    df.columns = ['IP_address', 'time_stamp', 'request', 'status', 'bytes']

    df['time_stamp'] += "1995-08" # Add Year and Month
    df['time_stamp'] = pd.to_datetime(df['time_stamp'], format="[%d:%H:%M:%S]%Y-%m")

    return df

In [3]:
# Select file using Tkinter GUI dialog box
tGUI = Tkinter.Tk()
tGUI.withdraw()
filePath = tkFileDialog.askopenfilename(parent=tGUI)

In [26]:
# Sanitize txt file to get clean data
cleanData = sanitizer(filePath)

# View Clean Data
cleanData.head(5)




Unnamed: 0,IP_address,time_stamp,request,status,bytes
0,141.243.1.172,1995-08-29 23:53:25,GET /Software.html HTTP/1.0,200,1497.0
1,query2.lycos.cs.cmu.edu,1995-08-29 23:53:36,GET /Consumer.html HTTP/1.0,200,1325.0
2,tanuki.twics.com,1995-08-29 23:53:53,GET /News.html HTTP/1.0,200,1014.0
3,wpbfl2-45.gate.net,1995-08-29 23:54:15,GET / HTTP/1.0,200,4889.0
4,wpbfl2-45.gate.net,1995-08-29 23:54:16,GET /icons/circle_logo_small.gif HTTP/1.0,200,2624.0


In [27]:
# 1. Which hostname or IP address made the most requests?
print "IP_address or hostname with most requests:"
print cleanData['IP_address'].value_counts()[:1]

IP_address or hostname with most requests:
sandy.rtptok1.epa.gov    294
Name: IP_address, dtype: int64


In [16]:
# 2. Which hostname or IP address received the most total bytes from the server? How many bytes did it receive?
print "IP_address and hostname with most total bytes:"
mostBytes = cleanData.sort_values(by='bytes', axis=0, ascending=False, inplace=False)
mostBytes.head(1)

IP_address and hostname with most total bytes:


Unnamed: 0,IP_address,time_stamp,request,status,bytes
20020,139.121.98.45,1995-08-30 13:04:57,GET /docs/TRI_Cover93/1993-only/ca93.e00 HTTP/1.0,200,4816896.0


In [22]:
#3. During what hour was the server the busiest in terms of requests?
time_1 = cleanData.groupby(cleanData['time_stamp'].dt.hour)
group_size = time_1.size()
group_size.sort_values(inplace=True, ascending=False)
print "Busiest server hour | number of requests:"
print "                 " + str(group_size[:1])

Busiest server hour | number of requests:
                 time_stamp
14    4716
dtype: int64


In [24]:
#4 Which .gif image was downloaded the most during the day?
sub = cleanData[(cleanData.status == 200) & (cleanData.request.str.contains('\\.gif'))]
print "                Most downloaded gif image | download count"
print sub.request.value_counts()[:1]

                Most downloaded gif image | download count
GET /icons/circle_logo_small.gif HTTP/1.0    2465
Name: request, dtype: int64


In [25]:
#5 What HTTP reply codes were sent other than 200?
http_s = cleanData[cleanData.status != 200]
print "Codes | counts:"
print http_s.status.value_counts()

Codes | counts:
304    5300
302    4506
404     611
501     272
403     272
500      69
400       6
Name: status, dtype: int64
