In [1]:
# Open the sample access log; each line records one client request sent to a running web server.
log = open('example_log.txt')

In [2]:
lines = list(log)  # read every remaining line of the open log file into memory
lines[:5]  # preview the first five lines

['200.155.108.44 - - [30/Nov/2017:11:59:54 +0000] "PUT /categories/categories/categories HTTP/1.1" 401 963 "http://www.yates.com/list/tags/category/" "Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\n',
 '36.139.255.202 - - [30/Nov/2017:11:59:54 +0000] "PUT /search HTTP/1.1" 404 171 "https://www.butler.org/main/tag/category/home.php" "Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) AppleWebKit/5332 (KHTML, like Gecko) Chrome/15.0.813.0 Safari/5332"\n',
 '50.112.115.219 - - [30/Nov/2017:11:59:54 +0000] "POST /main/blog HTTP/1.1" 404 743 "http://deleon-bender.com/categories/category.html" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 rv:2.0; apn-IN) AppleWebKit/531.48.1 (KHTML, like Gecko) Version/4.0 Safari/531.48.1"\n',
 '204.132.56.4 - - [30/Nov/2017:11:59:54 +0000] "POST /list HTTP/1.1" 404 761 "http://smith.com/category.htm" "Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Presto/2.9.163 Version/12.00"\n',
 '233.154.7.24 - - [30/Nov/2017:11:59:

In [3]:
len(lines) # number of lines read from the log file

10000

In [4]:
lines[0] # the first raw log line, exactly as read from the file

'200.155.108.44 - - [30/Nov/2017:11:59:54 +0000] "PUT /categories/categories/categories HTTP/1.1" 401 963 "http://www.yates.com/list/tags/category/" "Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\n'

In [2]:
# Parsing helpers: split each log line on whitespace, extract its fields, and clean them along the way (strip quotes, convert types).
from datetime import datetime

def strip_quotes(x):
    """Return ``x`` with every double-quote character removed."""
    # Splitting on the quote and re-joining with nothing drops each occurrence.
    return ''.join(x.split('"'))

def parse_time(time_str):
    """
    Convert a bracketed log timestamp into a datetime object.

    The expected layout is ``[30/Nov/2017:11:59:54 +0000]``. Because the
    format string contains ``%z``, the returned datetime carries the UTC
    offset found in the input.
    """
    log_time_format = '[%d/%b/%Y:%H:%M:%S %z]'
    return datetime.strptime(time_str, log_time_format)

def parse_log(log):
    """
    Lazily parse an iterable of access-log lines into tuples of fields.

    Each line is split on whitespace and the fields are taken by position:
    token 0 is the client address, tokens 3-4 form the bracketed timestamp,
    token 5 is the request method, token 6 the request path, tokens 8 and 9
    the status code and response size, token 10 the referrer, and everything
    from token 11 onwards is re-joined into the user agent. Tokens 1, 2 and 7
    are not used.

    Lines that are empty or contain only whitespace (for example a trailing
    blank line at the end of the file) are skipped instead of raising
    ``IndexError``.

    Args:
        log: iterable of strings, one log entry per item.

    Yields:
        tuple: ``(remote_addr, time_local, request_type, request_path,
        status, body_bytes_sent, http_referrer, http_user_agent)`` where
        ``time_local`` is the value returned by ``parse_time`` and
        ``status`` / ``body_bytes_sent`` are ints.
    """
    for line in log:
        split_line = line.split()
        if not split_line:
            # Blank line: there are no fields to extract.
            continue
        remote_addr = split_line[0]
        # The timestamp contains a space, so it arrives as two tokens.
        time_local = parse_time(split_line[3] + " " + split_line[4])
        request_type = strip_quotes(split_line[5])
        request_path = split_line[6]
        status = int(split_line[8])
        body_bytes_sent = int(split_line[9])
        http_referrer = strip_quotes(split_line[10])
        # The user agent itself contains spaces; stitch its tokens back together.
        http_user_agent = strip_quotes(" ".join(split_line[11:]))
        yield (
            remote_addr, time_local, request_type, request_path,
            status, body_bytes_sent, http_referrer, http_user_agent
        )

In [3]:
import itertools ,csv
# function to save the lines into a csv file along with header names for the columns
def build_csv(lines, header=None, file=None):
    """
    Write rows to ``file`` as CSV and rewind it so it can be read back.

    Args:
        lines: iterable of rows, each a sequence of field values.
        header: optional sequence of column names, written as the first row.
        file: writable, seekable text file object to write into. When
            omitted, an in-memory ``io.StringIO`` buffer is created, so the
            default no longer fails inside ``csv.writer``.

    Returns:
        The file object that was written to, positioned at offset 0.
    """
    if file is None:
        import io  # local import: this cell only imports itertools and csv
        file = io.StringIO(newline='')

    if header:
        # Prepend the header lazily so ``lines`` may stay a generator.
        lines = itertools.chain([header], lines)

    csv.writer(file).writerows(lines)
    file.seek(0)
    return file

In [4]:
# Function to count the unique request types in the log file. The output is a dictionary mapping each unique request type to its count.
def count_unique_request(csv_file):
    """
    Tally how many rows carry each value of the ``request_type`` column.

    The first row of ``csv_file`` is treated as the header and is used to
    locate the ``request_type`` column; every following row adds one to the
    count for the value found in that column.

    Returns:
        dict mapping each request type to the number of rows containing it.
    """
    rows = csv.reader(csv_file)
    column = next(rows).index('request_type')

    counts = {}
    for row in rows:
        key = row[column]
        counts[key] = counts.get(key, 0) + 1
    return counts

In [5]:
import io
    
# NOTE(review): nothing reads this handle before `log` is rebound further down
# in this cell, so it stays open unused — consider removing this line.
log = open('example_log.txt')
class Pipeline:
    """
    An ordered list of single-argument callables run one after another.

    Callables are registered with the ``task`` decorator; ``run`` feeds an
    initial value to the first callable and passes each result on to the
    next one.
    """

    def __init__(self):
        # Registered callables, in the order ``run`` will call them.
        self.tasks = []

    def task(self, depends_on=None):
        """
        Return a decorator that registers a function in the pipeline.

        Without ``depends_on`` the function is inserted at the front of the
        task list; otherwise it is inserted directly after ``depends_on``,
        which must already be registered (``list.index`` raises
        ``ValueError`` if it is not). The decorated function is returned
        unchanged.
        """
        # The slot is resolved here, when the decorator is created, not when
        # it is later applied to the function.
        position = self.tasks.index(depends_on) + 1 if depends_on else 0

        def register(func):
            self.tasks.insert(position, func)
            return func

        return register

    def run(self, input_):
        """Pass ``input_`` through every task in order and return the final result."""
        result = input_
        for step in self.tasks:
            result = step(result)
        return result
    
# Pipeline instance that the @pipeline.task decorators below register their functions on.
pipeline = Pipeline()

@pipeline.task()
def parse_logs(logs):
    """Pipeline stage registered without a dependency: pass ``logs`` to ``parse_log`` and return its result."""
    return parse_log(logs)

@pipeline.task(depends_on=parse_logs)
def build_raw_csv(lines):
    """
    Pipeline stage after ``parse_logs``: write the parsed rows to 'temporary.csv'.

    Returns the open file object handed back by ``build_csv`` so that the
    following stage can read the rows from it.
    """
    # 'w+' opens the file for both writing and reading through one handle;
    # newline='' leaves line endings to the csv writer.
    raw_csv = open('temporary.csv', 'w+', newline='')
    columns = [
        'ip', 'time_local', 'request_type',
        'request_path', 'status', 'bytes_sent',
        'http_referrer', 'http_user_agent',
    ]
    return build_csv(lines, header=columns, file=raw_csv)

@pipeline.task(depends_on=build_raw_csv)
def count_uniques(csv_file):
    """
    Pipeline stage after ``build_raw_csv``: count rows per request type.

    ``csv_file`` is the open file object produced by the previous stage. It
    is closed as soon as counting finishes: only the resulting dict is
    returned from this stage, so leaving the handle open would leak it.

    Returns:
        dict mapping each request type to its number of occurrences.
    """
    with csv_file:
        return count_unique_request(csv_file)

# Open the log inside a context manager so the handle is closed once the
# pipeline has finished. run() must be called inside the block: parse_log is a
# lazy generator and only reads the file while the CSV stage consumes it.
with open('example_log.txt') as log:
    unique_request_count = pipeline.run(log)
print('unique_request_count: ',unique_request_count)

unique_request_count:  {'PUT': 3367, 'POST': 3299, 'GET': 3334}


In [193]:
pipeline.tasks # the registered tasks, in the order run() calls them

[<function __main__.parse_logs(logs)>,
 <function __main__.build_raw_csv(lines)>,
 <function __main__.count_uniques(csv_file)>]

In [6]:
import pandas as pd
# Load the CSV that the build_raw_csv stage wrote to disk and preview its first rows.
new = pd.read_csv('temporary.csv')
new.head()

Unnamed: 0,ip,time_local,request_type,request_path,status,bytes_sent,http_referrer,http_user_agent
0,200.155.108.44,2017-11-30 11:59:54+00:00,PUT,/categories/categories/categories,401,963,http://www.yates.com/list/tags/category/,Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHT...
1,36.139.255.202,2017-11-30 11:59:54+00:00,PUT,/search,404,171,https://www.butler.org/main/tag/category/home.php,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) A...
2,50.112.115.219,2017-11-30 11:59:54+00:00,POST,/main/blog,404,743,http://deleon-bender.com/categories/category.html,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 ...
3,204.132.56.4,2017-11-30 11:59:54+00:00,POST,/list,404,761,http://smith.com/category.htm,Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Pr...
4,233.154.7.24,2017-11-30 11:59:54+00:00,GET,/app,404,526,http://www.cherry.com/main.htm,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
