In [13]:
# Open the raw web-server access log; each line of 'example_log.txt' records one client request.
# NOTE(review): the handle is never closed, and a file object is consumed as it is iterated,
# so later cells that read `log` depend on the order in which the cells were run.
log = open('example_log.txt')

In [2]:
lines = list(log)  # read every remaining line of the log into a list
lines[:5]  # preview the first five lines

['200.155.108.44 - - [30/Nov/2017:11:59:54 +0000] "PUT /categories/categories/categories HTTP/1.1" 401 963 "http://www.yates.com/list/tags/category/" "Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\n',
 '36.139.255.202 - - [30/Nov/2017:11:59:54 +0000] "PUT /search HTTP/1.1" 404 171 "https://www.butler.org/main/tag/category/home.php" "Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) AppleWebKit/5332 (KHTML, like Gecko) Chrome/15.0.813.0 Safari/5332"\n',
 '50.112.115.219 - - [30/Nov/2017:11:59:54 +0000] "POST /main/blog HTTP/1.1" 404 743 "http://deleon-bender.com/categories/category.html" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 rv:2.0; apn-IN) AppleWebKit/531.48.1 (KHTML, like Gecko) Version/4.0 Safari/531.48.1"\n',
 '204.132.56.4 - - [30/Nov/2017:11:59:54 +0000] "POST /list HTTP/1.1" 404 761 "http://smith.com/category.htm" "Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Presto/2.9.163 Version/12.00"\n',
 '233.154.7.24 - - [30/Nov/2017:11:59:

In [3]:
len(lines) # total number of lines read from the log file

10000

In [4]:
# Inspect the first raw log line to see the layout of its fields.
lines[0]

'200.155.108.44 - - [30/Nov/2017:11:59:54 +0000] "PUT /categories/categories/categories HTTP/1.1" 401 963 "http://www.yates.com/list/tags/category/" "Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\n'

In [14]:
# Clean each field while parsing the log in parse_log:
# - convert the time_local field to a datetime object,
# - strip the quotes from request_type, http_referrer, and http_user_agent,
# - parse status and body_bytes_sent as int.
from datetime import datetime
#import datetime
def strip_quotes(x):
    """Return the string ``x`` with every double-quote character removed."""
    pieces = x.split('"')
    return "".join(pieces)

def parse_time(time_str):
    """
    Convert a bracketed log timestamp into a datetime object.

    The input is expected to look like [30/Nov/2017:11:59:54 +0000];
    the square brackets are part of the expected pattern, and the
    trailing UTC offset is parsed into the result's tzinfo.
    """
    log_time_format = '[%d/%b/%Y:%H:%M:%S %z]'
    return datetime.strptime(time_str, log_time_format)

def parse_log(log):
    """
    Lazily parse access-log lines into tuples of cleaned fields.

    Each line is split on whitespace and the fields are picked by
    position, so a line is expected to look like:

        200.155.108.44 - - [30/Nov/2017:11:59:54 +0000] "PUT /search HTTP/1.1" 401 963 "http://..." "Mozilla/5.0 ..."

    Blank or whitespace-only lines are skipped.

    Args:
        log: an iterable of log lines (for example an open file or a
            list of strings).

    Yields:
        tuple: (remote_addr, time_local, request_type, request_path,
        status, body_bytes_sent, http_referrer, http_user_agent), where
        time_local is a datetime, status and body_bytes_sent are ints,
        and the quoted fields have their double quotes removed.

    Raises:
        IndexError: if a non-blank line has fewer fields than expected.
        ValueError: if the timestamp, status, or byte count cannot be
            parsed.
    """
    for line in log:
        split_line = line.split()
        if not split_line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # has no fields; skip it instead of failing on split_line[0].
            continue
        remote_addr = split_line[0]
        # The timestamp is split in two by the space before the UTC offset.
        time_local = parse_time(split_line[3] + " " + split_line[4])
        request_type = strip_quotes(split_line[5])
        request_path = split_line[6]
        # split_line[7] is the protocol token (e.g. HTTP/1.1") and is not kept.
        status = int(split_line[8])
        body_bytes_sent = int(split_line[9])
        http_referrer = strip_quotes(split_line[10])
        # The user agent contains spaces, so it is everything that remains.
        http_user_agent = strip_quotes(" ".join(split_line[11:]))
        yield (
            remote_addr, time_local, request_type, request_path,
            status, body_bytes_sent, http_referrer, http_user_agent
        )

In [12]:
# Take only the first record produced by the parse_log generator as a quick check of its output.
first_line = next(parse_log(log))
first_line

('200.155.108.44',
 datetime.datetime(2017, 11, 30, 11, 59, 54, tzinfo=datetime.timezone.utc),
 'PUT',
 '/categories/categories/categories',
 401,
 963,
 'http://www.yates.com/list/tags/category/',
 'Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332')

In [15]:
# Writing `lines = [header] + [l for l in lines]` would turn `lines` into a list. To keep its lazy generator behavior and still put a header row first, use itertools.chain instead.
import itertools,csv
def build_csv(lines, header=None, file=None):
    """
    Write rows to ``file`` as comma-separated values and rewind it.

    Args:
        lines: iterable of rows (each row an iterable of field values);
            it is consumed lazily, so a generator works.
        header: optional row written before the rows from ``lines``.
        file: writable, seekable file object that receives the CSV text.

    Returns:
        The same ``file`` object, positioned back at offset 0.
    """
    # Chain the header in front so ``lines`` is never turned into a list.
    rows = itertools.chain([header], lines) if header else lines
    csv.writer(file, delimiter=',').writerows(rows)
    file.seek(0)
    return file

In [16]:
# Parse the log lazily and stream the cleaned records into 'example.csv'.
parsed = parse_log(log)
# 'w+' opens the file for both writing and reading; newline='' leaves line endings to the csv writer.
# NOTE(review): this handle is never closed in this cell.
file = open('example.csv', 'w+',newline='')
csv_file = build_csv(parsed,header=['ip','time_local','request_type','request_path','status','bytes_sent','http_referrer','http_user_agent'],file=file)
# build_csv seeks back to the start of the file, so readlines() returns everything just written.
contents=csv_file.readlines()
contents[:5]

['ip,time_local,request_type,request_path,status,bytes_sent,http_referrer,http_user_agent\r\n',
 '200.155.108.44,2017-11-30 11:59:54+00:00,PUT,/categories/categories/categories,401,963,http://www.yates.com/list/tags/category/,"Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\r\n',
 '36.139.255.202,2017-11-30 11:59:54+00:00,PUT,/search,404,171,https://www.butler.org/main/tag/category/home.php,"Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) AppleWebKit/5332 (KHTML, like Gecko) Chrome/15.0.813.0 Safari/5332"\r\n',
 '50.112.115.219,2017-11-30 11:59:54+00:00,POST,/main/blog,404,743,http://deleon-bender.com/categories/category.html,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 rv:2.0; apn-IN) AppleWebKit/531.48.1 (KHTML, like Gecko) Version/4.0 Safari/531.48.1"\r\n',
 '204.132.56.4,2017-11-30 11:59:54+00:00,POST,/list,404,761,http://smith.com/category.htm,Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Presto/2.9.163 Version/12.00\r\n']

In [17]:
import pandas as pd
# Load 'example.csv' into a DataFrame and preview its first rows.
newdf = pd.read_csv('example.csv')
newdf.head()

Unnamed: 0,ip,time_local,request_type,request_path,status,bytes_sent,http_referrer,http_user_agent
0,200.155.108.44,2017-11-30 11:59:54+00:00,PUT,/categories/categories/categories,401,963,http://www.yates.com/list/tags/category/,Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHT...
1,36.139.255.202,2017-11-30 11:59:54+00:00,PUT,/search,404,171,https://www.butler.org/main/tag/category/home.php,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) A...
2,50.112.115.219,2017-11-30 11:59:54+00:00,POST,/main/blog,404,743,http://deleon-bender.com/categories/category.html,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 ...
3,204.132.56.4,2017-11-30 11:59:54+00:00,POST,/list,404,761,http://smith.com/category.htm,Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Pr...
4,233.154.7.24,2017-11-30 11:59:54+00:00,GET,/app,404,526,http://www.cherry.com/main.htm,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...


In [18]:
newdf["request_type"].value_counts()  # how many requests there are of each request type

PUT     3367
GET     3334
POST    3299
Name: request_type, dtype: int64