In [1]:
# ===========
# ENVIRONMENT
# ===========


import os
import sys
import re
import pandas as pd
import numpy as np

from datetime import datetime




# =======
# ACQUIRE
# =======


def get_log():
    """
    Opens the access.log file and breaks it into lines.
    """
    with open('access.log') as f:
        return [line.rstrip('\n') for line in f]




# =======
# PREPARE
# =======


def parse_log(loglines):
    """
    Parses records from the access log and returns a dataframe.
    """

    logs = []

    for logline in loglines:
        log = logline.split(' ')

        ip_address = log[0]

        timestamp = (log[3] + ' ' + log[4]).replace(' ', '')
        timestamp = timestamp.replace(']', '')
        timestamp = timestamp.replace('[', '')

        http_method = (log[5]).replace('\"', '')
        path = (log[6]).replace('\"', '')
        protocol = (log[7]).replace('\"', '')
        status = int(log[8])
        size = int(log[9])
        user_agent = log[11].replace('\"', '')
        
        log_string = [ip_address,
                      timestamp,
                      http_method,
                      path,
                      protocol,
                      status,
                      size,
                      user_agent]
#         print(log_string)

        logs.append(log_string)
        logs

    return pd.DataFrame(logs,columns=['ip_address',
                                      'timestamp',
                                      'http_method',
                                      'path',
                                      'protocol',
                                      'status',
                                      'size',
                                      'user_agent'])


def remove_space(df, column):
    """
    Removes the colon between date and hour.
    """
    return df[column].str.replace(':', ' ', 1)


def convert_to_datetime(df, column):
    """
    Converts string object to datetime object.
    """
    return pd.to_datetime(df[column])


def set_utc(df, locale):
    """
    Converts to UTC time.
    """
    return df.tz_localize('utc').tz_convert(locale)


def process_datetime(df, column, locale):
    """
    Pre-processess timestamp column.
    """
    df[column] = remove_space(df, column)
    df[column] = convert_to_datetime(df, column)
    df = df.set_index(column)
    return set_utc(df, locale)

In [2]:
df = parse_log(get_log())
df = process_datetime(df, 'timestamp', 'America/Chicago')
df.head(10)

Unnamed: 0_level_0,ip_address,http_method,path,protocol,status,size,user_agent
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-16 14:34:42-05:00,97.105.19.58,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0
2019-04-16 14:34:42-05:00,97.105.19.58,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0
2019-04-16 14:34:44-05:00,97.105.19.58,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0
2019-04-16 14:34:46-05:00,97.105.19.58,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0
2019-04-16 14:34:48-05:00,97.105.19.58,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0
2019-04-16 14:34:48-05:00,97.105.19.58,GET,/api/v1/stores,HTTP/1.1,200,1328,python-requests/2.21.0
2019-04-16 14:34:50-05:00,97.105.19.58,GET,/api/v1/sales?page=85,HTTP/1.1,200,510753,python-requests/2.21.0
2019-04-16 14:34:52-05:00,97.105.19.58,GET,/api/v1/sales?page=86,HTTP/1.1,200,510348,python-requests/2.21.0
2019-04-16 14:34:52-05:00,97.105.19.58,GET,/,HTTP/1.1,200,42,python-requests/2.21.0
2019-04-16 14:34:53-05:00,97.105.19.58,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0


In [3]:
df.reset_index(inplace=True)
df.head(10)

Unnamed: 0,timestamp,ip_address,http_method,path,protocol,status,size,user_agent
0,2019-04-16 14:34:42-05:00,97.105.19.58,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0
1,2019-04-16 14:34:42-05:00,97.105.19.58,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0
2,2019-04-16 14:34:44-05:00,97.105.19.58,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0
3,2019-04-16 14:34:46-05:00,97.105.19.58,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0
4,2019-04-16 14:34:48-05:00,97.105.19.58,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0
5,2019-04-16 14:34:48-05:00,97.105.19.58,GET,/api/v1/stores,HTTP/1.1,200,1328,python-requests/2.21.0
6,2019-04-16 14:34:50-05:00,97.105.19.58,GET,/api/v1/sales?page=85,HTTP/1.1,200,510753,python-requests/2.21.0
7,2019-04-16 14:34:52-05:00,97.105.19.58,GET,/api/v1/sales?page=86,HTTP/1.1,200,510348,python-requests/2.21.0
8,2019-04-16 14:34:52-05:00,97.105.19.58,GET,/,HTTP/1.1,200,42,python-requests/2.21.0
9,2019-04-16 14:34:53-05:00,97.105.19.58,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0


In [4]:
def add_year(df, column):
    return df[column].dt.year


def add_quarter(df, column):
    return df[column].dt.quarter


def add_month(df, column):
    return df[column].dt.month


def add_day(df, column):
    return df[column].dt.day


def add_hour(df, column):
    return df[column].dt.hour


def add_weekday(df, column):
    return df[column].dt.weekday


def add_date_columns(df, column):
    df.reset_index(inplace=True)
    df['year'] = add_year(df, column)
    df['quarter'] = add_quarter(df, column)
    df['month'] = add_month(df, column)
    df['day'] = add_day(df, column)
    df['hour'] = add_hour(df, column)
    df['weekday'] = add_weekday(df, column)
    return df

In [5]:
df = add_date_columns(df, 'timestamp')

In [6]:
df.head(10)

Unnamed: 0,index,timestamp,ip_address,http_method,path,protocol,status,size,user_agent,year,quarter,month,day,hour,weekday
0,0,2019-04-16 14:34:42-05:00,97.105.19.58,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0,2019,2,4,16,14,1
1,1,2019-04-16 14:34:42-05:00,97.105.19.58,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0,2019,2,4,16,14,1
2,2,2019-04-16 14:34:44-05:00,97.105.19.58,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0,2019,2,4,16,14,1
3,3,2019-04-16 14:34:46-05:00,97.105.19.58,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0,2019,2,4,16,14,1
4,4,2019-04-16 14:34:48-05:00,97.105.19.58,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0,2019,2,4,16,14,1
5,5,2019-04-16 14:34:48-05:00,97.105.19.58,GET,/api/v1/stores,HTTP/1.1,200,1328,python-requests/2.21.0,2019,2,4,16,14,1
6,6,2019-04-16 14:34:50-05:00,97.105.19.58,GET,/api/v1/sales?page=85,HTTP/1.1,200,510753,python-requests/2.21.0,2019,2,4,16,14,1
7,7,2019-04-16 14:34:52-05:00,97.105.19.58,GET,/api/v1/sales?page=86,HTTP/1.1,200,510348,python-requests/2.21.0,2019,2,4,16,14,1
8,8,2019-04-16 14:34:52-05:00,97.105.19.58,GET,/,HTTP/1.1,200,42,python-requests/2.21.0,2019,2,4,16,14,1
9,9,2019-04-16 14:34:53-05:00,97.105.19.58,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0,2019,2,4,16,14,1
