In [None]:
import gzip
import os
import sys
import re

from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook

import dask
import dask.dataframe as dd
import pandas as pd

import pytz
from datetime import datetime

In [None]:
from dask import compute, delayed
from dask.distributed import Client, LocalCluster

In [None]:
# Create a dask.distributed Client with default arguments.
client = Client()

# Bare expression: shows the client's notebook representation as the cell output.
client

In [None]:
# Close the client created in the previous cell.
# NOTE(review): this runs before any work is scheduled, so the later
# dask.compute call does not go through this client - confirm that is intended.
client.close()

In [None]:
# Directories whose entries are handed to parse_file.
INPUT_DIRS = [
    "in/dataweb-01/logs",
    "in/dataweb-01/nginx",
    "in/dataweb-03/logs",
    "in/dataweb-03/nginx"
]

# NOTE(review): this second assignment replaces the list above, so only this
# single directory is processed when the cell runs top to bottom. Presumably a
# reduced set for experimentation - confirm before a full run.
INPUT_DIRS = [
    "in/dataweb-01/logs/moved-2018-07-20",
]

In [None]:
def parse_file(folder, file):
    """Parse one access-log file into a DataFrame and save it as CSV.

    Each line is searched for a combined-log-style record: client IPv4
    address, bracketed timestamp, a GET/POST request over HTTP/1.1, status
    code, bytes sent, referrer and user agent. Lines that do not match the
    pattern are skipped.

    Parameters
    ----------
    folder : str
        Directory that contains the log file.
    file : str
        File name inside ``folder``. Names ending in ``.gz`` are read through
        ``gzip``; everything else is read as a plain text file.

    Returns
    -------
    pandas.DataFrame or None
        One row per matching line with the columns ``ipaddress``,
        ``dateandtime`` (parsed into a timestamp), ``url``, ``statuscode``,
        ``bytessent``, ``refferer`` and ``useragent``. ``None`` when no line
        matched; in that case no CSV is written.

    Side effects
    ------------
    Writes ``<folder>/<file>.csv`` (next to the input file) when at least one
    line matched.
    """
    lineformat = re.compile(r"""(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} (\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) (?P<statuscode>\d{3}) (?P<bytessent>\d+) (?P<refferer>-|"([^"]+)") (["](?P<useragent>[^"]+)["])""", re.IGNORECASE)

    path = os.path.join(folder, file)
    opener = gzip.open if file.endswith(".gz") else open

    logs = []
    # Text mode is essential for gzip: its default mode is binary, and calling
    # str() on a bytes line yields the "b'...'" repr, which injects backslash
    # escapes (\' and \xNN) into the extracted fields. errors="replace" keeps a
    # stray non-UTF-8 byte in a request from aborting the whole file.
    with opener(path, "rt", encoding="utf-8", errors="replace") as logfile:
        # Iterate lazily instead of readlines() so large logs are streamed.
        for line in logfile:
            data = lineformat.search(line)
            if data is None:
                continue

            d = data.groupdict()
            d["dateandtime"] = pd.to_datetime(d["dateandtime"], format='%d/%b/%Y:%H:%M:%S %z')
            logs.append(d)

    if not logs:
        return None

    df = pd.DataFrame(logs)
    df.to_csv("{}/{}.csv".format(folder, file))

    return df

In [None]:
# One lazy parse_file task per log file. Only regular files are scheduled:
# os.listdir also returns subdirectories, which parse_file cannot open, and
# the "<file>.csv" outputs parse_file writes into the same folder, which
# would otherwise be fed back in when this cell is re-run.
values = [
    delayed(parse_file)(folder, file)
    for folder in INPUT_DIRS
    for file in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, file)) and not file.endswith(".csv")
]

In [None]:
# Execute all delayed parse_file tasks. The returned tuple (one DataFrame or
# None per file) is not assigned, so it only appears as the cell output.
dask.compute(*values)

In [None]:
# Load every CSV matching the glob into a single dask DataFrame.
# NOTE(review): parse_file writes its CSVs next to the input files under the
# INPUT_DIRS folders, not under out/access - confirm where these files come from.
df = dd.read_csv('out/access/*/*.csv')

In [None]:
# Total number of rows in the loaded data.
len(df)

In [None]:
# Number of columns in the loaded data.
len(df.columns)

In [None]:
# Column labels of the loaded data.
df.columns

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Sample access-log timestamp (no surrounding brackets) used by the parsing
# experiments in the following cells.
test = "21/Jan/2019:18:18:43 +0100"

In [None]:
def parse_datetime(t):
    """Parse an access-log timestamp into a timezone-aware ``datetime``.

    Accepts the timestamp both with and without the surrounding square
    brackets used in the log line, e.g. ``"[21/Jan/2019:18:18:43 +0100]"`` or
    ``"21/Jan/2019:18:18:43 +0100"``.

    The previous fixed-position slicing only worked for the bracketed form:
    on an unbracketed string it silently dropped the first and last digit
    (day 21 became 1, second 43 became 4) and mis-read the offset. It also
    added the minutes of a negative offset instead of subtracting them
    (``-0330`` became -150 minutes). Parsing the offset with ``%z`` handles
    sign and minutes correctly.

    Parameters
    ----------
    t : str
        Timestamp in ``%d/%b/%Y:%H:%M:%S %z`` layout, optionally wrapped in
        ``[`` and ``]``.

    Returns
    -------
    datetime.datetime
        Aware datetime carrying the fixed UTC offset from the input. The
        ``tzinfo`` is a standard-library ``datetime.timezone``.

    Raises
    ------
    ValueError
        If the text does not match the expected layout.
    """
    stamp = t.strip().strip("[]")
    return datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S %z')

# Try the hand-written parser on the sample timestamp.
parse_datetime(test)

In [None]:
# Parse the same sample with pandas, for comparison with parse_datetime.
pd.to_datetime(test, format='%d/%b/%Y:%H:%M:%S %z')

In [None]:
# (name, dtype) description of the Series that map_partitions is told to expect.
# NOTE(review): parse_dateandtime parses with %z, so its values carry a UTC
# offset - confirm that the naive 'datetime64[ns]' dtype is the intended meta.
meta = ('time', 'datetime64[ns]')

def parse_dateandtime(df):
    """Convert the ``dateandtime`` column of ``df`` to pandas timestamps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dateandtime`` column of access-log timestamp strings
        in ``%d/%b/%Y:%H:%M:%S %z`` layout.

    Returns
    -------
    pandas.Series
        The parsed timestamps, index-aligned with ``df``.
    """
    raw_stamps = df["dateandtime"]
    parsed = pd.to_datetime(raw_stamps, format='%d/%b/%Y:%H:%M:%S %z')
    return parsed

# Apply parse_dateandtime to every partition and materialize the result.
# The result is only displayed; it is not assigned, so df keeps its columns.
df.map_partitions(parse_dateandtime, meta=meta).compute()

In [None]:
# Preview df again; the previous cell did not assign its result back to df.
df.head()

In [None]:
# Column dtypes recorded on the dask DataFrame's metadata (private _meta attribute).
df._meta.dtypes

In [None]:
# Whether dask knows the index boundaries (divisions) of the partitions.
df.known_divisions

In [None]:
# NOTE(review): client was already closed in an earlier cell and is not
# re-created in between - confirm this second close is still needed.
client.close()