# Examine and Prep NASA weblogs

Data: NASA HTTP access logs for July and August 1995, obtained from http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html

In [1]:
from data_fetch_utils import fetch_and_cache
# Fetch the two monthly access-log archives (July and August 1995).
# Each call returns the value that is later passed to gzip.open().
data_file1 = fetch_and_cache(
    "ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz",
    "nasa1.gz",
)
data_file2 = fetch_and_cache(
    "ftp://ita.ee.lbl.gov/traces/NASA_access_log_Aug95.gz",
    "nasa2.gz",
)

Using version already downloaded: Tue Apr 24 23:55:58 2018
MD5 hash of file: b7e1447e521cdb602f51e78a170db542
Using version already downloaded: Wed Apr 25 00:31:29 2018
MD5 hash of file: 5eb26c301d3a7476fbc289e95ed4d4ae


In [2]:
import gzip
import re
from datetime import datetime

# Compiled once instead of on every line. Captures the contents of the FIRST
# [...] group on a line, which is expected to hold the request timestamp.
_TIMESTAMP_RE = re.compile(r"\[([^\]]*)\]")
_TIMESTAMP_FORMAT = "%d/%b/%Y:%H:%M:%S %z"


def extract_dates(fp):
    """Yield the timestamp found on each log line as a timezone-aware datetime.

    Parameters
    ----------
    fp : iterable of bytes
        Raw log lines, e.g. a file object opened in binary mode.

    Yields
    ------
    datetime.datetime
        The parsed contents of the first ``[...]`` group on each line.
        Lines without a bracketed group are skipped.

    Raises
    ------
    ValueError
        If the first bracketed group does not match
        ``%d/%b/%Y:%H:%M:%S %z``.
    """
    for raw_line in fp:
        # Undecodable bytes are dropped rather than aborting the whole parse.
        line = raw_line.decode("UTF8", errors="ignore")
        # search() stops at the first bracket pair; the previous greedy
        # ".*\[" prefix picked the last one, so a request path containing
        # "[...]" was fed to strptime() and raised ValueError.
        m = _TIMESTAMP_RE.search(line)
        if m:
            yield datetime.strptime(m.group(1), _TIMESTAMP_FORMAT)

# Parse every timestamp out of each compressed log. gzip mode "r" yields
# bytes lines, which is what extract_dates() decodes.
with gzip.open(data_file1, "r") as july_log:
    dates1 = [stamp for stamp in extract_dates(july_log)]

with gzip.open(data_file2, "r") as august_log:
    dates2 = [stamp for stamp in extract_dates(august_log)]

In [3]:
import pandas as pd
# Stack both months into one Series with a fresh 0..n-1 index.
dates = pd.concat(
    [pd.Series(dates1), pd.Series(dates2)],
    ignore_index=True,
)

In [4]:
# Express every timestamp as an offset from the earliest one, then convert
# that offset from seconds to milliseconds.
earliest = dates.min()
elapsed = dates.sub(earliest)
elapsed_ms = elapsed.dt.total_seconds().mul(1000)

In [5]:
import matplotlib.pyplot as plt
# Explicit figure/axes with a title and axis labels so the figure can be
# read on its own (the y unit and the index-based x axis were not shown).
fig, ax = plt.subplots()
ax.set(
    title="NASA web log request times",
    xlabel="Log line index",
    ylabel="Time since earliest request (ms)",
)
# Kept as the cell's last expression so the cell still displays the
# list of Line2D objects, as before.
ax.plot(elapsed_ms)

[<matplotlib.lines.Line2D at 0x1252960f0>]

In [6]:
import pickle
# Persist the millisecond offsets (the Series' underlying array, without its
# index) as a gzip-compressed pickle.
with gzip.open("nasa_weblog_times.pkl.gz", "wb") as out_file:
    offsets = elapsed_ms.values
    pickle.dump(offsets, out_file)