# Parser for Apache access log

In [8]:
import pandas as pd
from apachelogs import LogParser
from tqdm.notebook import tqdm
import time
import os, glob
import json

### Parsing each line and adding it to df

I'm going to parse each line and build a Pandas DataFrame to work with the data.
To keep memory usage low, I flush the accumulated rows to a temporary CSV file every 1M rows, and afterwards combine these CSV files into a single large one.

In [4]:
# Parser for the Apache "combined" log format: host, identity, user, time,
# request line, final status, bytes sent, Referer and User-Agent headers.
parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"")

cols = ["ip", "date", "method", "code", "size", "useragent", "url", "http_v"]
FLUSH_EVERY = 1_000_000  # rows kept in memory before they are written to a temp csv
rows = []
counter = 1  # numeric suffix of the next temporary csv file
with open("access2.log", "r") as fp:
    total = sum(1 for line in fp)  # line count, only needed to size the progress bar
    fp.seek(0)  # rewind so the parser starts from the first line again
    for row in tqdm(parser.parse_lines(fp), total=total):
        # The request line is expected to look like "METHOD URL HTTP/x.y".
        # NOTE(review): apachelogs appears to report a request logged as "-"
        # as None; the `or ""` guard keeps such entries instead of crashing
        # on None.split - confirm against the installed apachelogs version.
        req = (row.request_line or "").split(" ")
        if len(req) == 3:
            method, url, http_v = req
        else:
            # Malformed or missing request line: keep the entry, blank the parts
            method = url = http_v = None

        if "User-Agent" in row.headers_in:
            useragent = row.headers_in["User-Agent"]
        else:
            useragent = None

        rows.append({
            "ip": row.remote_host,
            "date": row.request_time,
            "method": method,
            "code": row.final_status,
            "size": row.bytes_sent,
            "useragent": useragent,
            "url": url,
            "http_v": http_v,
        })

        # Every FLUSH_EVERY rows, write the buffer to its own csv and drop it
        # from memory; the partial files are merged in a later cell.
        if len(rows) % FLUSH_EVERY == 0:
            df = pd.DataFrame(rows, columns=cols)
            df.to_csv("access_" + str(counter) + ".csv", sep="\t")
            counter = counter + 1
            rows = []

  0%|          | 0/594168 [00:00<?, ?it/s]

In [6]:
# Final flush: write the rows collected since the last temp csv was written
df = pd.DataFrame(data=rows, columns=cols)
df.to_csv(f"access_{counter}.csv", sep="\t")
counter += 1
rows = []

In [7]:
# Merge every temporary access_<n>.csv into one DataFrame.
def _chunk_number(path):
    """Return the numeric suffix of an access_<n>.csv path (inf if it has none)."""
    suffix = os.path.splitext(os.path.basename(path))[0].rsplit("_", 1)[-1]
    return int(suffix) if suffix.isdigit() else float("inf")

# glob returns files in arbitrary order; sorting by chunk number keeps the rows
# in the order they were parsed (access_2 before access_10).
files = sorted(glob.glob("access_*.csv"), key=_chunk_number)

# The temp files were written together with their index, so the first column
# is that index: index_col=0 stops it from becoming an "Unnamed: 0" data column.
# ignore_index=True gives the merged frame one continuous index instead of
# repeating each chunk's own 0..n-1 labels.
df = pd.concat(
    (pd.read_csv(f, sep="\t", index_col=0) for f in files),
    ignore_index=True,
)

In [8]:
# Persist the merged frame as one tab-separated file
merged_csv_path = "access.csv"
df.to_csv(merged_csv_path, sep="\t")

### Load checkpoint

In [9]:
# Checkpoint: when no `df` exists in the current namespace (e.g. after a kernel
# restart), reload the merged csv instead of parsing the raw log again.
if "df" not in locals():
    # access.csv is written together with its index, so its first column is
    # that index; index_col=0 restores it instead of adding an extra
    # "Unnamed: 0" data column that the in-memory frame does not have.
    df = pd.read_csv("access.csv", sep="\t", index_col=0)

### Adding Country and City to the dataset for each IP

In [58]:
import geoip2.database

# Distinct client addresses, so each one is looked up only once
unique_ips = pd.unique(df["ip"])

def getCityName(names):
    """Pick a display name from a mapping of language code -> city name.

    Args:
        names: dict-like mapping of language codes (e.g. "en") to names.

    Returns:
        "" when the mapping is empty, the "en" entry when present, otherwise
        the first available name in the mapping's iteration order.
    """
    if len(names) == 0:
        return ""
    if "en" in names:
        return names["en"]
    # dict views are not subscriptable on Python 3 (names.keys()[0] raises
    # TypeError), so take the first value through an iterator instead.
    return next(iter(names.values()))

# Look up every distinct address in the GeoLite2 City database. Lookups that
# raise are printed and skipped, so they produce no row.
rows = []
with geoip2.database.Reader('geoip/GeoLite2-City.mmdb') as reader:
    for ip in tqdm(unique_ips, total=len(unique_ips)):
        try:
            geo = reader.city(ip)
            geo_record = {
                "lat": geo.location.latitude,
                "lon": geo.location.longitude,
                "city": getCityName(geo.city.names),
                "country": geo.country.iso_code,
            }
            rows.append(geo_record)
        except Exception as lookup_error:
            print(lookup_error)

# Gather the lookups in their own frame, then release the temporary list
ips = pd.DataFrame(rows, columns=["lat", "lon", "city", "country"])
del rows

  0%|          | 0/5265 [00:00<?, ?it/s]

The address 127.0.0.1 is not in the database.
The address 104.207.73.126 is not in the database.
The address 104.207.73.83 is not in the database.


In [68]:
# Store the number of located IPs per country and the distinct
# (lat, lon, city) triples in output/world.json.
# The payload is built before the file is opened, so a failure while computing
# it cannot leave a truncated or empty json file behind.
countries = ips["country"].value_counts().to_dict()
cities = ips[["lat", "lon", "city"]].drop_duplicates().dropna().values.tolist()

os.makedirs("output", exist_ok=True)  # the target folder may not exist yet
with open("output/world.json", "w") as fp:
    json.dump({"countries": countries, "cities": cities}, fp)


In [14]:
# Keep only the part of each URL that precedes the first "?" (drops the query string)
url_parts = df["url"].str.split("?")
df["url"] = url_parts.str[0]

In [67]:
#


AttributeError: 'numpy.ndarray' object has no attribute 'to_dict'