# Parser for Apache access log

In [165]:
import pandas as pd
from apachelogs import LogParser
from user_agents import parse
from crawlerdetect import CrawlerDetect
from tqdm.notebook import tqdm
import time
import os, glob
import json

# Shared crawler detector, reused for every parsed log line.
crawler_detect = CrawlerDetect()

# OS family names that the parsing cell collapses into a single "Linux" label.
linux_distros = [
    "Ubuntu",
    "Debian",
    "Solaris",
    "Gentoo",
    "OpenBSD",
    "SUSE",
    "FreeBSD",
    "Fedora",
    "Red Hat",
]

### Parsing each line and adding it to df

I'm going to parse each line and build a pandas DataFrame to work with the data.
To keep memory usage down, I will flush the accumulated rows to a temporary CSV file every 1M rows, and then combine these CSV files into a single large one.

In [200]:
# Parse access.log line by line into flat dict rows and spill them to
# tab-separated chunk files (access_1.csv, access_2.csv, ...) so that the
# whole log never has to sit in memory at once.

#defining the parser params (this format string is Apache's "combined" log format)
parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"")

cols = ["ip", "date", "time", "weekday", "method", "code", "size", "url", "http_v", "bot", "browser", "os", "device", "useragent"]
CHUNK_SIZE = 1000000 #rows buffered in memory before flushing to a chunk csv
rows = []
counter = 1 #for csv name
with open("access.log", "r") as fp:
    total = sum(1 for line in fp) #calculate numb of rows for the progressbar
    fp.seek(0) #bring the pointer back to the first line
    for row in tqdm(parser.parse_lines(fp), total=total):
        df_row = {}
        df_row["ip"] = row.remote_host
        request_time = row.request_time
        df_row["date"] = request_time.strftime("%d/%m/%Y")
        df_row["time"] = request_time.strftime("%H:%M:%S")
        df_row["weekday"] = request_time.weekday()
        df_row["code"] = row.final_status
        df_row["size"] = row.bytes_sent

        # NOTE(review): request_line is presumably None when the log holds "-"
        # for the request; fall back to "" so split() cannot fail. Confirm
        # against the apachelogs documentation.
        req = (row.request_line or "").split(" ")
        if len(req) == 3:
            df_row["method"] = req[0]
            df_row["url"] = req[1].split('?')[0] #drop the query string
            df_row["http_v"] = req[2]
        else:
            # Anything that is not "<method> <url> <version>" is recorded as unknown.
            df_row["method"] = None
            df_row["url"] = None
            df_row["http_v"] = None

        if "User-Agent" in row.headers_in and row.headers_in["User-Agent"] is not None:
            df_row["bot"] = crawler_detect.isCrawler(row.headers_in["User-Agent"])

            # browser/os/device are only filled for non-bot traffic; for bots
            # the keys stay absent and become empty cells in the DataFrame.
            if df_row["bot"] is False:
                ua = parse(row.headers_in["User-Agent"])
                df_row["browser"] = ua.browser.family

                reqos = ua.os.family
                # This must be one if/elif/else chain: with two separate "if"
                # statements the final "else" overwrote the "Linux" label with
                # the raw distro name.
                if reqos in linux_distros:
                    df_row["os"] = "Linux"
                elif reqos in ["Mac OS", "Mac OS X", "macOS"]:
                    df_row["os"] = "macOS"
                else:
                    df_row["os"] = reqos

                if ua.is_pc or df_row["os"] == "Windows":
                    df_row["device"] = "Desktop"
                elif ua.is_mobile:
                    df_row["device"] = "Smartphone"
                elif ua.is_tablet:
                    df_row["device"] = "Tablet"
                else:
                    df_row["device"] = "Unknown"

            df_row["useragent"] = row.headers_in["User-Agent"]

        else:
            df_row["useragent"] = None
        rows.append(df_row)

        #once the buffer is full, flush to csv and free up memory
        if len(rows) == CHUNK_SIZE:
            df = pd.DataFrame(rows, columns=cols)
            df.to_csv("access_"+str(counter)+".csv", sep="\t")
            counter = counter+1
            rows = []

  0%|          | 0/594168 [00:00<?, ?it/s]

In [201]:
# Write out the rows still buffered after the parsing loop's last flush,
# then advance the chunk counter and reset the buffer.
df = pd.DataFrame(rows, columns=cols)
df.to_csv(f"access_{counter}.csv", sep="\t")
counter += 1
rows = []

In [202]:
#importing all csv file to create a single one
# glob returns paths in arbitrary order; sorting by (length, name) gives
# access_1, access_2, ..., access_10 so chunks are combined in write order.
files = sorted(glob.glob("access_*.csv"), key=lambda path: (len(path), path))
# ignore_index=True gives the combined frame one unique 0..N-1 index instead
# of repeating each chunk's own row labels.
df = pd.concat((pd.read_csv(f, sep="\t") for f in files), ignore_index=True)

In [203]:
#exporting all csv in a single file
# index=False: the row index carries no information here, and writing it would
# add one more unnamed column every time access.csv is read back in.
df.to_csv("access.csv", sep="\t", index=False)

### Load checkpoint

In [81]:
# Reload the combined CSV only when no `df` exists yet in the notebook
# namespace, so the parsing cells above can be skipped on a fresh kernel.
try:
    df
except NameError:
    df = pd.read_csv("access.csv", sep="\t")

### Adding Country and City to the dataset for each IP

In [190]:
import geoip2.database

# Distinct client addresses, so each IP is looked up only once.
unique_ips = df.loc[:, "ip"].unique()

def getCityName(names):
    """Pick a display name from a mapping of locale code -> city name.

    Parameters:
        names: mapping such as {"en": "Rome", "it": "Roma"}; may be empty or None.

    Returns:
        The "en" entry when present, otherwise the first available name,
        or "" when there are no names at all.
    """
    if not names:
        return ""
    if "en" in names:
        return names["en"]
    # dict views are not subscriptable on Python 3, so names.keys()[0] raised
    # TypeError; take the first value through an iterator instead.
    return next(iter(names.values()))

# Look up every unique IP in the GeoLite2 city database. Addresses whose
# lookup raises are reported with print() and skipped, so `ips` can end up
# with fewer rows than `unique_ips`.
rows = []
with geoip2.database.Reader('geoip/GeoLite2-City.mmdb') as reader:
    for ip in tqdm(unique_ips, total=len(unique_ips)):
        try:
            info = reader.city(ip)
            location = info.location
            record = {
                "lat": location.latitude,
                "lon": location.longitude,
                "city": getCityName(info.city.names),
                "country": info.country.iso_code,
            }
            rows.append(record)
        except Exception as err:
            print(err)

# One row per successfully resolved IP.
ips = pd.DataFrame(rows, columns=["lat", "lon", "city", "country"])
del rows

  0%|          | 0/5265 [00:00<?, ?it/s]

The address 127.0.0.1 is not in the database.
The address 104.207.73.126 is not in the database.
The address 104.207.73.83 is not in the database.


In [196]:
# Write output/world.json: for each country code, the value_counts() count of
# rows in `ips` plus the distinct [lat, lon, city] entries seen for it.
with open("output/world.json", "w+") as fp:
    countries = ips["country"].value_counts().to_dict()
    cities = ips.drop_duplicates().dropna().values.tolist()
    for city in cities:
        iso_code = city[3]
        entry = countries[iso_code]
        # The first time a country is seen, its plain count is replaced by a
        # dict holding that count and an (initially empty) list of cities.
        if type(entry) is int:
            entry = {"reqs": entry, "cities": []}
            countries[iso_code] = entry
        entry["cities"].append(city[:-1])
    json.dump({"countries": countries}, fp)


In [225]:
#distribution by hour and weekday
# dist maps weekday (0-6) -> list of 24 request counts, one per hour "00".."23".
# The "HH" prefix of the time column is extracted once here instead of being
# re-sliced over the whole frame for each of the 7*24 combinations.
hour_of_day = df["time"].str[0:2]
dist = {}
for wd in tqdm(range(7)):
    per_hour = hour_of_day[df["weekday"] == wd].value_counts()
    # Hours with no requests are absent from value_counts(); report them as 0.
    dist[wd] = [int(per_hour.get(str(hh).zfill(2), 0)) for hh in range(24)]

# Open the file only after the counts exist, so a failure above cannot leave
# a truncated dist.json behind.
with open("output/dist.json", "w+") as fp:
    json.dump(dist,fp)

  0%|          | 0/7 [00:00<?, ?it/s]

In [226]:
# Display the weekday -> per-hour request counts built in the previous cell.
dist

{0: [1830,
  2136,
  2150,
  2282,
  1861,
  2202,
  2414,
  1968,
  2246,
  2982,
  3500,
  3345,
  2999,
  3628,
  7001,
  10643,
  11215,
  7834,
  12437,
  9686,
  8980,
  5737,
  3122,
  2176],
 1: [2285,
  2397,
  3220,
  2195,
  1986,
  1849,
  1871,
  2852,
  5834,
  7374,
  12617,
  18756,
  15111,
  7379,
  8620,
  9690,
  11964,
  9817,
  6849,
  4750,
  8755,
  5449,
  4240,
  1572],
 2: [1066,
  1424,
  2705,
  629,
  913,
  1021,
  1080,
  890,
  3315,
  7388,
  6800,
  7359,
  8004,
  7973,
  8562,
  14586,
  4183,
  4698,
  4448,
  2126,
  3096,
  4201,
  2591,
  1640],
 3: [1503,
  1142,
  1614,
  898,
  2376,
  1244,
  1107,
  2140,
  2310,
  5338,
  6282,
  4288,
  2974,
  2918,
  8060,
  8040,
  5659,
  3129,
  3562,
  3372,
  3546,
  3163,
  2490,
  1596],
 4: [650,
  863,
  1308,
  1088,
  916,
  777,
  827,
  1093,
  2045,
  5857,
  2167,
  4363,
  2877,
  3219,
  5316,
  3896,
  2614,
  2402,
  2625,
  2092,
  2640,
  1982,
  1724,
  2390],
 5: [1490,
  600,
  1