In [None]:
# Equivalent grep "error" Apache_2k.log
with open("data/Apache_2k.log", "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        if "error" in line:
            print(line, end="")

In [None]:
# Equivalent to grep -i "error" Apache_2k.log
with open("data/Apache_2k.log", "r", encoding="utf-8") as f:
    for line in f:
        if "error" in line.lower():
            print(line)

In [None]:
# Equivalent grep -E "\[error\]" Apache_2k.log
import re

pattern = re.compile(r"\[error\]")
with open("data/Apache_2k.log", "r", encoding="utf-8") as f:
    for line in f:
        if pattern.search(line):
            print(line, end="")

In [None]:
# grep -n (line numbers)

with open("data/Apache_2k.log", "r", encoding="utf-8") as log_file:
    for i, line in enumerate(log_file, start=1):
        """
        Return an enumerate object.
        iterable an object supporting iteration
        The enumerate object yields pairs containing a count (from start, which defaults to zero) and a value yielded by the iterable argument
        """
        if "error" in line:
            print(f"{i}:\t{line}", end="\n")

In [None]:
# grep "error" file.log | wc -l
count = 0
with open("data/Apache_2k.log", "r", encoding="utf-8") as log_file:
    for line in log_file:
        if "error" in line:
            count += 1
count

 In the above similar to grep the snippet counts lines where "errors" shows. However, the snippet has error two times in some cases.
Creating an alternative to count for how many times the "error" shows per line
log line: [Mon Dec 05 18:56:04 2005] [error] mod_jk child workerEnv in error state 6

```bash
 grep "error" Apache_2k.log | wc -l
 595
```



In [None]:
count = 0
with open("data/Apache_2k.log", "r", encoding="utf-8") as log_file:
    for line in log_file:
        count += line.count("error")
        """
        count() Return the number of non-overlapping occurrences of substring sub in string S[start:end]
        """
count

In [None]:

import json
"""
JSONL (JSON Lines) is a text format where each line is a separate, valid JSON object, ideal for streaming and large datasets because it avoids wrapping everything in a single array, making it easy to process one record at a time, even in compressed files.


Input:
"[Sun Dec 04 04:47:44 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties"

output:
{"timestamp": "Sun Dec 04 04:47:44 2005", "level": "notice", "message": "workerEnv.init() ok /etc/httpd/conf/workers2.properties"}

## jq example on output file
# head -n1 Apache_2k.jsonl | jq .level
"notice"

# head -n1 Apache_2k.jsonl | jq .message
"workerEnv.init() ok /etc/httpd/conf/workers2.properties"

# head -n1 Apache_2k.jsonl | jq .
{
  "timestamp": "Sun Dec 04 04:47:44 2005",
  "level": "notice",
  "message": "workerEnv.init() ok /etc/httpd/conf/workers2.properties"
}
"""

with open("data/Apache_2k.log", "r", encoding="utf-8") as log_file, \
     open("data/Apache_2k.jsonl", "w", encoding="utf-8") as json_file:
    for line in log_file:
        try:
            log_fields = line.split("] ")
            event = {
                "timestamp": log_fields[0].strip("["),
                "level": log_fields[1].strip("["),
                "message": log_fields[2].strip("[").strip("\n")
            }
            json_file.write(json.dumps(event, ensure_ascii=False) + "\n")
        except (IndexError,TypeError, ValueError) as exec:
            print(exec)



In [None]:
a = "file"

",".join(["a","b", str("new string "  + a)])

In [None]:
"/usr/bin/python".split("/")[-1]


In [41]:
#top 5 urls from log

import re
import json
count_urls_occurrences = {}

pattern = re.compile(r"\/10.10")
try:
    with open("data/Zookeeper_2k.log", "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            unique_url =  line.split("-")[-1]
            if unique_url:
                if pattern.search(unique_url):
                    if count_urls_occurrences.get(unique_url):
                        count_urls_occurrences[unique_url] = count_urls_occurrences[unique_url] + 1
                    else:
                        count_urls_occurrences[unique_url] = 1
        sorted_counters = dict(sorted(count_urls_occurrences.items(), key=lambda kv: kv[1], reverse=True))
        print(json.dumps(sorted_counters, indent=4)) # {'10.10.0.3': 2, '10.10.0.1': 4}
except FileExistsError as e:
    print(e)

{
    " Cannot open channel to 2 at election address /10.10.34.12:3888\n": 48,
    " Cannot open channel to 3 at election address /10.10.34.13:3888\n": 38,
    " Received connection request /10.10.34.12:48811\n": 2,
    " Received connection request /10.10.34.11:45307\n": 1,
    " Received connection request /10.10.34.13:57707\n": 1,
    " Received connection request /10.10.34.11:45382\n": 1,
    " Received connection request /10.10.34.11:45440\n": 1,
    " Received connection request /10.10.34.13:57895\n": 1,
    " Received connection request /10.10.34.12:47727\n": 1,
    " Received connection request /10.10.34.13:58035\n": 1,
    " Received connection request /10.10.34.12:47838\n": 1,
    " Received connection request /10.10.34.13:58116\n": 1,
    " Received connection request /10.10.34.11:45957\n": 1,
    " Received connection request /10.10.34.13:58303\n": 1,
    " Received connection request /10.10.34.12:48096\n": 1,
    " Received connection request /10.10.34.12:48141\n": 1,
    

In [33]:
count_urls_occurrences

{' Received connection request /10.10.34.11:45307\n': 1,
 ' Received connection request /10.10.34.13:57707\n': 1,
 ' Received connection request /10.10.34.11:45382\n': 1,
 ' Received connection request /10.10.34.11:45440\n': 1,
 ' Received connection request /10.10.34.13:57895\n': 1,
 ' Received connection request /10.10.34.12:47727\n': 1,
 ' Received connection request /10.10.34.13:58035\n': 1,
 ' Received connection request /10.10.34.12:47838\n': 1,
 ' Received connection request /10.10.34.13:58116\n': 1,
 ' Received connection request /10.10.34.11:45957\n': 1,
 ' Received connection request /10.10.34.13:58303\n': 1,
 ' Received connection request /10.10.34.12:48096\n': 1,
 ' Received connection request /10.10.34.12:48141\n': 1,
 ' Received connection request /10.10.34.13:58421\n': 1,
 ' Received connection request /10.10.34.11:46128\n': 1,
 ' Received connection request /10.10.34.13:58452\n': 1,
 ' Received connection request /10.10.34.11:46173\n': 1,
 ' Received connection request 

In [39]:
# ascending by count
import json
sorted_counters = dict(sorted(count_urls_occurrences.items(), key=lambda kv: kv[1], reverse=True))
print(json.dumps(sorted_counters, indent=4)) # {'10.10.0.3': 2, '10.10.0.1': 4}

# descending by count
# sorted_counters_desc = dict(sorted(counters.items(), key=lambda kv: kv[1], reverse=True))
# print(sorted_counters_desc)  # {'10.10.0.1': 4, '10.10.0.3': 2}


{
    " Cannot open channel to 2 at election address /10.10.34.12:3888\n": 48,
    " Cannot open channel to 3 at election address /10.10.34.13:3888\n": 38,
    " Received connection request /10.10.34.12:48811\n": 2,
    " Received connection request /10.10.34.11:45307\n": 1,
    " Received connection request /10.10.34.13:57707\n": 1,
    " Received connection request /10.10.34.11:45382\n": 1,
    " Received connection request /10.10.34.11:45440\n": 1,
    " Received connection request /10.10.34.13:57895\n": 1,
    " Received connection request /10.10.34.12:47727\n": 1,
    " Received connection request /10.10.34.13:58035\n": 1,
    " Received connection request /10.10.34.12:47838\n": 1,
    " Received connection request /10.10.34.13:58116\n": 1,
    " Received connection request /10.10.34.11:45957\n": 1,
    " Received connection request /10.10.34.13:58303\n": 1,
    " Received connection request /10.10.34.12:48096\n": 1,
    " Received connection request /10.10.34.12:48141\n": 1,
    