Skip to content

Commit

Permalink
data-mining-logs: extract documents and messages (#5)
Browse files Browse the repository at this point in the history
* Add Python log_parser.py

Extract the meaningful events from the monitored messages and store them as:

    {
        '<log timestamp>': {json event update}
    }

* Remove prefix parsed from generated file

* data-mining-logs: log parsing + extraction of documents and messages
  • Loading branch information
ryukinix authored Aug 9, 2020
1 parent 7015110 commit a5b8e84
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@

*.log
/kills.txt
*.mypy_cache
*.coverage
5 changes: 5 additions & 0 deletions data-mining-logs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
docs/
token.txt
logs/
.venv
.mypy_cache
105 changes: 105 additions & 0 deletions data-mining-logs/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import json
import os
import telepot # notype
from tqdm import tqdm # notype

# Module-level configuration: runs at import time, so importing this module
# requires a token.txt in the working directory and creates the bot client.
with open("token.txt") as token_file:  # close the handle instead of leaking it
    token = token_file.read().strip()
bot = telepot.Bot(token)
chat_title = "Common Lisp Brasil"  # chat whose history we mine
logs_fpath = "logs/putaria.log.json"  # produced by parser.py
dir_name = os.path.join("docs", chat_title.replace(" ", "_").lower())


def get_title(result):
    """Return the chat title of a getUpdates result entry, or None.

    Missing "message" or "chat" keys fall through to None instead of raising.
    """
    message = result.get("message", {})
    chat = message.get("chat", {})
    return chat.get("title")


def collect_documents(chat_title=chat_title):
    """Collect media attachments posted in *chat_title*.

    Reads the parsed log file (``logs_fpath``) and returns a list of Telegram
    file descriptors (dicts carrying ``file_id`` / ``file_unique_id``) for
    every document, video, voice and photo message.
    """
    doc_types = [
        "document",
        "video",
        "voice",
        "photo",
    ]
    # Fallback MIME types for attachment kinds that arrive without one
    # (photos come as a list of size variants with no mime_type field).
    mime_type = {
        "video": "video/mp4",
        "voice": "audio/ogg",
        "photo": "image/jpg",
    }
    with open(logs_fpath) as f:  # close the handle instead of leaking it
        logs = json.load(f)
    docs = []
    for event in logs.values():  # timestamps are not needed here
        for result in event["result"]:
            if get_title(result) != chat_title:
                continue
            for doc_type in doc_types:
                doc = result["message"].get(doc_type)
                if isinstance(doc, list):
                    doc = doc[-1]  # multiple thumbs, get the best quality
                    # .get avoids a KeyError for types without a fallback
                    doc["mime_type"] = mime_type.get(doc_type)
                if doc:
                    docs.append(doc)

    return docs


def collect_messages(chat_title=chat_title):
    """Collect every message posted in *chat_title*.

    Returns a list of ``{"date": <log timestamp>, "message": <message dict>}``
    entries read from the parsed log file (``logs_fpath``).
    """
    with open(logs_fpath) as f:  # close the handle instead of leaking it
        logs = json.load(f)
    docs = []
    for timestamp, event in logs.items():
        for result in event["result"]:
            if get_title(result) == chat_title:
                doc = result["message"]
                if doc:
                    docs.append({
                        "date": timestamp,
                        "message": doc,
                    })
    return docs


def dump_messages(messages):
    """Write *messages* (sorted by date) as plain-text lines into
    ``<dir_name>/messages.txt``.

    Entries without a ``text`` payload (media-only messages) are skipped.
    Newlines inside a message are flattened to spaces so each message
    occupies exactly one line.
    """
    out_path = os.path.join(dir_name, "messages.txt")
    with open(out_path, "w") as out:
        for entry in sorted(messages, key=lambda e: e["date"]):
            payload = entry["message"]
            text = payload.get("text")
            if not text:
                continue
            sender = payload["from"]["first_name"]
            line = f"{entry['date']} / {sender}: {text}".replace("\n", " ")
            out.write(line + "\n")


def download_document(doc, dir_name=dir_name):
    """Download one Telegram file descriptor into *dir_name*.

    Files are grouped in a sub-folder named after the MIME major type
    ("video", "audio", ...; "raw" when unknown) and named
    ``<file_unique_id><extension>`` so a re-run skips already-downloaded
    files.  Errors are printed and swallowed so one bad file does not abort
    a bulk download.
    """
    fname = "<unknown>"  # ensure the error messages below always have a name
    try:
        mime_type = doc.get("mime_type")
        extension = ".raw"
        if mime_type:
            extension = mime_type.replace("/", ".")
        elif doc.get("file_name"):
            # NOTE(review): this appends the whole original file name, not
            # just its suffix — kept as-is to preserve existing file names.
            extension = doc["file_name"]
        fname = doc["file_unique_id"] + extension
        # BUG FIX: mime_type may be None here (file_name branch above); the
        # original crashed on None.split("/").  Fall back to a "raw" folder.
        folder = mime_type.split("/")[0] if mime_type else "raw"
        dir_path = os.path.join(dir_name, folder)
        os.makedirs(dir_path, exist_ok=True)
        fpath = os.path.join(dir_path, fname)
        if not os.path.exists(fpath):
            bot.download_file(doc["file_id"], fpath)
    except telepot.exception.TelegramError as e:
        print(f"Telegram exception for {fname}: {e}")
    except Exception as e:
        print(f"Python exception, I screw up: {e}")


def download_documents(docs):
    """Download every collected document, showing a tqdm progress bar."""
    progress = tqdm(docs)
    for document in progress:
        download_document(document, dir_name)


if __name__ == "__main__":
os.makedirs(dir_name, exist_ok=True)
print(f"-- Collecting documents in: {dir_name}")
docs = collect_documents()
download_documents(docs)
print(f"-- Collecting messages in: {dir_name}")
messages = collect_messages()
dump_messages(messages)
46 changes: 46 additions & 0 deletions data-mining-logs/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
import re
import enum
import sys
import json

# Any line beginning with a YYYY/MM/DD date terminates payload accumulation.
log_end_regex = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).*")
# A line that starts a new getUpdates response: group(1) captures the leading
# timestamp prefix, group(2) the beginning of the JSON payload ("{...").
log_start_regex = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).getUpdates.resp: ({.*)")

class ParsingState(enum.Enum):
    """State of the log scanner used by parse(): ``start`` while accumulating
    a (possibly multi-line) JSON payload, ``stop`` between payloads."""
    stop = 0
    start = 1


def parse(fpath: str) -> dict:
    """Parse a telepot log file into ``{timestamp: decoded update dict}``.

    An entry starts on a line like ``YYYY/MM/DD ... getUpdates resp: {...``;
    its JSON payload may continue across following lines until the next
    timestamped line.  Entries whose payload is malformed JSON, empty, or
    has an empty ``"result"`` are discarded.  (The original implementation
    deleted such entries and then immediately re-inserted them via the
    unconditional ``logs[k] = j``, so they leaked into the output.)  The
    returned dict is ordered by timestamp key.
    """
    # Compiled locally so the function is self-contained; the re module
    # caches compilations, so repeated calls cost nothing extra.
    start_re = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).getUpdates.resp: ({.*)")
    end_re = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).*")

    raw = {}
    collecting = False
    key = ""
    with open(fpath) as f:
        for line in f:  # stream the file instead of readlines()
            start = start_re.match(line)
            if start:
                key = start.group(1)
                raw[key] = start.group(2)
                collecting = True
            elif end_re.match(line):
                # Any other timestamped line ends the current payload.
                collecting = False
            elif collecting:
                raw[key] += line

    logs = {}
    for timestamp in sorted(raw):  # sort by timestamp, not by payload text
        try:
            payload = json.loads(raw[timestamp])
        except ValueError:
            continue  # truncated or malformed payload: drop it
        if payload and payload.get("result"):
            logs[timestamp] = payload
    return logs


if __name__ == '__main__':
    # Usage: parser.py <logfile> — writes the parsed result to <logfile>.json.
    fname = sys.argv[1]
    logs = parse(fname)
    # Context manager guarantees the output file is flushed and closed;
    # the original left the handle from open(...) dangling.
    with open(fname + ".json", "w") as out:
        json.dump(logs, out)
2 changes: 2 additions & 0 deletions data-mining-logs/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
telepot
tqdm

0 comments on commit a5b8e84

Please sign in to comment.