-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
data-mining-logs: extract documents and messages (#5)
* Add Python log_parser.py: get the meaningful events for monitoring messages and store them as { '<log timestamp>': {json event update} }
* Remove the parsed prefix from the generated file
* data-mining-logs: log parsing + extraction of documents and messages
- Loading branch information
Showing
5 changed files
with
160 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,3 +18,5 @@ | |
|
||
*.log | ||
/kills.txt | ||
*.mypy_cache | ||
*.coverage |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
docs/ | ||
token.txt | ||
logs/ | ||
.venv | ||
.mypy_cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
import json | ||
import os | ||
import telepot # notype | ||
from tqdm import tqdm # notype | ||
|
||
# Module-level configuration shared by every function below.
# The bot token is read from token.txt; a context manager is used so the
# file handle is closed as soon as the token has been read.
with open("token.txt") as _token_file:
    token = _token_file.read().strip()
bot = telepot.Bot(token)
# Title of the chat whose documents and messages are extracted.
chat_title = "Common Lisp Brasil"
# JSON file holding the parsed updates, keyed by log timestamp.
logs_fpath = "logs/putaria.log.json"
# Output directory derived from the chat title: docs/common_lisp_brasil
dir_name = os.path.join("docs", chat_title.replace(" ", "_").lower())
|
||
|
||
def get_title(result):
    """Return the chat title of an update, or None when it has none.

    Looks up ``result["message"]["chat"]["title"]``, treating a missing
    "message" or "chat" key as an empty dict.
    """
    message = result.get("message", {})
    chat = message.get("chat", {})
    return chat.get("title")
|
||
|
||
def collect_documents(chat_title=chat_title):
    """Collect the downloadable attachments posted in one chat.

    Reads the parsed log at ``logs_fpath`` and, for every update whose chat
    title equals ``chat_title``, gathers the "document", "video", "voice"
    and "photo" entries of the message.

    Args:
        chat_title: Title of the chat to extract attachments from.

    Returns:
        A list of attachment dicts taken from the log. An attachment that
        has no "mime_type" gets a fallback one when its kind is known.
    """
    doc_types = ("document", "video", "voice", "photo")
    # Fallback MIME types, applied only when the attachment lacks one.
    # download_document derives the target folder from "mime_type".
    fallback_mime_type = {
        "video": "video/mp4",
        "voice": "audio/ogg",
        "photo": "image/jpg",
    }
    # Context manager so the log file handle is closed after loading.
    with open(logs_fpath) as log_file:
        logs = json.load(log_file)

    docs = []
    for event in logs.values():
        for result in event["result"]:
            if get_title(result) != chat_title:
                continue
            message = result.get("message") or {}
            for doc_type in doc_types:
                doc = message.get(doc_type)
                if isinstance(doc, list):
                    if not doc:
                        # An empty list has no entry to pick.
                        continue
                    # Multiple thumbs: the last entry is taken as the best
                    # quality one.
                    doc = doc[-1]
                if not doc:
                    continue
                if not doc.get("mime_type") and doc_type in fallback_mime_type:
                    doc["mime_type"] = fallback_mime_type[doc_type]
                docs.append(doc)
    return docs
|
||
|
||
def collect_messages(chat_title=chat_title):
    """Collect every message of one chat together with its log timestamp.

    Reads the parsed log at ``logs_fpath`` and keeps the "message" object of
    each update whose chat title equals ``chat_title``.

    Args:
        chat_title: Title of the chat to extract messages from.

    Returns:
        A list of ``{"date": <log timestamp>, "message": <message dict>}``
        entries, in the order they appear in the log file.
    """
    # Context manager so the log file handle is closed after loading.
    with open(logs_fpath) as log_file:
        logs = json.load(log_file)

    docs = []
    for timestamp, event in logs.items():
        for result in event["result"]:
            if get_title(result) != chat_title:
                continue
            # .get() avoids a KeyError for updates without a "message"
            # (reachable when chat_title is None).
            message = result.get("message")
            if message:
                docs.append({"date": timestamp, "message": message})
    return docs
|
||
|
||
def dump_messages(messages):
    """Write the text messages to ``<dir_name>/messages.txt``, one per line.

    Messages are sorted by their "date" key and written as
    ``<date> / <first name>: <text>`` with embedded newlines replaced by
    spaces. Messages without text are skipped.

    Args:
        messages: Iterable of ``{"date": ..., "message": ...}`` dicts, as
            produced by ``collect_messages``.
    """
    out_path = os.path.join(dir_name, "messages.txt")
    # Explicit UTF-8: chat text may contain characters the platform's
    # default encoding cannot represent.
    with open(out_path, "w", encoding="utf-8") as f:
        for entry in sorted(messages, key=lambda x: x["date"]):
            payload = entry["message"]
            text = payload.get("text")
            if not text:
                continue
            date = entry["date"]
            # A message without a sender must not abort the whole dump.
            username = payload.get("from", {}).get("first_name", "<unknown>")
            line = f"{date} / {username}: {text}".replace("\n", " ")
            f.write(line + "\n")
|
||
|
||
def download_document(doc, dir_name=dir_name):
    """Download one attachment into a per-type sub-folder of ``dir_name``.

    The file is stored as ``<dir_name>/<folder>/<file_unique_id><extension>``
    where, for an attachment with a "mime_type", ``folder`` is the part of
    the MIME type before the slash and ``extension`` is the MIME type with
    the slash replaced by a dot. Without a "mime_type" the "file_name" (or
    ".raw") is used as the extension and the file goes into a "raw" folder.
    A file that already exists is not downloaded again.

    Errors are reported on stdout and never raised, so one failing
    attachment does not stop a batch download.

    Args:
        doc: Attachment dict with at least "file_unique_id" and "file_id".
        dir_name: Root directory for the downloaded files.
    """
    fname = "<unknown>"
    try:
        mime_type = doc.get("mime_type")
        if mime_type:
            extension = mime_type.replace("/", ".")
            folder = mime_type.split("/")[0]
        else:
            # Previously a missing MIME type always failed on
            # ``None.split`` and the attachment was never downloaded.
            extension = doc.get("file_name") or ".raw"
            folder = "raw"
        fname = doc["file_unique_id"] + extension
        dir_path = os.path.join(dir_name, folder)
        os.makedirs(dir_path, exist_ok=True)
        fpath = os.path.join(dir_path, fname)
        if not os.path.exists(fpath):
            bot.download_file(doc["file_id"], fpath)
    except telepot.exception.TelegramError as e:
        print(f"Telegram exception for {fname}: {e}")
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on.
        print(f"Python exception, I screw up: {e}")
|
||
|
||
def download_documents(docs):
    """Download every attachment in ``docs`` while showing a progress bar.

    Each item is handed to ``download_document`` together with the
    module-level ``dir_name``.
    """
    progress = tqdm(docs)
    for item in progress:
        download_document(item, dir_name)
|
||
|
||
if __name__ == "__main__":
    # Make sure the output directory exists before anything is written.
    os.makedirs(dir_name, exist_ok=True)

    # Step 1: attachments.
    print(f"-- Collecting documents in: {dir_name}")
    download_documents(collect_documents())

    # Step 2: text messages.
    print(f"-- Collecting messages in: {dir_name}")
    dump_messages(collect_messages())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/usr/bin/env python3 | ||
import re | ||
import enum | ||
import sys | ||
import json | ||
|
||
# Any line that starts with a YYYY/MM/DD date begins a new log record and
# therefore ends a multi-line getUpdates payload.
log_end_regex = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).*")
# A getUpdates response record: group 1 is the timestamp prefix, group 2 is
# the start of the JSON payload (which may continue on the following lines).
log_start_regex = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).getUpdates.resp: ({.*)")


class ParsingState(enum.Enum):
    """Whether the parser is currently inside a getUpdates JSON payload."""

    stop = 0
    start = 1


def parse(fpath: str):
    """Extract the non-empty getUpdates responses from a log file.

    Args:
        fpath: Path of the log file to read.

    Returns:
        A dict mapping the timestamp prefix of each getUpdates response to
        its decoded JSON object. Responses whose payload is not valid JSON
        or whose "result" is missing or empty are left out. When two
        responses share a timestamp, the later one wins.
    """
    raw_events = {}
    state = ParsingState.stop
    key = ""
    with open(fpath) as f:
        # Iterate the file lazily instead of loading every line at once.
        for line in f:
            match = log_start_regex.match(line)
            if match:
                key = match.group(1)
                raw_events[key] = match.group(2)
                state = ParsingState.start
            elif log_end_regex.match(line):
                state = ParsingState.stop
            elif state is ParsingState.start:
                # Continuation line of a multi-line JSON payload.
                raw_events[key] += line

    logs = {}
    for key, raw in raw_events.items():
        try:
            event = json.loads(raw)
        except (ValueError, RecursionError):
            # Truncated or otherwise malformed payload: skip it.
            continue
        # Previously an empty response was deleted and then immediately
        # re-inserted, so it was never filtered out.
        if isinstance(event, dict) and event.get("result"):
            logs[key] = event
    return logs
|
||
|
||
if __name__ == '__main__':
    # Usage: <script> <log file>; writes the result to "<log file>.json".
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <log file>")
    fname = sys.argv[1]
    logs = parse(fname)
    # Context manager so the output is flushed and closed deterministically.
    with open(fname + ".json", "w") as out_file:
        json.dump(logs, out_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
telepot | ||
tqdm |