Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a remote wordlist updater (rebase) #1005

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions .bin/checkers/check-if-auto-updated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3

import os,sys,json

# Exit quietly when no file list was passed on the command line.
# BUGFIX: the original read sys.argv[1] unconditionally, which raises
# IndexError when the script is invoked with no arguments at all.
if len(sys.argv) < 2 or not sys.argv[1]:
    exit(0)

# When run under the caller/wrapper script (env var set to "1"), output
# switches to the machine-readable "E,<file>,<line>" record format.
IS_WRAPPED = False

if "IS_RUNNING_UNDER_CALLER_SCRIPT" in os.environ:
    IS_WRAPPED = os.environ['IS_RUNNING_UNDER_CALLER_SCRIPT'] == "1"

def print_normal(msg):
    """Print *msg* in normal (standalone) mode; wrapped mode stays silent."""
    if not IS_WRAPPED:
        print(msg)

def print_err(file, line_number):
    """In wrapped mode, emit a machine-readable error record for *file*."""
    if not IS_WRAPPED:
        return
    print("E,%s,%s" % (file, line_number))

def print_warn(file, line_number):
    """In wrapped mode, emit a machine-readable warning record for *file*."""
    if not IS_WRAPPED:
        return
    print("W,%s,%s" % (file, line_number))

# Banner: human-readable line in standalone mode, plain header in wrapped mode.
print_normal("[+] Remote wordlist overwrite check")
if IS_WRAPPED:
    print("Remote wordlist overwrite check")
    print("Files that the script catches will be overwritten next update.")

# Changed files arrive as a single space-separated string in argv[1].
# NOTE(review): a file name containing a space would be split apart here.
files=sys.argv[1].split(" ")

# Abort immediately (exit 2) if any listed file does not exist on disk.
for i in files:
    if not os.path.isfile(i):
        print_err(i,0)
        print_normal("[!] %s does not exist!"%(i))
        exit(2)

overall_pass_status=True

# Collect every path the remote updater will overwrite: each source's
# "output" plus any "additional_paths" (see .bin/wordlist-updaters/README.md).
sources = json.load(open(".bin/wordlist-updaters/sources.json"))
overwritten_paths = {
    "dirs": [],
    "files": []
}

for source in sources:
    found_paths = []

    if "output" in source.keys():
        found_paths.append(source["output"])

    if "additional_paths" in source.keys():
        found_paths += source["additional_paths"]

    # Classify each path by what currently exists on disk; paths that are
    # neither an existing file nor an existing directory are ignored.
    for path in found_paths:

        if os.path.isdir(path):
            overwritten_paths["dirs"].append(path)

        elif os.path.isfile(path):
            overwritten_paths["files"].append(path)

# Flag any changed file that lives under an auto-updated directory or that
# exactly matches an auto-updated file.
for i in files:

    # Prefix match; dir entries in sources.json end with "/" so this should
    # not match sibling paths — TODO confirm that stays true for new sources.
    for dir_path in overwritten_paths["dirs"]:
        if i.startswith(dir_path):
            print_normal(f"[!] Warning: file {i} is in a directory that will get overwritten!")
            print_err(i, 0)
            overall_pass_status=False
            break

    for file_path in overwritten_paths["files"]:
        if i == file_path:
            print_normal(f"[!] Warning: file {i} will get overwritten!")
            print_err(i, 0)
            overall_pass_status=False
            break

if overall_pass_status:
    print_normal("[+] All files passed overwrite checks")
    exit(0)

print_normal("[!] Warning: One or more files failed to pass the overwrite checks")

# Wrapped mode reports findings through the E,... records and exits 0 so the
# wrapper decides overall status; standalone mode fails the check with exit 2.
if IS_WRAPPED:
    exit(0)
else:
    exit(2)
4 changes: 1 addition & 3 deletions .bin/trickest-patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
shutil.copytree(path,OUTPUT_ROBOTS,dirs_exist_ok=True)

print("[+] Copied all the files")

for i in [OUTPUT_ROBOTS,OUTPUT_TECHNOLOGIES]:
for root,_,file_list in os.walk(i):
for file in file_list:
Expand All @@ -64,6 +65,3 @@

if len(contents)!=len(patch_content):
open(path,"wb").write(b"\n".join(patch_content))



12 changes: 0 additions & 12 deletions .bin/trickest-updater.sh

This file was deleted.

56 changes: 56 additions & 0 deletions .bin/wordlist-updaters/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Wordlist updaters

## Overview
The purpose of these scripts is to update wordlists from the remote sources defined in `sources.json`.

A GitHub Action checks every hour whether each task's update conditions are met, and updates the corresponding wordlists accordingly.

`status.json` is maintained automatically and is not meant to be edited in a PR.

## Format

Example sources.json

```json
[
{
"name": "Jwt secrets update",
"type": "file",
"source": "https://raw.githubusercontent.com/wallarm/jwt-secrets/master/jwt.secrets.list",
"output": "Passwords/scraped-JWT-secrets.txt",
"post_run_script": "",
"frequency": "3h"
}
]
```

All fields are required unless otherwise stated.

`name` is the name of the task.

`type` can be one of the following: `file, git_dir`.

`source` specify the remote location. If type is `git_dir`, the folder at that location will be cloned using git.

`frequency` is the update frequency. The script uses the `status.json` file to know when to update. Accepted units of time are `h`/`H` for hours and `d`/`D` for days. A frequency may be given as days only, hours only, or both — but days must come before hours (e.g. `1d6h` is valid, `6h1d` is not).

`update_time` specifies the daily frequency in utc 24 hour syntax (0300). Only one update frequency field can be set at a time. (`frequency` or `update_time`)

`output` is the output file/dir the script will put the output in.

`post_run_script` is the script to be run after pulling the list successfully. This field is optional.

`additional_paths` is the additional paths that the workflow script should alert if there is a pull request for the file. This field is optional and won't be used for the updater, but rather the checker.

- - -

Example status.json

```json
{
"Jwt secrets update": {
"last_update" : 0
}
}
```

22 changes: 22 additions & 0 deletions .bin/wordlist-updaters/sources.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[
{
"name": "Jwt secrets update",
"type": "file",
"source": "https://raw.githubusercontent.com/wallarm/jwt-secrets/master/jwt.secrets.list",
"output": "Passwords/scraped-JWT-secrets.txt",
"post_run_script": "",
"frequency": "6h"
},
{
"name": "Trickest wordlist update",
"type": "git_dir",
"source": "https://github.com/trickest/wordlists.git",
"output": ".working_space",
"post_run_script": ".bin/trickest-patcher.py",
"update_time": "1030",
"additional_paths": [
"Discovery/Web-Content/trickest-robots-disallowed-wordlists/",
"Discovery/Web-Content/CMS/trickest-cms-wordlist/"
]
}
]
8 changes: 8 additions & 0 deletions .bin/wordlist-updaters/status.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"Jwt secrets update": {
"last_update": 1712376971
},
"Trickest wordlist update": {
"last_update": 1712310048
}
}
178 changes: 178 additions & 0 deletions .bin/wordlist-updaters/updater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/usr/bin/env python3

import os
import re
import json
import requests
import subprocess
from datetime import datetime, timedelta

# TODO Summary file
# TODO Advanced crontab syntax

# All paths are relative to the repository root (the workflow's CWD).
BASE_PATH = ".bin/wordlist-updaters"
SOURCE_PATH = os.path.join(BASE_PATH, "sources.json")
STATUS_PATH = os.path.join(BASE_PATH, "status.json")
# Matches "<N>d", "<N>h" or "<N>d<N>h" — days must come before hours.
# Group 1 captures the day count, group 3 the hour count (either may be empty).
FREQUENCY_REGEX = r"^(?:([0-9]+)d|())(?:([0-9]+)h|())(?!.*?d)$"
# Accepted values for a source's "type" field.
VALID_TYPES = ["file", "git_dir"]
# Captured once so every scheduling comparison below uses the same instant.
TIME_NOW = datetime.now()

def request_wrapper(url):
    """GET *url* with up to 3 attempts and return the response body text.

    Exits the process with status 2 when all attempts fail.
    BUGFIX: network-level errors (connection reset, DNS failure, timeout)
    raised by requests.get previously crashed the updater; they now count
    as a failed attempt and are retried like a non-200 response.
    """
    for i in range(1, 4):
        try:
            # Timeout keeps a stalled remote from hanging the CI job forever.
            r = requests.get(url, timeout=60)
            if r.status_code == 200:
                # print("[+] Got %s successfully!"%(url))
                return r.text
        except requests.RequestException:
            pass  # treated the same as a bad status code: retry below
        if i == 3:
            print("[!] Failed to get %s."%(url))
            exit(2)
        print("[!] Getting %s failed(%i/3)"%(url,i))

# Both configuration files must exist before anything else happens; a
# missing one is a hard failure for the whole run.
for required_path, display_name in ((SOURCE_PATH, "Sources.json"),
                                    (STATUS_PATH, "Status.json")):
    if not os.path.isfile(required_path):
        print("[!] %s is missing!" % display_name)
        exit(2)

SOURCES = json.load(open(SOURCE_PATH, "r"))
STATUS = json.load(open(STATUS_PATH, "r"))

# Tasks whose schedule says they are due; filled by the validation loop.
to_check = []

# Validate every source and queue the ones whose schedule says they are due.
for source in SOURCES:
    task_name = source["name"]
    source_keys = source.keys()

    # A task with no status entry has never run: queue it unconditionally.
    if not task_name in STATUS.keys():
        print(f"[+] Queuing task {task_name} as task was never checked before")
        to_check.append(source)
        continue

    # --- structural validation of the source entry ---
    if not "output" in source_keys or not isinstance(source["output"], str):
        print(f"[!] Skipping task {task_name} as output field is missing/invalid")
        continue

    if not "type" in source_keys or not isinstance(source["type"], str):
        print(f"[!] Skipping task {task_name} as type field is missing/invalid")
        continue

    if not source["type"] in VALID_TYPES:
        print(f"[!] Skipping task {task_name} as type is invalid")
        continue

    if source["output"].startswith("/"):
        print(f"[!] Skipping task {task_name} as output path is not relative.")
        continue

    if source["type"].startswith("git_") and not source["source"].endswith(".git"):
        print(f"[!] Skipping task {task_name} as a git task was defined with a non git url.")
        continue

    if not "last_update" in STATUS[task_name].keys() or not isinstance(STATUS[task_name]["last_update"], int):
        print(f"[!] Queuing task {task_name} as last_update field is missing/invalid")
        to_check.append(source)
        continue

    # Exactly one of the two scheduling fields may be present.
    if not ("frequency" in source_keys) ^ ("update_time" in source_keys):
        print(f"[!] Skipping task {task_name} as only frequency or update_time can be specified")
        continue

    if "frequency" in source_keys and isinstance(source["frequency"], str):
        regex_match = re.search(FREQUENCY_REGEX, source["frequency"])

        if not regex_match:
            print(f"[!] Skipping task {task_name} as frequency field contains invalid formatting of days and hours")
            continue

        days, _, hours, _ = regex_match.groups()

        # BUGFIX: the original used `bool(x) | 0`, which collapsed every
        # count to 0 or 1 — e.g. "12h" became a 1-hour frequency.
        days = int(days) if days else 0
        hours = int(hours) if hours else 0

        next_update_time = datetime.fromtimestamp(STATUS[task_name]["last_update"]) + timedelta(days=days, hours=hours)
        time_from_update = TIME_NOW - next_update_time
        time_to_update = next_update_time - TIME_NOW

        if TIME_NOW < next_update_time:
            # BUGFIX: use total_seconds() — timedelta.seconds ignores the
            # days component, so a multi-day wait could look like < 5 min.
            seconds_to_update = int(time_to_update.total_seconds())
            if seconds_to_update <= 300:
                print(f"[+] Queuing task {task_name} as it is less than 5 minutes to update. ({seconds_to_update} seconds to update)")
                to_check.append(source)
                continue

            print(f"[!] Skipping task {task_name} as it is more than 5 minutes to update ({seconds_to_update} seconds to update)")
            continue

        # Past due. BUGFIX: report the elapsed time (time_from_update);
        # the original printed the negative delta's .seconds field.
        print(f"[+] Queuing task {task_name} as it is {int(time_from_update.total_seconds())} seconds after scheduled update time.")
        to_check.append(source)

    elif "update_time" in source_keys and isinstance(source["update_time"], str):
        update_time = source["update_time"]

        # BUGFIX: the original `len(...) != 4 and update_time.isnumeric()`
        # let non-numeric 4-char values through to int() (crash) while
        # rejecting only short numeric ones. Require exactly 4 digits.
        if len(update_time) != 4 or not update_time.isnumeric():
            print(f"[!] Skipping task {task_name} as it is in a incorrect format")
            continue

        hours = int(update_time[:2])
        minutes = int(update_time[2:])

        # BUGFIX: 24-hour clock is 00-23 / 00-59; the original ranges
        # rejected midnight ("00xx") and accepted the invalid 24/60.
        if not hours in range(0, 24):
            print(f"[!] Skipping task {task_name} as hours is not in range 0-23.")
            continue

        if not minutes in range(0, 60):
            print(f"[!] Skipping task {task_name} as minutes is not in range 0-59.")
            continue

        # Queue when the scheduled wall-clock time falls within the next
        # hour of this (hourly) run.
        scheduled_update_time = TIME_NOW.replace(hour=hours, minute=minutes)
        if TIME_NOW <= scheduled_update_time and TIME_NOW + timedelta(hours=1) >= scheduled_update_time:
            print(f"[+] Queuing task {task_name} as update time is within the next hour")
            to_check.append(source)
            continue

    else:
        print(f"[!] Skipping task {task_name} as update_time field is invalid")
        continue

# Nothing due this run — bail out successfully.
if not to_check:
    print(f"[!] No task were queued. Exiting.")
    exit()

print(f"[+] Queued a total of {len(to_check)} tasks to run.")

# Run every queued task, recording a fresh last_update timestamp on success.
for task in to_check:
    print(f"[+] Starting task {task['name']}")

    if not task["name"] in STATUS.keys():
        STATUS[task["name"]] = {}

    task_type = task["type"]

    if task_type == "file":
        content = request_wrapper(task["source"])
        # BUGFIX: use a context manager so the handle is flushed and closed
        # instead of relying on interpreter cleanup.
        with open(task["output"], "w") as out_file:
            out_file.write(content)
        print(f"[+] Saved file to output location")

        STATUS[task["name"]]["last_update"] = int(datetime.now().timestamp())

    elif task_type == "git_dir":
        if not os.path.exists(task['output']):
            print(f"[+] Making directory {task['output']}")
            os.makedirs(task["output"])

        # Shallow clone into the working dir; the post-run script is then
        # responsible for copying/patching files out of it.
        subprocess.run(["git", "clone", "-q", "--depth=1", task["source"]], cwd=task["output"])
        STATUS[task["name"]]["last_update"] = int(datetime.now().timestamp())

    # BUGFIX: post_run_script is optional per the README; .get() avoids a
    # KeyError when the field is omitted from sources.json.
    if task.get("post_run_script"):
        print("[+] Running post run script")
        subprocess.run(task["post_run_script"])
        print("[+] Finished running post run script")

    print(f"[+] Finished task {task['name']}")

# Persist the updated timestamps so the next run knows when tasks last ran.
with open(STATUS_PATH, "w") as status_file:
    json.dump(STATUS, status_file, indent=4)
Loading
Loading