Skip to content

Commit

Permalink
site-add: attempt HTML to structured data
Browse files: browse the repository at this point in the history
  • Loading branch information
chapmanjacobd committed Apr 17, 2024
1 parent c90153d commit a55c035
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 128 deletions.
128 changes: 0 additions & 128 deletions xklb/createdb/fs_add.py
Expand Up @@ -317,134 +317,6 @@ def munge_image_tags(m: dict, e: dict) -> dict:
),
}

for s in (
"Offset",
"Pattern",
"SubSampling",
"Zoom",
"Gain",
"FileNumber",
"RedEye",
"Conditional",
"RedBalance",
"BlueBalance",
"LensInfo",
"Shutter",
"FileSource",
"Chromaticities",
"Owner",
"DOF",
"Mode",
"Sensitivity",
"Exposure",
"Levels",
"Gamma",
"YCbCr",
"Serial",
"PDF",
"ObjectName",
"YCbCrPositioning",
":ISO",
"ExposureProgram",
"FNumber",
"SensingMethod",
"SubjectDistance",
"Scene",
"Sharpness",
"Saturation",
"Flash",
"Contrast",
"MeteringMode",
"InteropIndex",
"ExposureCompensation",
"BrightnessValue",
"CustomRendered",
"ComponentsConfiguration",
"ShutterSpeed",
"ExposureMode",
"Aperture",
"ScaleFactor",
"LightValue",
"GPSProcessingMethod",
"GPSPosition",
"focalDistance",
"GPSDOP",
"FocalLength",
"FOV",
"CircleOfConfusion",
"GPSLatitudeRef",
"GPSLongitudeRef",
"Thumbnail",
"PrintStyle",
"Angle",
"Altitude",
"Displayed",
"WriterName",
"ReaderName",
"Date",
"History",
"Version",
"Compression",
"Digest",
"PrintPosition",
"PrintScale",
"Copyright",
"WhiteBalance",
"WhitePoint",
"ColorSpace",
"ColorTransform",
"ColorComponents",
"LightSource",
"Swatch",
"Profile",
"XMP:",
"ByteOrder",
"Comment",
"BitsPerSample",
"BitsPerPixel",
"Interpretation",
"EncodingProcess",
"Megapixels",
"PixelAspectRatio",
"ImageSize",
"PhotoshopFormat",
"OriginalTransmissionReference",
"Time",
"NumSlices",
"ImageUniqueID",
"HasRealMergedData",
"CodedCharacterSet",
"Flags0",
"Flags1",
"Padding",
"ProgressiveScans",
"HasColorMap",
"Ducky:Quality",
"PhotoshopQuality",
"SlicesGroupName",
"SupplementalCategories",
"Duration",
"Animation",
"FrameCount",
):
pop_substring_keys(e, s)

for k in (
"File:FileName",
"File:Directory",
"File:FileSize",
"File:MIMEType",
"File:FilePermissions",
"File:FileTypeExtension",
"File:FileType",
"IPTC:SpecialInstructions",
"File:Exif",
):
e.pop(k, None)

if e != {}:
log.info("Extra data %s", e)

return m


Expand Down
24 changes: 24 additions & 0 deletions xklb/createdb/site_add.py
Expand Up @@ -14,6 +14,7 @@ def parse_args() -> argparse.Namespace:
parser.set_defaults(selenium=True)

parser.add_argument("--local-file", "--local-html", action="store_true", help="Treat paths as Local HTML files")
parser.add_argument("--extract-html", action="store_true", help="Extract data from HTML")

arggroups.debug(parser)
arggroups.database(parser)
Expand Down Expand Up @@ -133,6 +134,29 @@ def response_interceptor(request, response):
for d in tables:
db_thread[d["table_name"]].insert_all(iterables.list_dict_filter_bool(d["data"]), alter=True) # type: ignore

if (
args.extract_html
and response
and response.status_code // 100 == 2 # HTTP 2xx
and "Content-Type" in response.headers
and response.headers["Content-Type"].startswith(("text/html",))
):
body = decode(response.body, response.headers.get("Content-Encoding", "identity"))
body = body.decode()

import xmltodict

o = xmltodict.parse(body)
tables = nosql_to_sql(o)

if args.verbose > 2:
breakpoint()

tables = db_utils.add_missing_table_names(args, tables)
db_thread = db_utils.connect(argparse.Namespace(database=args.database, verbose=args.verbose))
for d in tables:
db_thread[d["table_name"]].insert_all(iterables.list_dict_filter_bool(d["data"]), alter=True) # type: ignore

elif (
response
and "Content-Type" in response.headers
Expand Down
5 changes: 5 additions & 0 deletions xklb/usage.py
Expand Up @@ -1673,6 +1673,11 @@ def play(action) -> str:
library siteadd jobs.st.db --poke https://hk.jobsdb.com/hk/search-jobs/python/
Requires selenium-wire
Requires xmltodict when using --extract-html
pip install selenium-wire xmltodict
Run with `-vv` to see and interact with the browser
"""

Expand Down

0 comments on commit a55c035

Please sign in to comment.