basic metadata extraction

guigrpa · Aug 18, 2020 · 1cfe9f5 · 1cfe9f5
1 parent f5af303
commit 1cfe9f5
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 9 deletions.
diff --git a/src/__tests__/unit.test.ts b/src/__tests__/unit.test.ts
@@ -1,6 +1,6 @@
 import path from 'path';
 import { zipLoad } from '../zip';
-import { readContentTypes, getMainDoc } from '../main';
+import { readContentTypes, getMainDoc, getMetadata } from '../main';
 import fs from 'fs';
 import { setDebugLogSink } from '../debug';
 
@@ -26,3 +26,32 @@ describe('[Content_Types].xml parser', () => {
     expect(main_doc).toStrictEqual('document2.xml');
   });
 });
+
+// Pins every property getMetadata reports for the simpleQuery.docx fixture.
+describe('getMetadata', () => {
+  it('finds the number of pages', async () => {
+    const fixturePath = path.join(__dirname, 'fixtures', 'simpleQuery.docx');
+    const docx = await fs.promises.readFile(fixturePath);
+    const metadata = await getMetadata(docx);
+    expect(metadata).toMatchInlineSnapshot(`
+      Object {
+        "category": undefined,
+        "characters": 24,
+        "company": undefined,
+        "created": "2015-08-16T18:55:00Z",
+        "creator": "Unga Graorg",
+        "description": undefined,
+        "lastModifiedBy": "Grau Panea, Guillermo",
+        "lastPrinted": undefined,
+        "lines": 1,
+        "modified": "2016-12-15T11:21:00Z",
+        "pages": 1,
+        "paragraphs": 1,
+        "revision": "32",
+        "subject": undefined,
+        "template": "Normal.dotm",
+        "title": undefined,
+        "words": 4,
+      }
+    `);
+  });
+});
diff --git a/src/main.ts b/src/main.ts
@@ -330,18 +330,74 @@ export async function listCommands(
   return commands;
 }
 
-export async function readContentTypes(zip: JSZip): Promise<NonTextNode> {
-  const contentTypesXml = await zipGetText(zip, CONTENT_TYPES_PATH);
-  if (contentTypesXml == null)
-    throw new TemplateParseError(`${CONTENT_TYPES_PATH} could not be read`);
-  const node = await parseXml(contentTypesXml);
+/**
+ * Extracts basic document properties from a template archive.
+ *
+ * The template is opened with `zipLoad`, then `docProps/app.xml` and
+ * `docProps/core.xml` are parsed and a fixed set of their direct child
+ * elements is read. A property is `undefined` when its element is missing
+ * or has no children.
+ *
+ * @param template - Raw bytes of the template file.
+ * @returns An object with numeric counters (`pages`, `words`, `characters`,
+ *   `lines`, `paragraphs`) and string properties taken verbatim from the XML.
+ * @throws Whatever `parsePath` throws when either part cannot be read or
+ *   parsed, and `Error('Not a text node')` when a looked-up element's first
+ *   child is not a text node.
+ */
+export async function getMetadata(template: Buffer) {
+  const app_xml_path = `docProps/app.xml`;
+  const core_xml_path = `docProps/core.xml`;
+  const zip = await zipLoad(template);
+  const appXml = await parsePath(zip, app_xml_path);
+  const coreXml = await parsePath(zip, core_xml_path);
+  // TODO: custom.xml as well?
+
+  // Text of the element's first child, or undefined for an empty element.
+  function getText(t: Node): string | undefined {
+    if (t._children.length === 0) return undefined;
+    const n = t._children[0];
+    if (n._fTextNode) return n._text;
+    throw new Error(`Not a text node`);
+  }
+
+  // Text of the first direct child element of `m` whose tag equals `tag`.
+  function findNodeText(m: Node, tag: string): string | undefined {
+    for (const t of m._children) {
+      if (t._fTextNode) continue;
+      if (t._tag === tag) return getText(t);
+    }
+    return undefined;
+  }
+
+  // Converts property text to a finite number, or undefined when that is not
+  // possible. Blank text is rejected up front because Number('') and
+  // Number('  ') evaluate to 0, which would be reported as a real count.
+  const numberize = (a: string | undefined): number | undefined => {
+    if (a === undefined || a.trim() === '') return undefined;
+    const c = Number(a);
+    return Number.isFinite(c) ? c : undefined;
+  };
+
+  return {
+    pages: numberize(findNodeText(appXml, 'Pages')),
+    words: numberize(findNodeText(appXml, 'Words')),
+    characters: numberize(findNodeText(appXml, 'Characters')),
+    lines: numberize(findNodeText(appXml, 'Lines')),
+    paragraphs: numberize(findNodeText(appXml, 'Paragraphs')),
+    company: findNodeText(appXml, 'Company'),
+    template: findNodeText(appXml, 'Template'),
+
+    // from CoreXML
+    title: findNodeText(coreXml, 'dc:title'),
+    subject: findNodeText(coreXml, 'dc:subject'),
+    creator: findNodeText(coreXml, 'dc:creator'),
+    description: findNodeText(coreXml, 'dc:description'),
+    lastModifiedBy: findNodeText(coreXml, 'cp:lastModifiedBy'),
+    revision: findNodeText(coreXml, 'cp:revision'),
+    lastPrinted: findNodeText(coreXml, 'cp:lastPrinted'),
+    created: findNodeText(coreXml, 'dcterms:created'),
+    modified: findNodeText(coreXml, 'dcterms:modified'),
+    category: findNodeText(coreXml, 'cp:category'),
+  };
+}
+
+/**
+ * Reads one XML part out of the zip archive and parses it.
+ *
+ * @param zip - Archive to read from.
+ * @param xml_path - Path of the part inside the archive; also used in the
+ *   error messages.
+ * @returns The node produced by `parseXml` for that part.
+ * @throws TemplateParseError if `zipGetText` yields null/undefined for the
+ *   path, or if the parsed result is a text node.
+ */
+async function parsePath(zip: JSZip, xml_path: string): Promise<NonTextNode> {
+  const xmlFile = await zipGetText(zip, xml_path);
+  if (xmlFile == null)
+    throw new TemplateParseError(`${xml_path} could not be read`);
+  const node = await parseXml(xmlFile);
   if (node._fTextNode)
-    throw new TemplateParseError(
-      `${CONTENT_TYPES_PATH} is a text node when parsed`
-    );
+    throw new TemplateParseError(`${xml_path} is a text node when parsed`);
   return node;
 }
 
+/**
+ * Loads and parses the content-types part of the archive.
+ *
+ * @param zip - Archive to read from.
+ * @returns The parsed node for `CONTENT_TYPES_PATH`.
+ * @throws Whatever `parsePath` throws for that path.
+ */
+export async function readContentTypes(zip: JSZip): Promise<NonTextNode> {
+  const contentTypes = await parsePath(zip, CONTENT_TYPES_PATH);
+  return contentTypes;
+}
+
 export function getMainDoc(contentTypes: NonTextNode): string {
   const MAIN_DOC_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml' as const;
   for (const t of contentTypes._children) {