Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge v2 into main #61

Merged
merged 25 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9d0084e
[#47,refactor][m]: create a process.ts file that just does extraction…
rufuspollock Nov 15, 2023
309757e
update index.mdx for testing
mohamedsalem401 Nov 15, 2023
8fcda1d
Add process function and add some tests
mohamedsalem401 Nov 15, 2023
9b44ab1
[ #4 , extract links ] add link extraction
mohamedsalem401 Nov 16, 2023
fae96aa
Merge pull request #56 from datopian/links-extraction
mohamedsalem401 Nov 17, 2023
fed0e28
[#49, Add body tags extraction]
mohamedsalem401 Nov 17, 2023
30106b7
Merge pull request #57 from datopian/body-tags
mohamedsalem401 Nov 17, 2023
40dfe1d
#58 refactor mddb code
mohamedsalem401 Nov 20, 2023
5378eff
Integrate extracting tags from body
mohamedsalem401 Nov 20, 2023
cb40cc3
Integrate links extraction
mohamedsalem401 Nov 20, 2023
26a7164
Merge pull request #59 from datopian/refactor-2
mohamedsalem401 Nov 21, 2023
0b03330
Add changeset
mohamedsalem401 Nov 21, 2023
502d808
Merge branch 'main' into v2
mohamedsalem401 Nov 21, 2023
c95e78e
update include config
mohamedsalem401 Nov 21, 2023
5c9eaa7
Restore pathToUrlResolver
mohamedsalem401 Nov 21, 2023
826ed81
update tests
mohamedsalem401 Nov 21, 2023
a8a9ec7
Implement obsidian links
mohamedsalem401 Nov 21, 2023
223a566
No need for this
mohamedsalem401 Nov 21, 2023
98a4b04
Update tests for links
mohamedsalem401 Nov 21, 2023
b8bc45a
Update tests for links
mohamedsalem401 Nov 22, 2023
338a916
Merge branch 'update-tests' of https://github.com/datopian/markdowndb…
mohamedsalem401 Nov 22, 2023
c313c36
undo changing tests
mohamedsalem401 Nov 22, 2023
7d73442
Merge pull request #62 from datopian/update-tests
mohamedsalem401 Nov 22, 2023
d04f225
Update obsidian links test
mohamedsalem401 Nov 22, 2023
eea5b9b
Merge pull request #63 from datopian/update-tests
mohamedsalem401 Nov 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/strange-ads-tan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"mddb": minor
---

Add Tags Extraction from Markdown Content.
Resolved issues with link extraction from Markdown documents.
Conducted code refactoring for improved readability and maintainability.
5 changes: 4 additions & 1 deletion __mocks__/content/index.mdx
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
---
title: Homepage
tags: tag1, tag2, tag3
---

# Welcome
# Welcome

[link](blog0.mdx)
33 changes: 33 additions & 0 deletions src/lib/indexFolder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { recursiveWalkDir } from "../utils/index.js";
import { FileInfo, processFile } from "./process.js";

/**
 * Recursively walk `folderPath` and build a FileInfo object for every file
 * that is not excluded by `ignorePatterns`.
 *
 * Note: the full (unfiltered) path list is passed to processFile as the
 * permalink universe, matching the previous behavior.
 */
export function indexFolder(
  folderPath: string,
  pathToUrlResolver: (filePath: string) => string,
  ignorePatterns?: RegExp[]
) {
  const allFilePaths = recursiveWalkDir(folderPath);
  const files: FileInfo[] = allFilePaths
    .filter((candidate) => shouldIncludeFile(candidate, ignorePatterns))
    .map((candidate) =>
      processFile(folderPath, candidate, pathToUrlResolver, allFilePaths)
    );
  return files;
}

// A file is included unless at least one of the ignore patterns matches it.
// When no patterns are supplied, every file is included.
function shouldIncludeFile(
  filePath: string,
  ignorePatterns?: RegExp[]
): boolean {
  if (!ignorePatterns) {
    return true;
  }
  return ignorePatterns.every((pattern) => !pattern.test(filePath));
}
5 changes: 4 additions & 1 deletion src/lib/markdowndb.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,14 @@
test("can find file by url path", async () => {
const dbFile = await mddb.getFileByUrl("blog/blog2");
expect(dbFile).not.toBeNull();
expect(dbFile!.url_path).toBe("blog/blog2");

Check warning on line 176 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});

test("can find file by id", async () => {
const dbFile = await mddb.getFileByUrl("blog/blog2");
const dbFileById = await mddb.getFileById(dbFile!._id);

Check warning on line 181 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
expect(dbFileById).not.toBeNull();
expect(dbFileById!.url_path).toBe("blog/blog2");

Check warning on line 183 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});
});

Expand All @@ -189,6 +189,9 @@
test("can get all tags", async () => {
const dbTags = await mddb.getTags();
const extectedTags = [
{ name: "tag1" },
{ name: "tag2" },
{ name: "tag3" },
{ name: "economy" },
{ name: "politics" },
{ name: "sports" },
Expand All @@ -208,10 +211,10 @@
const toFile = await mddb.getFileByUrl("blog0");

const forwardLinks = await mddb.getLinks({
fileId: fromFile!._id,

Check warning on line 214 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});
expect(forwardLinks.length).toBe(1);
expect(forwardLinks[0].to).toBe(toFile!._id);

Check warning on line 217 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});

test("can get all backward links of a file", async () => {
Expand All @@ -220,13 +223,13 @@
const fromFile2 = await mddb.getFileByUrl("blog/blog1");

const backwardLinks = await mddb.getLinks({
fileId: toFile!._id,

Check warning on line 226 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
direction: "backward",
});
const backwardLinksFileIds = backwardLinks.map((l) => l.from);
expect(backwardLinksFileIds).toHaveLength(2);
expect(backwardLinksFileIds).toContain(fromFile1!._id);

Check warning on line 231 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
expect(backwardLinksFileIds).toContain(fromFile2!._id);

Check warning on line 232 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});
});

Expand Down Expand Up @@ -268,7 +271,7 @@
},
];
// TODO fix types
expect(() => MddbFile.batchInsert(mddb as any, files)).toThrow();

Check warning on line 274 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Unexpected any. Specify a different type
});
});
});
Expand All @@ -289,7 +292,7 @@
await mddb.init();
await mddb.indexFolder({
folderPath: pathToContentFixture,
ignorePatterns: [/\/ignore\/.*/],
ignorePatterns: [/[\\/]ignore[\\/].*/],
pathToUrlResolver: (path) =>
path
.replace(/\.mdx?$/, "")
Expand Down
200 changes: 31 additions & 169 deletions src/lib/markdowndb.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
import crypto from "crypto";
import fs from "fs";
import path from "path";
import knex, { Knex } from "knex";

import { recursiveWalkDir, parseFile, WikiLink } from "../utils/index.js";
import { MddbFile, MddbTag, MddbLink, MddbFileTag } from "./schema.js";
import { indexFolder } from "./indexFolder.js";
import {
File,
MddbFile,
Link,
Tag,
FileTag,
MddbTag,
MddbFileTag,
MddbLink,
} from "./schema.js";
resetDatabaseTables,
mapFileToInsert,
mapLinksToInsert,
isLinkToDefined,
mapFileTagsToInsert,
getUniqueValues,
} from "../utils/databaseUtils.js";

const defaultFilePathToUrl = (filePath: string) => {
let url = filePath
Expand All @@ -24,7 +21,7 @@
return encodeURI(url);
};

const resolveLinkToUrlPath = (link: string, sourceFilePath?: string) => {

Check warning on line 24 in src/lib/markdowndb.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

'resolveLinkToUrlPath' is assigned a value but never used
if (!sourceFilePath) {
return link;
}
Expand Down Expand Up @@ -60,164 +57,29 @@
ignorePatterns?: RegExp[];
pathToUrlResolver?: (filePath: string) => string;
}) {
// Temporary, we don't want to handle updates now
// so database is refreshed every time the folder
// is indexed
await MddbFile.deleteTable(this.db);
await MddbTag.deleteTable(this.db);
await MddbFileTag.deleteTable(this.db);
await MddbLink.deleteTable(this.db);

await MddbFile.createTable(this.db);
await MddbTag.createTable(this.db);
await MddbFileTag.createTable(this.db);
await MddbLink.createTable(this.db);

const filePathsToIndex = recursiveWalkDir(folderPath);

const filesToInsert: File[] = [];
const fileTagsToInsert: FileTag[] = [];
// TODO shouldn't available tags be explicitly defined in some config file
// instead of being extracted from all files? I think it's better even from user perspective
// as he can easily manage and see all the tags he is using
// (he can qickly look up tag if he's not sure what term he was using in other files)
// + it's easier to implement
const tagsToInsert: Tag[] = [];
const linksToInsert: Link[] = [];

// TODO is there a better way to do this?
// Temporary containter for storing links extracted from each file
// as a map of file id -> extracted links.
// This is used after all files have been parsed and added to filesToInsert
// to resolve paths in links to target file ids
const filesLinksMap: {
[fileId: string]: {
url: string;
links: WikiLink[];
};
} = {};

for (const filePath of filePathsToIndex) {
if (ignorePatterns.some((pattern) => pattern.test(filePath))) {
continue;
}

// id
// TODO this can be autogenerated by database
const encodedPath = Buffer.from(filePath, "utf-8").toString();
const id = crypto.createHash("sha1").update(encodedPath).digest("hex");

// extension
const [, extension] = filePath.match(/.(\w+)$/) || [];

if (!MddbFile.supportedExtensions.includes(extension)) {
filesToInsert.push({
_id: id,
file_path: filePath,
extension,
url_path: null,
filetype: null,
metadata: null,
});
continue;
}

// url_path
const pathRelativeToFolder = path.relative(folderPath, filePath);
const urlPath = pathToUrlResolver(pathRelativeToFolder);

// metadata, tags, links
const source: string = fs.readFileSync(filePath, {
encoding: "utf8",
flag: "r",
});

const { metadata, links } = parseFile(source, {
permalinks: filePathsToIndex,
});
const filetype = metadata?.type || null;

// TODO is there a better way to do this?
filesLinksMap[id] = {
url: urlPath,
links,
};

const tags = metadata?.tags || [];
tags.forEach((tag: string) => {
if (!tagsToInsert.some((t) => t.name === tag)) {
tagsToInsert.push({ name: tag });
}
fileTagsToInsert.push({ file: id, tag });
});

filesToInsert.push({
_id: id,
file_path: filePath,
extension,
url_path: urlPath,
filetype,
metadata,
});
}

Object.entries(filesLinksMap).forEach(([fileId, { url, links }]) => {
links.forEach(({ linkSrc, linkType }) => {
const destPath = resolveLinkToUrlPath(linkSrc, url);
const destFile = filesToInsert.find(
(file) => file.url_path === destPath
);
if (!destFile) {
return;
}
const linkToInsert = {
// _id: id,
from: fileId,
to: destFile._id,
link_type: linkType,
};
linksToInsert.push(linkToInsert);
});
});

if (filesToInsert.length >= 500) {
for (let i = 0; i < filesToInsert.length; i += 500) {
await MddbFile.batchInsert(this.db, filesToInsert.slice(i, i + 500));
}
} else {
await MddbFile.batchInsert(this.db, filesToInsert);
}

// TODO what happens if some of the files were not inserted?
// I guess inserting tags or links with such files used as foreign keys will fail too,
// but need to check

if (tagsToInsert.length >= 500) {
for (let i = 0; i < tagsToInsert.length; i += 500) {
await MddbTag.batchInsert(this.db, tagsToInsert.slice(i, i + 500));
}
} else {
await MddbTag.batchInsert(this.db, tagsToInsert);
}

if (fileTagsToInsert.length >= 500) {
for (let i = 0; i < fileTagsToInsert.length; i += 500) {
await MddbFileTag.batchInsert(
this.db,
fileTagsToInsert.slice(i, i + 500)
);
}
} else {
await MddbFileTag.batchInsert(this.db, fileTagsToInsert);
}
await resetDatabaseTables(this.db);

const fileObjects = indexFolder(
folderPath,
pathToUrlResolver,
ignorePatterns
);
const filesToInsert = fileObjects.map(mapFileToInsert);
const uniqueTags = getUniqueValues(
fileObjects.flatMap((file) => file.tags)
);
const tagsToInsert = uniqueTags.map((tag) => ({ name: tag }));
const linksToInsert = fileObjects
.flatMap((fileObject) => {
return mapLinksToInsert(filesToInsert, fileObject);
})
.filter(isLinkToDefined);
const fileTagsToInsert = fileObjects.flatMap(mapFileTagsToInsert);

if (linksToInsert.length >= 500) {
for (let i = 0; i < linksToInsert.length; i += 500) {
await MddbLink.batchInsert(this.db, linksToInsert.slice(i, i + 500));
}
} else {
await MddbLink.batchInsert(this.db, linksToInsert);
}
await MddbFile.batchInsert(this.db, filesToInsert);
await MddbTag.batchInsert(this.db, tagsToInsert);
await MddbFileTag.batchInsert(this.db, fileTagsToInsert);
await MddbLink.batchInsert(this.db, getUniqueValues(linksToInsert));
}

async getFileById(id: string): Promise<MddbFile | null> {
Expand Down
36 changes: 36 additions & 0 deletions src/lib/process.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { processFile } from "./process";
import Path from "path";

// Unit tests for processFile: checks the extracted file info (path, url,
// extension, tags, frontmatter metadata, and wiki links) against the
// __mocks__/content/index.mdx fixture.
describe("Can parse a file and get file info", () => {
  const contentFixtureDir = "__mocks__/content";

  test("can parse a file", async () => {
    const relativePath = "index.mdx";
    const absolutePath = Path.join(contentFixtureDir, relativePath);

    // Identity url resolver; no permalinks are supplied.
    const result = processFile(contentFixtureDir, absolutePath, (p) => p, []);

    expect(result.file_path).toBe(absolutePath);
    expect(result.url_path).toBe("index.mdx");
    expect(result.extension).toBe("mdx");
    expect(result.tags).toEqual(["tag1", "tag2", "tag3"]);
    expect(result.metadata).toEqual({
      title: "Homepage",
      tags: ["tag1", "tag2", "tag3"],
    });
    expect(result.links).toEqual([
      {
        embed: false,
        from: "index.mdx",
        internal: true,
        text: "link",
        to: "blog0.mdx",
        toRaw: "blog0.mdx",
      },
    ]);
  });
});
71 changes: 71 additions & 0 deletions src/lib/process.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import crypto from "crypto";
import fs from "fs";
import path from "path";

import { parseFile, WikiLink } from "../utils/index.js";
import { File } from "./schema.js";

// FileInfo augments the database File row with the raw values extracted from
// the file's markdown content, before they are inserted into separate tables.
export interface FileInfo extends File {
  tags: string[]; // tag names from frontmatter (empty for unparsed files)
  links: WikiLink[]; // wiki/markdown links found in the body
}

// Extraction of the per-file parsing logic from markdowndb.ts, free of any
// SQL concerns: reads one file and returns its FileInfo.
// TODO: add back (as an option) - providing a "root folder" path for resolve
/**
 * Parse a single file into a FileInfo object.
 *
 * @param rootFolder - folder the index was built from; paths are made
 *   relative to it for id generation and url resolution
 * @param filePath - full path of the file to process
 * @param pathToUrlResolver - maps a root-relative path to its url path
 * @param filePathsToIndex - all indexed paths, used as the permalink set
 *   when resolving links
 * @returns FileInfo; for non-markdown files only _id, file_path and
 *   extension are populated
 */
export function processFile(
  rootFolder: string,
  filePath: string,
  pathToUrlResolver: (filePath: string) => string,
  filePathsToIndex: string[]
) {
  // Path relative to the indexed folder; basis for the id and the url.
  const relativePath = path.relative(rootFolder, filePath);

  // Stable id derived from the relative path. (The previous
  // Buffer.from(...).toString() round-trip was an identity no-op, so the
  // hash input is unchanged.)
  // TODO this could be autogenerated by the database instead.
  const id = crypto.createHash("sha1").update(relativePath).digest("hex");

  // Extension without the leading dot ("" when the file has none).
  const extension = path.extname(relativePath).slice(1);

  const fileInfo: FileInfo = {
    _id: id,
    file_path: filePath,
    extension,
    url_path: null,
    filetype: null,
    metadata: {},
    tags: [],
    links: [],
  };

  // Only markdown files are parsed further; everything else is indexed
  // with the bare info above.
  const isExtensionSupported = extension === "md" || extension === "mdx";
  if (!isExtensionSupported) {
    return fileInfo;
  }

  const source: string = fs.readFileSync(filePath, {
    encoding: "utf8",
    flag: "r",
  });

  // Extract frontmatter metadata and links from the markdown source.
  const { metadata, links } = parseFile(source, {
    from: relativePath,
    permalinks: filePathsToIndex,
  });

  fileInfo.url_path = pathToUrlResolver(relativePath);
  fileInfo.metadata = metadata;
  fileInfo.links = links;
  // The "type" frontmatter field designates the file type, when present.
  fileInfo.filetype = metadata?.type || null;
  fileInfo.tags = metadata?.tags || [];

  return fileInfo;
}
Loading