Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge v2 into main #61

Merged
merged 25 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9d0084e
[#47,refactor][m]: create a process.ts file that just does extraction…
rufuspollock Nov 15, 2023
309757e
update index.mdx for testing
mohamedsalem401 Nov 15, 2023
8fcda1d
Add process function and add some tests
mohamedsalem401 Nov 15, 2023
9b44ab1
[ #4 , extract links ] add link extraction
mohamedsalem401 Nov 16, 2023
fae96aa
Merge pull request #56 from datopian/links-extraction
mohamedsalem401 Nov 17, 2023
fed0e28
[#49, Add body tags extraction]
mohamedsalem401 Nov 17, 2023
30106b7
Merge pull request #57 from datopian/body-tags
mohamedsalem401 Nov 17, 2023
40dfe1d
#58 refactor mddb code
mohamedsalem401 Nov 20, 2023
5378eff
Integrate extracting tags from body
mohamedsalem401 Nov 20, 2023
cb40cc3
Integrate links extraction
mohamedsalem401 Nov 20, 2023
26a7164
Merge pull request #59 from datopian/refactor-2
mohamedsalem401 Nov 21, 2023
0b03330
Add changeset
mohamedsalem401 Nov 21, 2023
502d808
Merge branch 'main' into v2
mohamedsalem401 Nov 21, 2023
c95e78e
update include config
mohamedsalem401 Nov 21, 2023
5c9eaa7
Restore pathToUrlResolver
mohamedsalem401 Nov 21, 2023
826ed81
update tests
mohamedsalem401 Nov 21, 2023
a8a9ec7
Implement obsidian links
mohamedsalem401 Nov 21, 2023
223a566
No need for this
mohamedsalem401 Nov 21, 2023
98a4b04
Update tests for links
mohamedsalem401 Nov 21, 2023
b8bc45a
Update tests for links
mohamedsalem401 Nov 22, 2023
338a916
Merge branch 'update-tests' of https://github.com/datopian/markdowndb…
mohamedsalem401 Nov 22, 2023
c313c36
undo changing tests
mohamedsalem401 Nov 22, 2023
7d73442
Merge pull request #62 from datopian/update-tests
mohamedsalem401 Nov 22, 2023
d04f225
Update obsidian links test
mohamedsalem401 Nov 22, 2023
eea5b9b
Merge pull request #63 from datopian/update-tests
mohamedsalem401 Nov 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/strange-ads-tan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"mddb": minor
---

Add Tags Extraction from Markdown Content.
Resolved issues with link extraction from Markdown documents.
Conducted code refactoring for improved readability and maintainability.
5 changes: 4 additions & 1 deletion __mocks__/content/index.mdx
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
---
title: Homepage
tags: tag1, tag2, tag3
---

# Welcome
# Welcome

[link](blog0.mdx)
33 changes: 33 additions & 0 deletions src/lib/indexFolder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { recursiveWalkDir } from "../utils/index.js";
import { FileInfo, processFile } from "./process.js";

/**
 * Recursively walk `folderPath` and build a FileInfo object for every file
 * that is not excluded by `ignorePatterns`.
 *
 * Note: the full (unfiltered) path list is passed to processFile as the
 * permalink universe, matching the previous behavior.
 */
export function indexFolder(
  folderPath: string,
  pathToUrlResolver: (filePath: string) => string,
  ignorePatterns?: RegExp[]
) {
  const allFilePaths = recursiveWalkDir(folderPath);
  const files: FileInfo[] = allFilePaths
    .filter((candidate) => shouldIncludeFile(candidate, ignorePatterns))
    .map((candidate) =>
      processFile(folderPath, candidate, pathToUrlResolver, allFilePaths)
    );
  return files;
}

// A file is included unless at least one of the ignore patterns matches it.
// When no patterns are supplied, every file is included.
function shouldIncludeFile(
  filePath: string,
  ignorePatterns?: RegExp[]
): boolean {
  if (!ignorePatterns) {
    return true;
  }
  return ignorePatterns.every((pattern) => !pattern.test(filePath));
}
5 changes: 4 additions & 1 deletion src/lib/markdowndb.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,14 @@
test("can find file by url path", async () => {
const dbFile = await mddb.getFileByUrl("blog/blog2");
expect(dbFile).not.toBeNull();
expect(dbFile!.url_path).toBe("blog/blog2");

Check warning on line 176 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});

test("can find file by id", async () => {
const dbFile = await mddb.getFileByUrl("blog/blog2");
const dbFileById = await mddb.getFileById(dbFile!._id);

Check warning on line 181 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
expect(dbFileById).not.toBeNull();
expect(dbFileById!.url_path).toBe("blog/blog2");

Check warning on line 183 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});
});

Expand All @@ -189,6 +189,9 @@
test("can get all tags", async () => {
const dbTags = await mddb.getTags();
const extectedTags = [
{ name: "tag1" },
{ name: "tag2" },
{ name: "tag3" },
{ name: "economy" },
{ name: "politics" },
{ name: "sports" },
Expand All @@ -208,10 +211,10 @@
const toFile = await mddb.getFileByUrl("blog0");

const forwardLinks = await mddb.getLinks({
fileId: fromFile!._id,

Check warning on line 214 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});
expect(forwardLinks.length).toBe(1);
expect(forwardLinks[0].to).toBe(toFile!._id);

Check warning on line 217 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});

test("can get all backward links of a file", async () => {
Expand All @@ -220,13 +223,13 @@
const fromFile2 = await mddb.getFileByUrl("blog/blog1");

const backwardLinks = await mddb.getLinks({
fileId: toFile!._id,

Check warning on line 226 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
direction: "backward",
});
const backwardLinksFileIds = backwardLinks.map((l) => l.from);
expect(backwardLinksFileIds).toHaveLength(2);
expect(backwardLinksFileIds).toContain(fromFile1!._id);

Check warning on line 231 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
expect(backwardLinksFileIds).toContain(fromFile2!._id);

Check warning on line 232 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Forbidden non-null assertion
});
});

Expand Down Expand Up @@ -268,7 +271,7 @@
},
];
// TODO fix types
expect(() => MddbFile.batchInsert(mddb as any, files)).toThrow();

Check warning on line 274 in src/lib/markdowndb.spec.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

Unexpected any. Specify a different type
});
});
});
Expand All @@ -289,7 +292,7 @@
await mddb.init();
await mddb.indexFolder({
folderPath: pathToContentFixture,
ignorePatterns: [/\/ignore\/.*/],
ignorePatterns: [/[\\/]ignore[\\/].*/],
pathToUrlResolver: (path) =>
path
.replace(/\.mdx?$/, "")
Expand Down
200 changes: 31 additions & 169 deletions src/lib/markdowndb.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
import crypto from "crypto";
import fs from "fs";
import path from "path";
import knex, { Knex } from "knex";

import { recursiveWalkDir, parseFile, WikiLink } from "../utils/index.js";
import { MddbFile, MddbTag, MddbLink, MddbFileTag } from "./schema.js";
import { indexFolder } from "./indexFolder.js";
import {
File,
MddbFile,
Link,
Tag,
FileTag,
MddbTag,
MddbFileTag,
MddbLink,
} from "./schema.js";
resetDatabaseTables,
mapFileToInsert,
mapLinksToInsert,
isLinkToDefined,
mapFileTagsToInsert,
getUniqueValues,
} from "../utils/databaseUtils.js";

const defaultFilePathToUrl = (filePath: string) => {
let url = filePath
Expand All @@ -24,7 +21,7 @@
return encodeURI(url);
};

const resolveLinkToUrlPath = (link: string, sourceFilePath?: string) => {

Check warning on line 24 in src/lib/markdowndb.ts

View workflow job for this annotation

GitHub Actions / Lint & format check

'resolveLinkToUrlPath' is assigned a value but never used
if (!sourceFilePath) {
return link;
}
Expand Down Expand Up @@ -60,164 +57,29 @@
ignorePatterns?: RegExp[];
pathToUrlResolver?: (filePath: string) => string;
}) {
// Temporary, we don't want to handle updates now
// so database is refreshed every time the folder
// is indexed
await MddbFile.deleteTable(this.db);
await MddbTag.deleteTable(this.db);
await MddbFileTag.deleteTable(this.db);
await MddbLink.deleteTable(this.db);

await MddbFile.createTable(this.db);
await MddbTag.createTable(this.db);
await MddbFileTag.createTable(this.db);
await MddbLink.createTable(this.db);

const filePathsToIndex = recursiveWalkDir(folderPath);

const filesToInsert: File[] = [];
const fileTagsToInsert: FileTag[] = [];
// TODO shouldn't available tags be explicitly defined in some config file
// instead of being extracted from all files? I think it's better even from user perspective
// as he can easily manage and see all the tags he is using
// (he can qickly look up tag if he's not sure what term he was using in other files)
// + it's easier to implement
const tagsToInsert: Tag[] = [];
const linksToInsert: Link[] = [];

// TODO is there a better way to do this?
// Temporary containter for storing links extracted from each file
// as a map of file id -> extracted links.
// This is used after all files have been parsed and added to filesToInsert
// to resolve paths in links to target file ids
const filesLinksMap: {
[fileId: string]: {
url: string;
links: WikiLink[];
};
} = {};

for (const filePath of filePathsToIndex) {
if (ignorePatterns.some((pattern) => pattern.test(filePath))) {
continue;
}

// id
// TODO this can be autogenerated by database
const encodedPath = Buffer.from(filePath, "utf-8").toString();
const id = crypto.createHash("sha1").update(encodedPath).digest("hex");

// extension
const [, extension] = filePath.match(/.(\w+)$/) || [];

if (!MddbFile.supportedExtensions.includes(extension)) {
filesToInsert.push({
_id: id,
file_path: filePath,
extension,
url_path: null,
filetype: null,
metadata: null,
});
continue;
}

// url_path
const pathRelativeToFolder = path.relative(folderPath, filePath);
const urlPath = pathToUrlResolver(pathRelativeToFolder);

// metadata, tags, links
const source: string = fs.readFileSync(filePath, {
encoding: "utf8",
flag: "r",
});

const { metadata, links } = parseFile(source, {
permalinks: filePathsToIndex,
});
const filetype = metadata?.type || null;

// TODO is there a better way to do this?
filesLinksMap[id] = {
url: urlPath,
links,
};

const tags = metadata?.tags || [];
tags.forEach((tag: string) => {
if (!tagsToInsert.some((t) => t.name === tag)) {
tagsToInsert.push({ name: tag });
}
fileTagsToInsert.push({ file: id, tag });
});

filesToInsert.push({
_id: id,
file_path: filePath,
extension,
url_path: urlPath,
filetype,
metadata,
});
}

Object.entries(filesLinksMap).forEach(([fileId, { url, links }]) => {
links.forEach(({ linkSrc, linkType }) => {
const destPath = resolveLinkToUrlPath(linkSrc, url);
const destFile = filesToInsert.find(
(file) => file.url_path === destPath
);
if (!destFile) {
return;
}
const linkToInsert = {
// _id: id,
from: fileId,
to: destFile._id,
link_type: linkType,
};
linksToInsert.push(linkToInsert);
});
});

if (filesToInsert.length >= 500) {
for (let i = 0; i < filesToInsert.length; i += 500) {
await MddbFile.batchInsert(this.db, filesToInsert.slice(i, i + 500));
}
} else {
await MddbFile.batchInsert(this.db, filesToInsert);
}

// TODO what happens if some of the files were not inserted?
// I guess inserting tags or links with such files used as foreign keys will fail too,
// but need to check

if (tagsToInsert.length >= 500) {
for (let i = 0; i < tagsToInsert.length; i += 500) {
await MddbTag.batchInsert(this.db, tagsToInsert.slice(i, i + 500));
}
} else {
await MddbTag.batchInsert(this.db, tagsToInsert);
}

if (fileTagsToInsert.length >= 500) {
for (let i = 0; i < fileTagsToInsert.length; i += 500) {
await MddbFileTag.batchInsert(
this.db,
fileTagsToInsert.slice(i, i + 500)
);
}
} else {
await MddbFileTag.batchInsert(this.db, fileTagsToInsert);
}
await resetDatabaseTables(this.db);

const fileObjects = indexFolder(
folderPath,
pathToUrlResolver,
ignorePatterns
);
const filesToInsert = fileObjects.map(mapFileToInsert);
const uniqueTags = getUniqueValues(
fileObjects.flatMap((file) => file.tags)
);
const tagsToInsert = uniqueTags.map((tag) => ({ name: tag }));
const linksToInsert = fileObjects
.flatMap((fileObject) => {
return mapLinksToInsert(filesToInsert, fileObject);
})
.filter(isLinkToDefined);
const fileTagsToInsert = fileObjects.flatMap(mapFileTagsToInsert);

if (linksToInsert.length >= 500) {
for (let i = 0; i < linksToInsert.length; i += 500) {
await MddbLink.batchInsert(this.db, linksToInsert.slice(i, i + 500));
}
} else {
await MddbLink.batchInsert(this.db, linksToInsert);
}
await MddbFile.batchInsert(this.db, filesToInsert);
await MddbTag.batchInsert(this.db, tagsToInsert);
await MddbFileTag.batchInsert(this.db, fileTagsToInsert);
await MddbLink.batchInsert(this.db, getUniqueValues(linksToInsert));
}

async getFileById(id: string): Promise<MddbFile | null> {
Expand Down
36 changes: 36 additions & 0 deletions src/lib/process.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { processFile } from "./process";
import Path from "path";

// Unit tests for processFile: checks the extracted file info (path, url,
// extension, tags, frontmatter metadata, and wiki links) against the
// __mocks__/content/index.mdx fixture.
describe("Can parse a file and get file info", () => {
  const contentFixtureDir = "__mocks__/content";

  test("can parse a file", async () => {
    const relativePath = "index.mdx";
    const absolutePath = Path.join(contentFixtureDir, relativePath);

    // Identity url resolver; no permalinks are supplied.
    const result = processFile(contentFixtureDir, absolutePath, (p) => p, []);

    expect(result.file_path).toBe(absolutePath);
    expect(result.url_path).toBe("index.mdx");
    expect(result.extension).toBe("mdx");
    expect(result.tags).toEqual(["tag1", "tag2", "tag3"]);
    expect(result.metadata).toEqual({
      title: "Homepage",
      tags: ["tag1", "tag2", "tag3"],
    });
    expect(result.links).toEqual([
      {
        embed: false,
        from: "index.mdx",
        internal: true,
        text: "link",
        to: "blog0.mdx",
        toRaw: "blog0.mdx",
      },
    ]);
  });
});
71 changes: 71 additions & 0 deletions src/lib/process.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import crypto from "crypto";
import fs from "fs";
import path from "path";

import { parseFile, WikiLink } from "../utils/index.js";
import { File } from "./schema.js";

// FileInfo augments the database File row with the raw values extracted from
// the file's markdown content, before they are inserted into separate tables.
export interface FileInfo extends File {
  tags: string[]; // tag names from frontmatter (empty for unparsed files)
  links: WikiLink[]; // wiki/markdown links found in the body
}

// Extraction of the per-file parsing logic from markdowndb.ts, free of any
// SQL concerns: reads one file and returns its FileInfo.
// TODO: add back (as an option) - providing a "root folder" path for resolve
/**
 * Parse a single file into a FileInfo object.
 *
 * @param rootFolder - folder the index was built from; paths are made
 *   relative to it for id generation and url resolution
 * @param filePath - full path of the file to process
 * @param pathToUrlResolver - maps a root-relative path to its url path
 * @param filePathsToIndex - all indexed paths, used as the permalink set
 *   when resolving links
 * @returns FileInfo; for non-markdown files only _id, file_path and
 *   extension are populated
 */
export function processFile(
  rootFolder: string,
  filePath: string,
  pathToUrlResolver: (filePath: string) => string,
  filePathsToIndex: string[]
) {
  // Path relative to the indexed folder; basis for the id and the url.
  const relativePath = path.relative(rootFolder, filePath);

  // Stable id derived from the relative path. (The previous
  // Buffer.from(...).toString() round-trip was an identity no-op, so the
  // hash input is unchanged.)
  // TODO this could be autogenerated by the database instead.
  const id = crypto.createHash("sha1").update(relativePath).digest("hex");

  // Extension without the leading dot ("" when the file has none).
  const extension = path.extname(relativePath).slice(1);

  const fileInfo: FileInfo = {
    _id: id,
    file_path: filePath,
    extension,
    url_path: null,
    filetype: null,
    metadata: {},
    tags: [],
    links: [],
  };

  // Only markdown files are parsed further; everything else is indexed
  // with the bare info above.
  const isExtensionSupported = extension === "md" || extension === "mdx";
  if (!isExtensionSupported) {
    return fileInfo;
  }

  const source: string = fs.readFileSync(filePath, {
    encoding: "utf8",
    flag: "r",
  });

  // Extract frontmatter metadata and links from the markdown source.
  const { metadata, links } = parseFile(source, {
    from: relativePath,
    permalinks: filePathsToIndex,
  });

  fileInfo.url_path = pathToUrlResolver(relativePath);
  fileInfo.metadata = metadata;
  fileInfo.links = links;
  // The "type" frontmatter field designates the file type, when present.
  fileInfo.filetype = metadata?.type || null;
  fileInfo.tags = metadata?.tags || [];

  return fileInfo;
}
Loading