Skip to content

Commit

Permalink
Merge pull request #57 from datopian/body-tags
Browse files (browse the repository at this point in the history)
[#49, Add body tags extraction]
  • Loading branch information
mohamedsalem401 committed Nov 17, 2023
2 parents fae96aa + fed0e28 commit 30106b7
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 0 deletions.
149 changes: 149 additions & 0 deletions src/utils/extractTagsFromBody.spec.ts
@@ -0,0 +1,149 @@
import { extractTagsFromBody } from "./extractTagsFromBody";

// Table-driven suite for extractTagsFromBody: each row is registered as its
// own Jest test, in the order listed, and checks the returned tag list
// (order-sensitive, duplicates included) against `expected`.
describe("extractTagsFromBody", () => {
  type TagCase = {
    /** Test title shown by the runner. */
    title: string;
    /** Markdown text handed to extractTagsFromBody. */
    source: string;
    /** Tags expected back, without the leading '#'. */
    expected: string[];
  };

  const cases: TagCase[] = [
    // --- single-line input -------------------------------------------------
    {
      title: "should extract tags from body",
      source: "#tag",
      expected: ["tag"],
    },
    {
      title: "should extract tags from heading",
      source: "# heading #tag",
      expected: ["tag"],
    },
    {
      title: "should extract 2 tags from heading",
      source: "# heading #tag #tag2",
      expected: ["tag", "tag2"],
    },
    {
      title: "should extract tags from body text",
      source: "This is a #tag in the body text.",
      expected: ["tag"],
    },
    {
      title: "should extract 2 tags from body text",
      source: "This is #tag1 and #tag2 in the body text.",
      expected: ["tag1", "tag2"],
    },
    {
      title: "should extract tags from both heading and body text",
      source: `# head #tag
in heading and also in the #tag-body body text.`,
      expected: ["tag", "tag-body"],
    },
    {
      title: "should extract tags with numbers",
      source: "This is #tag123 with numbers.",
      expected: ["tag123"],
    },
    {
      title: "should extract tags with special characters",
      source: "This is #special-tag #special_tag2 with special characters.",
      expected: ["special-tag", "special_tag2"],
    },
    {
      // The sentence-ending '.' is not part of the tag.
      title: "should extract tags with slash",
      source: "This is #tag/with/slash.",
      expected: ["tag/with/slash"],
    },
    {
      title: "should extract tags with multiple tags in a line",
      source: "#tag1 #tag2 #tag3",
      expected: ["tag1", "tag2", "tag3"],
    },

    // --- multi-line input --------------------------------------------------
    {
      // for now we will pass the body content only not the whole source
      // (so, despite the title, this input has no frontmatter: the case only
      // checks that the body tag is found).
      title: "shouldn't extract frontmatter tags",
      source: `
No tags in this content.
#gr3
`,
      expected: ["gr3"],
    },
    {
      title: "should extract tags from multiline text",
      source: `This is a multiline text with #tag1 and #tag2.
Multiple tags on different lines:
#tag3
#tag4
And another tag: #tag5.
`,
      expected: ["tag1", "tag2", "tag3", "tag4", "tag5"],
    },
    {
      title: "should handle multiple tags in the same line",
      source: `#tag1 #tag2 #tag3
#tag4 #tag5`,
      expected: ["tag1", "tag2", "tag3", "tag4", "tag5"],
    },
    {
      title: "should handle tags with numbers and slashes in multiline text",
      source: `Tags with numbers: #tag123 and #tag456.
Tags with slashes: #tag/one and #tag/two/three.
`,
      expected: ["tag123", "tag456", "tag/one", "tag/two/three"],
    },
    {
      // '$' is not a tag character, so "#tag$percent" yields just "tag".
      title: "should handle tags with special characters in multiline text",
      source: `Tags with special characters: #special-tag and #tag$percent.
Another tag: #tag_with_underscore.
`,
      expected: ["special-tag", "tag", "tag_with_underscore"],
    },

    // --- inputs without any tag --------------------------------------------
    {
      title: "should handle edge case with no tags in multiline text",
      source: `No tags in this multiline content.
Another line without tags.
`,
      expected: [],
    },
    {
      title: "should handle edge case with no tags",
      source: "No tags in this content.",
      expected: [],
    },
  ];

  for (const { title, source, expected } of cases) {
    test(title, () => {
      expect(extractTagsFromBody(source)).toEqual(expected);
    });
  }
});
29 changes: 29 additions & 0 deletions src/utils/extractTagsFromBody.ts
@@ -0,0 +1,29 @@
import markdown from "remark-parse";
import { unified } from "unified";
import { selectAll, Node } from "unist-util-select";

/**
 * String-keyed dictionary of tag-extractor callbacks.
 *
 * NOTE(review): nothing visible in this file consumes this interface, so the
 * meaning of the key (named `test`) is not established here — presumably a
 * node test/selector that decides which callback applies; confirm against
 * callers.
 */
export interface TagExtractors {
/** Callback that receives a syntax-tree node and returns a list of strings (presumably tag names). */
[test: string]: (node: Node) => string[];
}

/**
 * Matches one `#tag` occurrence: a `#` at the very start of the text or right
 * after a whitespace character, followed by one or more word characters
 * (letters, digits, `_`), `/` or `-`. The match includes the leading
 * whitespace (if any) and the `#`; both are stripped by the caller.
 *
 * Used only with `String.prototype.match`, which restarts a global regex from
 * index 0 on every call, so sharing one instance is safe.
 */
const TAG_PATTERN = /(?:^|\s)#[\w/-]+/g;

/**
 * mdast node types whose `value` is literal code. A `#` inside them is part of
 * the code (e.g. `#include`, `#define`, a shell comment), not a tag marker.
 */
const CODE_NODE_TYPES: ReadonlySet<string> = new Set(["code", "inlineCode"]);

/** Returns the node's `value` when it is a string, otherwise `undefined`. */
const literalValueOf = (node: unknown): string | undefined => {
  const value = (node as { value?: unknown } | null | undefined)?.value;
  return typeof value === "string" ? value : undefined;
};

/**
 * Extracts `#tag`-style tags from a markdown body.
 *
 * The source is parsed with remark and every node that carries a string
 * `value` is scanned, except fenced/indented code blocks and inline code.
 *
 * @param source - Markdown text to scan. The spec passes body content only,
 *   not a whole file including frontmatter.
 * @returns Tag names without the leading `#`, in the order the nodes are
 *   yielded by `selectAll` and left-to-right within each node. Duplicates are
 *   kept. Returns an empty array when no tag is found.
 */
const extractTagsFromBody = (source: string): string[] => {
  const ast = unified().use(markdown).parse(source);
  const tags: string[] = [];

  for (const node of selectAll("*", ast)) {
    // Code is not prose: skip it so "#include" and friends are not reported.
    if (CODE_NODE_TYPES.has(node.type)) continue;

    const text = literalValueOf(node);
    if (!text) continue; // parent nodes (no value) and empty literals

    const matches = text.match(TAG_PATTERN);
    if (matches === null) continue;

    for (const match of matches) {
      // Drop the optional leading whitespace, then the '#'.
      tags.push(match.trim().slice(1));
    }
  }

  return tags;
};

export { extractTagsFromBody };

0 comments on commit 30106b7

Please sign in to comment.