Skip to content
Merged
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
17f7b28
fixed merge_extractor method to handle the global flag in middle error.
vaishakhRaveendran Jul 14, 2024
bb1eab4
Update the to_pandas() method to handle For a concise commit message …
vaishakhRaveendran Jul 14, 2024
0cb9f87
Revert "Update the to_pandas() method to handle For a concise commit …
vaishakhRaveendran Jul 14, 2024
244caf4
Update the to_pandas() method to handle unexpected datatypes
vaishakhRaveendran Jul 14, 2024
0e51ce2
Add to_dict() method to QAC class
vaishakhRaveendran Jul 14, 2024
b340e10
Add error handling for abstract questions
vaishakhRaveendran Jul 14, 2024
8792f49
Add error handling for specific questions
vaishakhRaveendran Jul 14, 2024
cb8fd01
New pattern for headline for using with textloader
vaishakhRaveendran Jul 16, 2024
f24361d
update the regular expression
vaishakhRaveendran Jul 17, 2024
cc0218a
remove print statements
vaishakhRaveendran Jul 17, 2024
173c167
Updated regex_based.py from iter-v3 branch
vaishakhRaveendran Jul 18, 2024
fee0556
loader package created
vaishakhRaveendran Jul 22, 2024
96c9edd
create RAGASLoader class
vaishakhRaveendran Jul 22, 2024
1b4e0c9
test script for lazy_aload method
vaishakhRaveendran Jul 22, 2024
357d063
test script for lazy_load method
vaishakhRaveendran Jul 22, 2024
6d5002c
Update test_sync.py
vaishakhRaveendran Jul 22, 2024
a352c62
Merge pull request #1 from vaishakhRaveendran/iter-v3
vaishakhRaveendran Jul 22, 2024
79f5633
Merge pull request #2 from explodinggradients/main
vaishakhRaveendran Jul 22, 2024
c5f0b22
Revert "Iter v3"
vaishakhRaveendran Jul 22, 2024
186755c
Merge pull request #3 from vaishakhRaveendran/revert-1-iter-v3
vaishakhRaveendran Jul 22, 2024
1f35245
Merge pull request #4 from explodinggradients/main
vaishakhRaveendran Jul 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,16 @@ def extract_text(self, text):
)
result = defaultdict(list)
for m in matches:
m = {k: v for k, v in m.groupdict().items() if v is not None}
for key in m:
result[key].append(m[key])

m_dict = {k: v for k, v in m.groupdict().items() if v is not None}
for key, value in m_dict.items():
result[key].append(value)
return result

# Run extraction on `node` (annotated as either a Node or an LCDocument).
# Pure pass-through: forwards `node` to the parent class's extract() and
# returns whatever it returns; this override adds no logic of its own.
def extract(self, node: t.Union[Node, LCDocument]) -> t.Any:
return super().extract(node)

def merge_extractors(self, *extractors) -> t.List[Extractor]:
if isinstance(
self, RulebasedExtractor
): # Check if called by an initiated class
if isinstance(self, RulebasedExtractor):
extractors = (self,) + extractors

assert all(
Expand All @@ -69,13 +66,28 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]:
added_indices.append(extractors.index(ext))

extractors_to_return = []
for extractors in final_extractors:
for group_index, extractors in enumerate(final_extractors):
if len(extractors) > 1:
pattern = "|".join([extractor.pattern for extractor in extractors])
updated_regex = Regex(name="merged_extractor", pattern=pattern)
# Process each pattern individually
processed_patterns = []
for extractor in extractors:
pattern = extractor.pattern
# Extract flags from the beginning of the pattern
flags = ""
if pattern.startswith("(?"):
flag_end = pattern.index(")")
flags = pattern[2:flag_end]
pattern = pattern[flag_end + 1:]
# Wrap the pattern in a non-capturing group with flags
processed_patterns.append(f"(?{flags}:{pattern})")

# Join all processed patterns
merged_pattern = "|".join(processed_patterns)

updated_regex = Regex(name="merged_extractor", pattern=merged_pattern)
else:
pattern = extractors[0].pattern
updated_regex = extractors[0].regex

extractors_to_return.append(
RulebasedExtractor(
attribute=extractors[0].attribute,
Expand All @@ -85,10 +97,9 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]:
)
return extractors_to_return


links_extractor_pattern = r"(?i)\b(?:https?://|www\.)\S+\b"
emails_extractor_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
markdown_headings = r"^(#{1,6})\s+(.*)"
markdown_headings_pattern = r"^(#{1,6})\s+(.*)"

email_extractor = RulebasedExtractor(
regex=Regex(name="email", pattern=emails_extractor_pattern)
Expand All @@ -97,5 +108,6 @@ def merge_extractors(self, *extractors) -> t.List[Extractor]:
regex=Regex(name="link", pattern=links_extractor_pattern)
)
markdown_headings = RulebasedExtractor(
regex=Regex(name="markdown_headings", pattern=markdown_headings), is_multiline=True
regex=Regex(name="markdown_headings", pattern=markdown_headings_pattern),
is_multiline=True
)