Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add support for wide characters when building index of dataset files #728

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions metaseq/data/jsonl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# LICENSE file in the root directory of this source tree.

import argparse
from io import TextIOWrapper
import json
import logging
import mmap
Expand Down Expand Up @@ -132,22 +133,17 @@ def _get_subshard_id(self):
# and then wraps around if the epoch id goes beyond the data_subshard_count
return (self.epoch - 1) % self.data_subshard_count

def _build_index(self, path: str):
def _build_index(self, file_path: str):
"""Build index of start positions of each line."""
logger.info(f"Building index for file: {path}")
f = self._get_mmap()
f.seek(0)
offsets = []
cur = 0
line_num = 0
while True:
line = f.readline()
if line == b"":
break
offsets.append(cur)
cur += len(line)
mattmazzola marked this conversation as resolved.
Show resolved Hide resolved
line_num += 1
return offsets
logger.info(f"Building index for file: {file_path}")
file: TextIOWrapper = self._get_mmap()

offsets = [0]
for _ in iter(file.readline, b""):
offsets.append(file.tell())

# return all offsets except the last one, which is the end of the file
return offsets[:-1]

def __setstate__(self, state):
self.__dict__ = state
Expand Down
Loading