Skip to content

Commit

Permalink
weigit status and checking for file modification and tracking (#1021)
Browse files Browse the repository at this point in the history
* [Fix] Restructure for wgit availability as a package

* Preliminary implementation of wgit status

* [Feat] Addition of wgit status
1. Functionalities to check the status of the repo.
2. Checks whether a file has been modified, whether the changes have been added, and whether added changes have been committed.

* [test] Addition of tests for weigit status
1. Some minor refactors and docstring changes

* [Fix] Changes in repo status test

* [test] status test fix
1. made the test status printing order independent

* [refactor] Metadata dirs mirroring chkpt paths, changes in wgit status
1. Metadata files are now created within wgit with directory structure mirroring the relative paths of the checkpoint/files they track.
2. Changes in status: 3 statuses now.
3. Changes in tests.
4. Some code refactoring.

* [cleanup] minor changes in comments and cleanup
  • Loading branch information
riohib committed Jul 5, 2022
1 parent 775a0f0 commit 5b5db28
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 34 deletions.
3 changes: 2 additions & 1 deletion fairscale/experimental/wgit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ def main(argv: List[str] = None) -> None:

if args.command == "status":
repo = Repo(Path.cwd())
repo.status()
out = repo.status()
print(out)

if args.command == "log":
repo = Repo(Path.cwd())
Expand Down
17 changes: 13 additions & 4 deletions fairscale/experimental/wgit/pygit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
import subprocess
import sys
from typing import List, Tuple
from typing import Dict, List, Tuple

import pygit2

Expand Down Expand Up @@ -123,9 +123,18 @@ def _path(self) -> str:
"""returns the path of the git repository PyGit is wrapped around"""
return self.repo.path

def status(self) -> None:
"""Show the status of the git repo"""
print(self.repo.status())
def status(self) -> Dict:
    """Collect the status of the git repo within wgit, leaving out ignored entries.

    The keys of the returned dictionary are the file paths reported by ``self.repo.status()`` and the
    values are the matching pygit2 integer status codes. Entries whose code equals
    ``pygit2.GIT_STATUS_IGNORED`` are dropped. The status codes are elaborated within PyGit2's
    documentation: https://www.pygit2.org/index_file.html#status and
    https://github.com/libgit2/pygit2/blob/320ee5e733039d4a3cc952b287498dbc5737c353/src/pygit2.c#L312-L320

    Returns: {"relative path to a file" : pygit2 status codes}
    """
    return {
        file_path: code
        for file_path, code in self.repo.status().items()
        if code != pygit2.GIT_STATUS_IGNORED
    }

def _set_author_config(self, name: str, email: str) -> Tuple[str, str]:
"""Set the name and email for the pygit repo collecting from the gitconfig.
Expand Down
110 changes: 97 additions & 13 deletions fairscale/experimental/wgit/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

from enum import Enum
import json
import pathlib
from pathlib import Path
import sys
from typing import Tuple, Union
from typing import Dict, Tuple, Union

from .pygit import PyGit
from .sha1_store import SHA1_store
Expand Down Expand Up @@ -49,7 +50,10 @@ def __init__(self, parent_dir: Union[Path, str] = Path.cwd(), init: bool = False
self._sha1_store = SHA1_store(weigit_dir, init=True)

# # Make the .wgit a git repo
gitignore_files = [self._sha1_store_path.name, self._sha1_store.ref_file_path.name]
gitignore_files = [
self._sha1_store_path.name,
self._sha1_store.ref_file_path.name,
]
self._pygit = PyGit(weigit_dir, gitignore=gitignore_files)

elif exists and init:
Expand Down Expand Up @@ -78,13 +82,14 @@ def add(self, in_file_path: str) -> None:
if self._exists(self.wgit_parent):
# create the corresponding metadata file
file_path = Path(in_file_path)
metadata_file, parent_sha1 = self._process_metadata_file(file_path.name)
rel_file_path = self._rel_file_path(file_path)
metadata_file, parent_sha1 = self._process_metadata_file(rel_file_path)

# add the file to the sha1_store
sha1_hash = self._sha1_store.add(file_path, parent_sha1)

# write metadata to the metadata-file
self._write_metadata(metadata_file, sha1_hash)
self._write_metadata(metadata_file, file_path, sha1_hash)
self._pygit.add() # add to the .wgit/.git repo
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
Expand All @@ -104,10 +109,28 @@ def commit(self, message: str) -> None:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)

def status(self) -> None:
"""Show the state of the working tree."""
def status(self) -> Dict:
"""Show the state of the weigit working tree. State can be
1. dirty with changes/modifications not added to weigit repo,
2. dirty with file changes added but not committed,
3. clean and tracking files after a change has been committed, or clean with an empty repo.
"""
if self._exists(self.wgit_parent):
print("wgit status")
pygit_status = self._pygit.status()
status = self._get_metdata_files()
if status:
out_status = dict()
for metadata_file, is_modified in status.items():
# if metadata_file is among the keys of the pygit_status dict, it has not been committed to git yet.
if is_modified:
out_status[str(metadata_file)] = RepoStatus.CHANGES_NOT_ADDED
elif not is_modified and metadata_file in pygit_status.keys():
out_status[str(metadata_file)] = RepoStatus.CHANGES_ADDED_NOT_COMMITED
elif not is_modified and metadata_file not in pygit_status.keys():
out_status[str(metadata_file)] = RepoStatus.CLEAN
return out_status
else: # if status dict is empty, nothing has been added so far.
return {"": RepoStatus.CLEAN} # sub case of case-3, clean with an empty repo
else:
sys.stderr.write("fatal: no wgit repo exists!\n")
sys.exit(1)
Expand Down Expand Up @@ -153,10 +176,52 @@ def path(self) -> Path:
self._exists(self.wgit_parent)
return self._repo_path

def _process_metadata_file(self, metadata_fname: str) -> Tuple[Path, str]:
"""Create a metadata_file corresponding to the file to be tracked by weigit if the first version of the file is encountered.
If a version already exists, open the file and get the sha1_hash of the last version as parent_sha1"""
def _get_metdata_files(self) -> Dict:
"""Walk the directories that contain the metadata files and check the status of those files,
whether they have been modified or not.
"""
metadata_d = dict()
for file in self.path.iterdir(): # iterate over the .wgit directory
# exlude all the .wgit files and directory
if file.name not in {"sha1_store", "sha1_refs.json", ".git", ".gitignore"}:
# perform a directory walk on the metadata_file directories to find the metadata files
for path in file.rglob("*"):
if path.is_file():
rel_path = str(path.relative_to(self.path)) # metadata path relative to .wgit dir
metadata_d[rel_path] = self._is_file_modified(path)
return metadata_d

def _is_metadata_file(self, file: Path) -> bool:
"""Checks whether a file is a valid metadata file by matching keys and checking if it has valid
json data."""
try:
with open(file) as f:
metadata = json.load(f)
is_metadata = set(metadata.keys()) == {
"SHA1",
"file_path",
"last_modified_time_stamp",
} # TODO: Consider storing the keys as a class attribute, instead of hard coding.
except json.JSONDecodeError:
return False # not a json file, so not valid metadata file
return is_metadata

def _is_file_modified(self, file: Path) -> bool:
"""Checks whether a file has been modified since its last recorded modification time recorded in the metadata_file"""
with open(file) as f:
data = json.load(f)
# get the last modified timestamp recorded by weigit and the current modified timestamp. If not the
# same, then file has been modified since last weigit updated metadata
last_mod_timestamp = data["last_modified_time_stamp"]
curr_mod_timestamp = Path(data["file_path"]).stat().st_mtime
return not curr_mod_timestamp == last_mod_timestamp

def _process_metadata_file(self, metadata_fname: Path) -> Tuple[Path, str]:
"""Create a metadata_file corresponding to the file to be tracked by weigit if the first version of the file
is encountered. If a version already exists, open the file and get the sha1_hash of the last version as parent_sha1"""
metadata_file = self.path.joinpath(metadata_fname)
metadata_file.parent.mkdir(parents=True, exist_ok=True) # create parent dirs for metadata file

if not metadata_file.exists() or not metadata_file.stat().st_size:
metadata_file.touch()
parent_sha1 = "ROOT"
Expand All @@ -166,18 +231,29 @@ def _process_metadata_file(self, metadata_fname: str) -> Tuple[Path, str]:
parent_sha1 = ref_data["SHA1"]["__sha1_full__"]
return metadata_file, parent_sha1

def _write_metadata(self, metadata_file: Path, sha1_hash: str) -> None:
"""Write metadata to the metadata file file"""
change_time = Path(metadata_file).stat().st_ctime
def _write_metadata(self, metadata_file: Path, file_path: Path, sha1_hash: str) -> None:
"""Write metadata to the metadata file"""
change_time = Path(file_path).stat().st_mtime
metadata = {
"SHA1": {
"__sha1_full__": sha1_hash,
},
"file_path": str(file_path),
"last_modified_time_stamp": change_time,
}
with open(metadata_file, "w", encoding="utf-8") as f:
json.dump(metadata, f, ensure_ascii=False, indent=4)

def _rel_file_path(self, filepath: Path) -> Path:
"""Find the relative part to the filepath from the current working directory and return the relative path."""
# get the absolute path
filepath = filepath.resolve()
# using zipped loop we get the path common to the filepath and cwd
for i, (x, y) in enumerate(zip(filepath.parts, Path.cwd().parts)):
pass
# return the relative part (path not common to cwd)
return Path(*filepath.parts[i:])

def _exists(self, check_dir: Path) -> bool:
"""Returns True if a valid wgit exists within the cwd and iteratively checks to the root directory and
sets the self._repo_path attribute to the wgit path.
Expand Down Expand Up @@ -209,3 +285,11 @@ def _weight_repo_file_check(self, check_dir: Path) -> tuple:
git_exists = check_dir.joinpath(".wgit/.git").exists()
gitignore_exists = check_dir.joinpath(".wgit/.gitignore").exists()
return wgit_exists, sha1_refs, git_exists, gitignore_exists


class RepoStatus(Enum):
    """States that a weigit repo status report can assign to a tracked file."""

    # nothing pending: changes are committed, or the repo is empty
    CLEAN = 1
    # the tracked file was modified and the modification is not added yet
    CHANGES_NOT_ADDED = 2
    # the modification was added but is not committed yet
    CHANGES_ADDED_NOT_COMMITED = 3
60 changes: 51 additions & 9 deletions tests/experimental/wgit/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import json
import os
from pathlib import Path
Expand All @@ -13,6 +12,7 @@
import pytest

from fairscale.experimental.wgit import repo as api
from fairscale.experimental.wgit.repo import RepoStatus


@pytest.fixture
Expand All @@ -30,7 +30,7 @@ def create_test_dir():
os.chdir(test_dir)

# create random checkpoints
size_list = [30e5, 35e5, 40e5]
size_list = [30e5, 35e5, 40e5, 40e5]
for i, size in enumerate(size_list):
with open(f"checkpoint_{i}.pt", "wb") as f:
f.write(os.urandom(int(size)))
Expand Down Expand Up @@ -58,10 +58,11 @@ def test_api_init(capsys, repo):
def test_api_add(capsys, repo):
fnum = random.randint(0, 2)
chkpt0 = f"checkpoint_{fnum}.pt"
repo.add(f"checkpoint_{fnum}.pt")

repo.add(chkpt0)
sha1_hash = repo._sha1_store._get_sha1_hash(chkpt0)
with open(os.path.join(".wgit", f"checkpoint_{fnum}.pt"), "r") as f:
metadata_path = repo._rel_file_path(Path(chkpt0))

with open(os.path.join(".wgit", metadata_path), "r") as f:
json_data = json.load(f)

sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}"
Expand All @@ -77,10 +78,51 @@ def test_api_commit(capsys, repo):


def test_api_status(capsys, repo):
repo.status()
captured = capsys.readouterr()
assert captured.out == "wgit status\n"
assert captured.err == ""
# delete the repo and initialize a new one:
shutil.rmtree(".wgit")
repo = api.Repo(Path.cwd(), init=True)

# check status before any file is added
out = repo.status()
assert out == {"": RepoStatus.CLEAN}

# check status after a file is added but not committed
chkpt0 = f"checkpoint_{random.randint(0, 1)}.pt"
repo.add(chkpt0)
out = repo.status()
key_list = list(repo._get_metdata_files().keys())
assert out == {key_list[0]: RepoStatus.CHANGES_ADDED_NOT_COMMITED}

# check status after commit
repo.commit("e1")
out = repo.status()
assert out == {key_list[0]: RepoStatus.CLEAN}

# check status after a new change has been made to the file
with open(chkpt0, "wb") as f:
f.write(os.urandom(int(15e5)))
out = repo.status()
assert out == {key_list[0]: RepoStatus.CHANGES_NOT_ADDED}

# add the new changes made to weigit
repo.add(chkpt0)
out = repo.status()
assert out == {key_list[0]: RepoStatus.CHANGES_ADDED_NOT_COMMITED}

# check status after a new different file is added to be tracked by weigit
chkpt3 = "checkpoint_3.pt"
repo.add(chkpt3)
key_list = list(repo._get_metdata_files().keys())
out = repo.status()
assert out == {
key_list[0]: RepoStatus.CHANGES_ADDED_NOT_COMMITED,
key_list[1]: RepoStatus.CHANGES_ADDED_NOT_COMMITED,
}

# check status after the new file is committed to be tracked by weigit
repo.commit("e2")
out = repo.status()
assert out == {key_list[0]: RepoStatus.CLEAN, key_list[1]: RepoStatus.CLEAN}


def test_api_log(capsys, repo):
Expand Down
7 changes: 2 additions & 5 deletions tests/experimental/wgit/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def create_test_dir():
shutil.rmtree(test_dir)
os.makedirs(test_dir)
os.chdir(test_dir)

# create random checkpoints
size_list = [30e5, 35e5, 40e5]
for i, size in enumerate(size_list):
Expand Down Expand Up @@ -56,10 +55,8 @@ def test_cli_add(capsys):
Path.cwd().joinpath(".wgit"),
init=False,
)

sha1_hash = sha1_store._get_sha1_hash(chkpt0)

with open(os.path.join(".wgit", "checkpoint_0.pt"), "r") as f:
with open(os.path.join(".wgit", "wgit_testing/checkpoint_0.pt"), "r") as f:
json_data = json.load(f)

sha1_dir_0 = f"{sha1_hash[:2]}/" + f"{sha1_hash[2:]}"
Expand All @@ -77,7 +74,7 @@ def test_cli_commit(capsys):
def test_cli_status(capsys):
cli.main(["status"])
captured = capsys.readouterr()
assert captured.out == "wgit status\n"
assert captured.out == "{'wgit_testing/checkpoint_0.pt': <RepoStatus.CLEAN: 1>}\n"
assert captured.err == ""


Expand Down
4 changes: 2 additions & 2 deletions tests/experimental/wgit/test_sha1_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_sha1_add(sha1_configs, sha1_store):
metadata_file, parent_sha1 = repo._process_metadata_file(chkpt1.name)

sha1_hash = sha1_store.add(sha1_configs.checkpoint_1a, parent_sha1)
repo._write_metadata(metadata_file, sha1_hash)
repo._write_metadata(metadata_file, chkpt1, sha1_hash)

# for checkpoint 1
metadata_file = sha1_configs.test_path.joinpath(sha1_configs.checkpoint_1a.name)
Expand All @@ -97,7 +97,7 @@ def test_sha1_refs(sha1_configs, sha1_store):
def add_checkpoint(checkpoint):
metadata_file, parent_sha1 = repo._process_metadata_file(checkpoint.name)
sha1_hash = sha1_store.add(checkpoint, parent_sha1)
repo._write_metadata(metadata_file, sha1_hash)
repo._write_metadata(metadata_file, checkpoint, sha1_hash)
return sha1_hash

with open(sha1_configs.sha1_ref, "r") as file:
Expand Down

0 comments on commit 5b5db28

Please sign in to comment.