feat: add consistency check tools
chaen committed Dec 20, 2023
1 parent c619b45 commit d9083c3
Showing 2 changed files with 277 additions and 0 deletions.
126 changes: 126 additions & 0 deletions consistency_check/README.md
# Consistency check

This script helps you compare StorageElement (SE) dumps with DIRAC File Catalog (DFC) dumps.

## What you need

### SE definitions

A CSV file containing, for each SE, its name and base path, like:

```
CSCS-DST;/pnfs/lcg.cscs.ch/lhcb
CSCS_MC-DST;/pnfs/lcg.cscs.ch/lhcb
```

You can obtain it with something like:

```python
from DIRAC import initialize
initialize()
from DIRAC import gConfig
from DIRAC.Resources.Storage.StorageElement import StorageElement

for se in gConfig.getSections("/Resources/StorageElements")["Value"]:
    # Print "seName;basePath" for each configured StorageElement
    print(f"{se};{list(StorageElement(se).storages.values())[0].basePath}")
```
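
Assuming you save the snippet above as, say, `dump_se_definitions.py` (the name is hypothetical), you can redirect its output straight into the CSV file:

```
python dump_se_definitions.py > se_definitions.csv
```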

### StorageElement dump

This is typically provided by the site; we expect just a flat list of the file paths:

```
/pnfs/lcg.cscs.ch/lhcb/generated/2013-07-07/fileeed071eb-1aa0-4d00-8775-79624737224e
/pnfs/lcg.cscs.ch/lhcb/generated/2013-07-10/fileed08b040-196c-46d9-b4d6-37d80cba27eb
/pnfs/lcg.cscs.ch/lhcb/lhcb/test/SAM/testfile-put-LHCb-Disk-1494915199-61e6d085bb84.txt
```

### Catalog dump(s)

Ideally, you should have two catalog dumps for the SE you are concerned about: one taken before the SE dump and one taken after. Having only one of the two allows only a partial comparison.

You can get one with a script like:

```python
import sys
from datetime import datetime, timezone
from DIRAC import initialize
initialize()
from DIRAC import gConfig
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
dfc = FileCatalogClient()

# Something like LCG.CERN.ch
site_name = sys.argv[1]

ses = gConfig.getOption(f"/Resources/Sites/{site_name.split('.')[0]}/{site_name}/SE", [])["Value"]

timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
output_file = f"{site_name}_dfc_{timestamp}.dump"
print(f"Getting FC dump for {ses} in {output_file}")
res = dfc.getSEDump(ses, output_file)
print(res)
```
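
Assuming the snippet is saved as, say, `dump_dfc_site.py` (the name is hypothetical), run it with the site name as its only argument:

```
python dump_dfc_site.py LCG.CERN.ch
```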


Or, starting from a `BaseSE`:

```python
#!/usr/bin/env python3

import sys
from datetime import datetime, timezone
from DIRAC import initialize
initialize()
from DIRAC import gConfig
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
dfc = FileCatalogClient()

# Something like RAL-ECHO
base_se_name = sys.argv[1]

ses = []
ses_data = gConfig.getOptionsDictRecursively("/Resources/StorageElements")["Value"]
for key, val in ses_data.items():
    # Keep every SE whose BaseSE option points to the requested base SE
    try:
        if val["BaseSE"] == base_se_name:
            ses.append(key)
    except (KeyError, TypeError):
        pass

timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
output_file = f"{base_se_name}_dfc_{timestamp}.dump"
print(f"Getting FC dump for {ses} in {output_file}")
res = dfc.getSEDump(ses, output_file)
print(res)
```

## How it works

We look at the differences and the intersections between the old catalog dump, the new catalog dump, and the storage element dump.

For example, you find dark data by looking at files that are in the SE dump but not in either catalog dump. Lost data is data that is in both catalog dumps but not in the SE dump.


| Old FC | New FC | SE | Status |
|--------|--------|----|------------------|
| 0 | 0 | 1 | Dark data |
| 0 | 1 | 0 | Very new |
| 0 | 1 | 1 | New |
| 1 | 0 | 0 | Deleted |
| 1 | 0 | 1 | Recently deleted |
| 1 | 1 | 0 | Lost file |
| 1 | 1 | 1 | OK |
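
To make the table concrete, here is a minimal sketch (not part of the tool, and using plain Python sets rather than the pandas indexes the script relies on) of how each status maps to set operations:

```python
# Minimal sketch: classify PFNs with plain Python sets.
# The three sets below are toy stand-ins for the real dumps.
old_fc = {"/a", "/b", "/c", "/e"}
new_fc = {"/b", "/c", "/d", "/f"}
se = {"/b", "/d", "/e", "/g"}

dark = se - (old_fc | new_fc)        # 0 0 1: on storage, unknown to both catalogs
very_new = new_fc - old_fc - se      # 0 1 0: registered after the SE dump was taken
lost = (old_fc & new_fc) - se        # 1 1 0: in both catalogs, missing on storage
ok = old_fc & new_fc & se            # 1 1 1: consistent everywhere

print(dark, very_new, lost, ok)      # {'/g'} {'/f'} {'/c'} {'/b'}
```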

## How to use

Although you probably need DIRAC to get the DFC dump or the SE config, you do not need DIRAC installed once you have all the CSV files.
You will, however, need `pandas` and `typer`.


The `consistency` script has 3 commands:
* `threeway`: does a full comparison of one old DFC dump, one SE dump, and one new DFC dump. Results are as good as it gets (see the example invocation below).
* `possibly-dark-data`: tries to find dark data, but be careful with the result (see `--help`).
* `possibly-lost-data`: tries to find lost data, but be careful with the result (see `--help`).
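
For example, a `threeway` run could look like this (the file names are hypothetical; the option names follow typer's convention of turning parameter names into dashed flags):

```
./consistency.py threeway \
    --old-fc-dump-file CSCS_dfc_20231201.dump \
    --new-fc-dump-file CSCS_dfc_20231215.dump \
    --se-def-file se_definitions.csv \
    --se-dump-file CSCS_se.dump
```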

In any case, you should check the output with commands like `dirac-dms-replica-stats` or `dirac-dms-pfn-exists`.
151 changes: 151 additions & 0 deletions consistency_check/consistency.py
#!/usr/bin/env python
import pandas as pd
import typer
from pathlib import Path
from typer import colors
from typing import Annotated


RED = colors.RED
GREEN = colors.GREEN

app = typer.Typer()


def load_se_definition(se_def_path):
    # CSV with "seName;basePath" lines, indexed by SE name
    return pd.read_csv(se_def_path, names=["seName", "basePath"], delimiter=";", index_col="seName")


def load_dfc_dump(dfc_dump_path, version):
    # DFC dump with "seName|lfn|cks|size" lines; tag each row with its origin
    fc_dump = pd.read_csv(dfc_dump_path, names=["seName", "lfn", "cks", "size"], delimiter="|")
    fc_dump["version"] = version
    return fc_dump


def load_se_dump(se_dump_path):
    # SE dump: a flat list of PFNs, indexed by PFN
    se_dump = pd.read_csv(se_dump_path, names=["pfn"], delimiter=";", index_col="pfn")
    se_dump["version"] = "se_dump"
    return se_dump


@app.command()
def possibly_lost_data(
fc_dump_file: Annotated[Path, typer.Option(help="DFC dump AFTER the SE dump")],
se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
lost_file_output: Annotated[Path, typer.Option(help="Output file in which to dump lost")] = "lost.csv",
):
"""
DANGER: make a partial comparison of an SE dump and an FC dump to find lost data
Be careful because you can't trust the result:
* if the FC dump is more recent than the SE dump, you may get files that were added on the SE after the dump
* if the FC dump is older than the SE dump, the file may have been purposedly removed
"""
se_dump = load_se_dump(se_dump_file)
se_def = load_se_definition(se_def_file)

# Compute the PFN for each LFN in the DFC dump

fc_dump = load_dfc_dump(fc_dump_file, "fc")
fc_dump = pd.merge(fc_dump, se_def, on="seName")
fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
fc_dump.set_index("pfn", inplace=True)

    # Lost files: in the FC dump but not in the SE

    lostData = fc_dump.index.difference(se_dump.index)
    if len(lostData):
        typer.secho(f"Found {len(lostData)} lost files, dumping them in {lost_file_output}", err=True, fg=RED)
        lostDataDetail = fc_dump[fc_dump.index.isin(lostData)]
        lostDataDetail.to_csv(lost_file_output)
    else:
        typer.secho("No lost files found", fg=GREEN)


@app.command()
def possibly_dark_data(
fc_dump_file: Annotated[Path, typer.Option(help="DFC dump")],
se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
dark_file_output: Annotated[Path, typer.Option(help="Output file in which to dump dark data")] = "dark.csv",
):
"""
DANGER: make a partial comparison of an SE dump and an FC dump to find dark data.
Be careful because you can't trust the result:
* if the FC dump is more recent than the SE dump, you may get files that were already removed
* if the FC dump is older than the SE dump, you may find files that were added properly after the dump (DANGER)
"""
se_dump = load_se_dump(se_dump_file)
se_def = load_se_definition(se_def_file)

# Compute the PFN for each LFN in the DFC dump

fc_dump = load_dfc_dump(fc_dump_file, "fc")
fc_dump = pd.merge(fc_dump, se_def, on="seName")
fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
fc_dump.set_index("pfn", inplace=True)

    # Dark data: in the SE dump but not in the FC dump

    typer.echo("Computing dark data")
    # Find the dark data
    darkData = se_dump.index.difference(fc_dump.index)

if len(darkData):
typer.secho(f"Found {len(darkData)} dark data, dumping them in {dark_file_output}", err=True, fg=RED)
pd.DataFrame(index=darkData).to_csv(dark_file_output)
else:
typer.secho("No dark data found", fg=GREEN)


@app.command()
def threeway(
old_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump BEFORE the SE dump")],
new_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump AFTER the SE dump")],
se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
lost_file_output: Annotated[Path, typer.Option(help="Output file in which to dump lost files")] = "lost.csv",
dark_file_output: Annotated[Path, typer.Option(help="Output file in which to dump dark data")] = "dark.csv",
):
"""
Make a full comparison of two FC dump and one SE dump
"""
se_dump = load_se_dump(se_dump_file)
se_def = load_se_definition(se_def_file)

# Compute the PFN for each LFN in the DFC dump
old_fc_dump = load_dfc_dump(old_fc_dump_file, "old_fc")
old_fc_dump = pd.merge(old_fc_dump, se_def, on="seName")
old_fc_dump["pfn"] = old_fc_dump["basePath"] + old_fc_dump["lfn"]
old_fc_dump.set_index("pfn", inplace=True)

new_fc_dump = load_dfc_dump(new_fc_dump_file, "new_fc")
new_fc_dump = pd.merge(new_fc_dump, se_def, on="seName")
new_fc_dump["pfn"] = new_fc_dump["basePath"] + new_fc_dump["lfn"]
new_fc_dump.set_index("pfn", inplace=True)

    # Dark data: in the SE dump but not in either FC dump

    typer.echo("Computing dark data")
    # Find the dark data
    darkData = se_dump.index.difference(old_fc_dump.index.union(new_fc_dump.index))

if len(darkData):
typer.secho(f"Found {len(darkData)} dark data, dumping them in {dark_file_output}", err=True, fg=RED)
pd.DataFrame(index=darkData).to_csv(dark_file_output)
else:
typer.secho("No dark data found", fg=GREEN)

    # Lost files: in both FC dumps but not in the SE

    lostData = (old_fc_dump.index.intersection(new_fc_dump.index)).difference(se_dump.index)
    if len(lostData):
        typer.secho(f"Found {len(lostData)} lost files, dumping them in {lost_file_output}", err=True, fg=RED)
        lostDataDetail = new_fc_dump[new_fc_dump.index.isin(lostData)]
        lostDataDetail.to_csv(lost_file_output)
    else:
        typer.secho("No lost files found", fg=GREEN)


if __name__ == "__main__":
app()
