Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update parse_user_storage_data to use backend NCI API #23

Merged
merged 8 commits on Nov 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS
Expand Up @@ -2,3 +2,4 @@ Aidan Heerdegen <aidan.heerdegen@anu.edu.au>
Aidan Heerdegen <aidan.heerdegen@gmail.com>
Scott Wales <scott.wales@unimelb.edu.au>
Scott Wales <scottwales@outlook.com.au>
Dale Roberts <dsroberts@unimelb.edu.au>
11 changes: 9 additions & 2 deletions ChangeLog
@@ -1,8 +1,15 @@
CHANGES
=======

* Altered test data to match change in scratch accounting
* Swap project and folder codes to account for scratch accounting change
* Handle uids and gids that do not correspond to users/groups on the system
* Use nci-files-report --json
* Update README info
* Remove travis CI config. Add Github Actions status badge

0.3.3
-----

* Swap project and folder codes to account for scratch accounting change (#22)

0.3.2
-----
Expand Down
103 changes: 50 additions & 53 deletions ncigrafana/parse_user_storage_data.py
Expand Up @@ -21,23 +21,23 @@
from __future__ import print_function

import argparse
import pwd
import datetime
import json
import os
import sys
import re
import shutil
import pwd
import grp
import datetime

from .UsageDataset import *
from .DBcommon import extract_num_unit, parse_size, mkdir, archive
from .DBcommon import date_range_from_quarter, datetoyearquarter
from .DBcommon import date_range_from_quarter, datetoyearquarter, archive

databases = {}
dbfileprefix = '.'

def parse_file_report(filename, verbose, db=None, dburl=None):

# Filename contains project and storage point information
(timestamp, project, storagepoint, tmp) = os.path.basename(filename).split('.')
(_, _, storagepoint, _) = os.path.basename(filename).split('.')

# Hard code the system based on storagepoint as this information
# does not exist in the dumpfile. Not even sure NCI make this distinction
Expand All @@ -48,60 +48,58 @@ def parse_file_report(filename, verbose, db=None, dburl=None):
system = 'gadi'

with open(filename) as f:

print("Parsing {file}".format(file=filename))

parsing_usage = False

for line in f:
if verbose: print("> ",line)
if line.startswith("%%%%%%%%%%%%%%%%"):
# Grab date string
date = datetime.datetime.strptime(f.readline().strip(os.linesep),
"%a %b %d %H:%M:%S %Z %Y").date()
year, quarter = datetoyearquarter(date)
startdate, enddate = date_range_from_quarter(year,quarter)
db.addquarter(year, quarter, startdate, enddate)
parsing_usage = True
# Gobble header line
line = f.readline()
continue

if parsing_usage:
try:
(filesystem,scandate,folder,proj,user,size,filesize,inodes) = line.strip(os.linesep).split()
except:
if verbose: print('Finished parsing usage')
parsing_usage = False
continue
db.adduser(user)
if storagepoint == 'scratch':
# Swap folder and proj in the case of scratch as it is now accounted for by
# location, so folder never changes but project code can and subsequent entries
# overwrite previous ones unless values of folder and proj are swapped
folder, proj = proj, folder
if verbose: print('Adding ', project, user, system, storagepoint, str(date), folder,
parse_size(size.upper(), u='', pre='BKMGTPEZY'), inodes)
db.adduserstorage(project,
user,
system,
storagepoint,
str(date),
folder,
parse_size(size.upper(), u='', pre='BKMGTPEZY'),
inodes)
all_data=json.loads(f.read())

### Grab timestamp - pretend there are no cross-quarter entries
datestamp = datetime.datetime.fromisoformat(all_data[0]["scan_time"])
year, quarter = datetoyearquarter(datestamp)
startdate, enddate = date_range_from_quarter(year,quarter)
db.addquarter(year,quarter,startdate,enddate)

for entry in all_data:
### Handle uids that don't exist
try:
user = pwd.getpwuid(entry['uid']).pw_name
except KeyError:
user = str(entry['uid'])
db.adduser(user)

if storagepoint == 'scratch':
# Swap folder and proj in the case of scratch as it is now accounted for by
# location, so folder never changes but project code can and subsequent entries
# overwrite previous ones unless values of folder and proj are swapped
### Handle gids that don't exist
try:
folder=grp.getgrgid(entry['gid']).gr_name
except KeyError:
folder=str(entry['gid'])
project=entry['project']
else:
folder=entry['project']
### Handle gids that don't exist
try:
project=grp.getgrgid(entry['gid']).gr_name
except KeyError:
project=str(entry['gid'])

### Derived from nci-files-report client (formatters/table.py)
size = 512 * int(entry['blocks']['single'] + entry['blocks']['multiple'])
inodes = int(entry['count']['single'] + entry['count']['multiple'])

if verbose:
### Date comes out in iso format, first 10 characters will be YYYY-MM-DD
print(f"Adding {project}, {user}, {system}, {storagepoint}, {entry['scan_time'][:10]}, {folder}, {size}, {inodes}")
db.adduserstorage(project,user,system,storagepoint,entry['scan_time'][:10],folder,size,inodes)

def main(args):

verbose = args.verbose

db = None
if args.dburl:
db = ProjectDataset(dburl=args.dburl)

for f in args.inputs:
try:
parse_file_report(f, verbose, db=db)
parse_file_report(f,args.verbose,db=db)
except:
raise
else:
Expand Down Expand Up @@ -138,4 +136,3 @@ def main_argv():
if __name__ == "__main__":

main_argv()

2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -4,4 +4,4 @@ numpy
dataset
sqlalchemy
Psycopg2
pytest
pytest
1 change: 1 addition & 0 deletions test/2022-11-02T11:36:45.w40.gdata.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/2022-11-02T11:36:45.w40.scratch.json

Large diffs are not rendered by default.

138 changes: 123 additions & 15 deletions test/test_parse_user_storage.py
Expand Up @@ -10,6 +10,7 @@
import pytest
import sys
import time
import grp

from ncigrafana.UsageDataset import *
from ncigrafana.DBcommon import datetoyearquarter
Expand All @@ -31,8 +32,8 @@ def db():

def test_parse_lquota(db):

parse_file_report('test/2020-04-16T08:34:58.w35.scratch.dump', verbose=verbose, db=db)
parse_file_report('test/2020-04-16T08:34:58.w35.gdata.dump', verbose=verbose, db=db)
parse_file_report('test/2022-11-02T11:36:45.w40.scratch.json', verbose=verbose, db=db)
parse_file_report('test/2022-11-02T11:36:45.w40.gdata.json', verbose=verbose, db=db)

def test_getstoragepoints(db):

Expand All @@ -46,24 +47,131 @@ def test_getstoragepoints(db):

def test_getstorage(db):

project = 'w35'
year = 2020
quarter = 'q2'
scratch_project = 'w40'
gdata_project = 'w40'
gid_available=True
try:
_ = grp.getgrnam(gdata_project)
except KeyError:
gid_available = False
gdata_project = '5653'
year = 2022
quarter = 'q4'
system = 'gadi'
storagepoint = 'scratch'
dp = db.getstorage(project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 17)
assert((dp.iloc[0,:].values == [837124096., 654982688768., 5983174656.,
81819126988.8000030517578125, 1897922560.,
40531821149388.796875]).all())
dp = db.getstorage(scratch_project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 33)
#assert((dp.iloc[0,:].values == [837124096., 654982688768., 5983174656.,
# 81819126988.8000030517578125, 1897922560.,
# 40531821149388.796875]).all())
### if gid_available is false, the ordering of this array will change
if gid_available:
assert((dp.iloc[0,:].values == [ 8192, 15037362176, 4096, 8192,
6455296, 8192, 12555550720, 61440,
8192, 8192, 8192, 8192,
8192, 4096, 8192, 66303352832,
8192, 425984, 401322278912, 8192,
454656, 12024569856, 8192, 8192,
98304, 8192, 6972870656, 1216239837184,
8192, 8192, 8192, 13619200,
11571253248, 8192, 24576, 8192,
8192, 12288, 8192, 8192,
8192, 8192, 12288, 983040,
8192, 9566094172160, 6467893768192, 12288,
509981904896, 8192, 8192, 8192,
28672, 12288, 8192, 8192,
8192, 8192, 8192, 8192,
20480, 12288, 3584000, 8192,
8192, 8192, 8192, 8192,
8192, 8192, 8192, 159137792,
16384, 8192, 121728925696, 215461888,
8192, 16384, 2867015331840, 4096,
8192, 8192, 40960, 8192,
698904379392, 165306880000, 8192, 8192,
28672, 8192, 8192, 2284138496,
32768, 12288, 8192, 8192,
2060288, 8192, 1274325000192, 8192,
8192, 8192, 135168, 8192,
8192, 65536, 288980849664, 8192,
8192, 8192, 24576, 8192,
8192, 29022916608, 8192, 64513818624,
454656, 8192, 8192, 8192,
8192, 10104832, 1819267072]).all())
else:
assert((dp.iloc[0,:].values == [ 8192, 8192, 2867015331840, 454656,
454656, 4096, 8192, 8192,
16384, 8192, 8192, 8192,
8192, 8192, 8192, 8192,
8192, 8192, 8192, 15037362176,
8192, 11571253248, 8192, 425984,
8192, 12288, 8192, 8192,
9566094172160, 1819267072, 8192, 20480,
65536, 12288, 8192, 24576,
8192, 8192, 66303352832, 8192,
8192, 10104832, 1216239837184, 1274325000192,
8192, 98304, 3584000, 8192,
8192, 8192, 8192, 215461888,
165306880000, 8192, 8192, 8192,
29022916608, 12288, 983040, 32768,
8192, 40960, 12288, 8192,
16384, 8192, 2284138496, 61440,
12024569856, 6972870656, 8192, 24576,
8192, 698904379392, 8192, 28672,
121728925696, 8192, 12288, 64513818624,
6467893768192, 159137792, 8192, 8192,
8192, 8192, 8192, 8192,
2060288, 8192, 8192, 8192,
8192, 8192, 13619200, 509981904896,
6455296, 8192, 8192, 8192,
4096, 8192, 8192, 8192,
8192, 8192, 8192, 4096,
8192, 8192, 8192, 8192,
8192, 12288, 8192, 12555550720,
8192, 8192, 401322278912, 135168,
28672, 8192, 288980849664]).all())

system = 'global'
storagepoint = 'gdata'
dp = db.getstorage(project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 17)
assert((dp.iloc[0,:].values == [4209602560., 1891963632025.60009765625, 2846720.,
101591390617.5999908447265625, 1329627922432.,
7364434976768.]).all())
dp = db.getstorage(gdata_project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 32)
#assert((dp.iloc[0,:].values == [4209602560., 1891963632025.60009765625, 2846720.,
# 101591390617.5999908447265625, 1329627922432.,
# 7364434976768.]).all())
### if gid_available is false, the ordering of this array will change
if gid_available:
assert((dp.iloc[0,:].values == [ 378253836288, 139264, 19640320, 1785151488,
8122368, 77824, 227143680, 4096,
4096, 6244943798272, 34355089817600, 4096,
338371608576, 2428928, 9422868480, 24785586880512,
4096, 4096, 5369627049984, 12756226048,
620734619648, 1225300582400, 195948544, 31176721473536,
8375311802368, 919977984, 45187072, 561438191616,
2595426304, 197181440, 832443146240, 1370082246656,
5154693120, 40789091491840, 234704838656, 73728,
5154811904, 146066354176, 9107041492992, 1369709383680,
192512, 2779975680, 3248198533120, 1551341441024,
1953792, 22146326528, 15386636288, 4096,
704050032640, 4243776442368, 93698002944, 8605416587264,
814183866368, 28672, 52714979328, 84480143360,
15162470400, 185345052672, 65202511872, 342954827776,
4096, 268537856]).all())
else:
assert((dp.iloc[0,:].values == [ 9107041492992, 4096, 1369709383680, 1225300582400,
146066354176, 77824, 268537856, 4096,
8122368, 139264, 227143680, 4096,
12756226048, 6244943798272, 5154693120, 45187072,
2428928, 31176721473536, 378253836288, 1370082246656,
620734619648, 814183866368, 24785586880512, 8605416587264,
197181440, 338371608576, 40789091491840, 3248198533120,
185345052672, 2779975680, 4096, 832443146240,
65202511872, 704050032640, 195948544, 15386636288,
192512, 2595426304, 5154811904, 22146326528,
9422868480, 561438191616, 15162470400, 1551341441024,
234704838656, 342954827776, 73728, 93698002944,
1953792, 28672, 4243776442368, 5369627049984,
919977984, 19640320, 84480143360, 4096,
4096, 8375311802368, 4096, 1785151488,
34355089817600, 52714979328]).all())

# import pytest
# pytest.set_trace()
Expand Down