Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update parse_user_storage_data to use backend NCI API #23

Merged
merged 8 commits on Nov 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS
Expand Up @@ -2,3 +2,4 @@ Aidan Heerdegen <aidan.heerdegen@anu.edu.au>
Aidan Heerdegen <aidan.heerdegen@gmail.com>
Scott Wales <scott.wales@unimelb.edu.au>
Scott Wales <scottwales@outlook.com.au>
Dale Roberts <dsroberts@unimelb.edu.au>
11 changes: 9 additions & 2 deletions ChangeLog
@@ -1,8 +1,15 @@
CHANGES
=======

* Altered test data to match change in scratch accounting
* Swap project and folder codes to account for scratch accounting change
* Handle uids and gids that do not correspond to users/groups on the system
* Use nci-files-report --json
* Update README info
* Remove travis CI config. Add Github Actions status badge

0.3.3
-----

* Swap project and folder codes to account for scratch accounting change (#22)

0.3.2
-----
Expand Down
103 changes: 50 additions & 53 deletions ncigrafana/parse_user_storage_data.py
Expand Up @@ -21,23 +21,23 @@
from __future__ import print_function

import argparse
import pwd
import datetime
import json
import os
import sys
import re
import shutil
import pwd
import grp
import datetime

from .UsageDataset import *
from .DBcommon import extract_num_unit, parse_size, mkdir, archive
from .DBcommon import date_range_from_quarter, datetoyearquarter
from .DBcommon import date_range_from_quarter, datetoyearquarter, archive

databases = {}
dbfileprefix = '.'

def parse_file_report(filename, verbose, db=None, dburl=None):

# Filename contains project and storage point information
(timestamp, project, storagepoint, tmp) = os.path.basename(filename).split('.')
(_, _, storagepoint, _) = os.path.basename(filename).split('.')

# Hard code the system based on storagepoint as this information
# does not exist in the dumpfile. Not even sure NCI make this distinction
Expand All @@ -48,60 +48,58 @@ def parse_file_report(filename, verbose, db=None, dburl=None):
system = 'gadi'

with open(filename) as f:

print("Parsing {file}".format(file=filename))

parsing_usage = False

for line in f:
if verbose: print("> ",line)
if line.startswith("%%%%%%%%%%%%%%%%"):
# Grab date string
date = datetime.datetime.strptime(f.readline().strip(os.linesep),
"%a %b %d %H:%M:%S %Z %Y").date()
year, quarter = datetoyearquarter(date)
startdate, enddate = date_range_from_quarter(year,quarter)
db.addquarter(year, quarter, startdate, enddate)
parsing_usage = True
# Gobble header line
line = f.readline()
continue

if parsing_usage:
try:
(filesystem,scandate,folder,proj,user,size,filesize,inodes) = line.strip(os.linesep).split()
except:
if verbose: print('Finished parsing usage')
parsing_usage = False
continue
db.adduser(user)
if storagepoint == 'scratch':
# Swap folder and proj in the case of scratch as it is now accounted for by
# location, so folder never changes but project code can and subsequent entries
# overwrite previous ones unless values of folder and proj are swapped
folder, proj = proj, folder
if verbose: print('Adding ', project, user, system, storagepoint, str(date), folder,
parse_size(size.upper(), u='', pre='BKMGTPEZY'), inodes)
db.adduserstorage(project,
user,
system,
storagepoint,
str(date),
folder,
parse_size(size.upper(), u='', pre='BKMGTPEZY'),
inodes)
all_data=json.loads(f.read())

### Grab timestamp - pretend there are no cross-quarter entries
datestamp = datetime.datetime.fromisoformat(all_data[0]["scan_time"])
year, quarter = datetoyearquarter(datestamp)
startdate, enddate = date_range_from_quarter(year,quarter)
db.addquarter(year,quarter,startdate,enddate)

for entry in all_data:
### Handle uids that don't exist
try:
user = pwd.getpwuid(entry['uid']).pw_name
except KeyError:
user = str(entry['uid'])
db.adduser(user)

if storagepoint == 'scratch':
# Swap folder and proj in the case of scratch as it is now accounted for by
# location, so folder never changes but project code can and subsequent entries
# overwrite previous ones unless values of folder and proj are swapped
### Handle gids that don't exist
try:
folder=grp.getgrgid(entry['gid']).gr_name
except KeyError:
folder=str(entry['gid'])
project=entry['project']
else:
folder=entry['project']
### Handle gids that don't exist
try:
project=grp.getgrgid(entry['gid']).gr_name
except KeyError:
project=str(entry['gid'])

### Derived from nci-files-report client (formatters/table.py)
size = 512 * int(entry['blocks']['single'] + entry['blocks']['multiple'])
inodes = int(entry['count']['single'] + entry['count']['multiple'])

if verbose:
### Date comes out in iso format, first 10 characters will be YYYY-MM-DD
print(f"Adding {project}, {user}, {system}, {storagepoint}, {entry['scan_time'][:10]}, {folder}, {size}, {inodes}")
db.adduserstorage(project,user,system,storagepoint,entry['scan_time'][:10],folder,size,inodes)

def main(args):

verbose = args.verbose

db = None
if args.dburl:
db = ProjectDataset(dburl=args.dburl)

for f in args.inputs:
try:
parse_file_report(f, verbose, db=db)
parse_file_report(f,args.verbose,db=db)
except:
raise
else:
Expand Down Expand Up @@ -138,4 +136,3 @@ def main_argv():
if __name__ == "__main__":

main_argv()

2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -4,4 +4,4 @@ numpy
dataset
sqlalchemy
Psycopg2
pytest
pytest
1 change: 1 addition & 0 deletions test/2022-11-02T11:36:45.w40.gdata.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/2022-11-02T11:36:45.w40.scratch.json

Large diffs are not rendered by default.

138 changes: 123 additions & 15 deletions test/test_parse_user_storage.py
Expand Up @@ -10,6 +10,7 @@
import pytest
import sys
import time
import grp

from ncigrafana.UsageDataset import *
from ncigrafana.DBcommon import datetoyearquarter
Expand All @@ -31,8 +32,8 @@ def db():

def test_parse_lquota(db):

parse_file_report('test/2020-04-16T08:34:58.w35.scratch.dump', verbose=verbose, db=db)
parse_file_report('test/2020-04-16T08:34:58.w35.gdata.dump', verbose=verbose, db=db)
parse_file_report('test/2022-11-02T11:36:45.w40.scratch.json', verbose=verbose, db=db)
parse_file_report('test/2022-11-02T11:36:45.w40.gdata.json', verbose=verbose, db=db)

def test_getstoragepoints(db):

Expand All @@ -46,24 +47,131 @@ def test_getstoragepoints(db):

def test_getstorage(db):

project = 'w35'
year = 2020
quarter = 'q2'
scratch_project = 'w40'
gdata_project = 'w40'
gid_available=True
try:
_ = grp.getgrnam(gdata_project)
except KeyError:
gid_available = False
gdata_project = '5653'
year = 2022
quarter = 'q4'
system = 'gadi'
storagepoint = 'scratch'
dp = db.getstorage(project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 17)
assert((dp.iloc[0,:].values == [837124096., 654982688768., 5983174656.,
81819126988.8000030517578125, 1897922560.,
40531821149388.796875]).all())
dp = db.getstorage(scratch_project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 33)
#assert((dp.iloc[0,:].values == [837124096., 654982688768., 5983174656.,
# 81819126988.8000030517578125, 1897922560.,
# 40531821149388.796875]).all())
### if gid_available is false, the ordering of this array will change
if gid_available:
assert((dp.iloc[0,:].values == [ 8192, 15037362176, 4096, 8192,
6455296, 8192, 12555550720, 61440,
8192, 8192, 8192, 8192,
8192, 4096, 8192, 66303352832,
8192, 425984, 401322278912, 8192,
454656, 12024569856, 8192, 8192,
98304, 8192, 6972870656, 1216239837184,
8192, 8192, 8192, 13619200,
11571253248, 8192, 24576, 8192,
8192, 12288, 8192, 8192,
8192, 8192, 12288, 983040,
8192, 9566094172160, 6467893768192, 12288,
509981904896, 8192, 8192, 8192,
28672, 12288, 8192, 8192,
8192, 8192, 8192, 8192,
20480, 12288, 3584000, 8192,
8192, 8192, 8192, 8192,
8192, 8192, 8192, 159137792,
16384, 8192, 121728925696, 215461888,
8192, 16384, 2867015331840, 4096,
8192, 8192, 40960, 8192,
698904379392, 165306880000, 8192, 8192,
28672, 8192, 8192, 2284138496,
32768, 12288, 8192, 8192,
2060288, 8192, 1274325000192, 8192,
8192, 8192, 135168, 8192,
8192, 65536, 288980849664, 8192,
8192, 8192, 24576, 8192,
8192, 29022916608, 8192, 64513818624,
454656, 8192, 8192, 8192,
8192, 10104832, 1819267072]).all())
else:
assert((dp.iloc[0,:].values == [ 8192, 8192, 2867015331840, 454656,
454656, 4096, 8192, 8192,
16384, 8192, 8192, 8192,
8192, 8192, 8192, 8192,
8192, 8192, 8192, 15037362176,
8192, 11571253248, 8192, 425984,
8192, 12288, 8192, 8192,
9566094172160, 1819267072, 8192, 20480,
65536, 12288, 8192, 24576,
8192, 8192, 66303352832, 8192,
8192, 10104832, 1216239837184, 1274325000192,
8192, 98304, 3584000, 8192,
8192, 8192, 8192, 215461888,
165306880000, 8192, 8192, 8192,
29022916608, 12288, 983040, 32768,
8192, 40960, 12288, 8192,
16384, 8192, 2284138496, 61440,
12024569856, 6972870656, 8192, 24576,
8192, 698904379392, 8192, 28672,
121728925696, 8192, 12288, 64513818624,
6467893768192, 159137792, 8192, 8192,
8192, 8192, 8192, 8192,
2060288, 8192, 8192, 8192,
8192, 8192, 13619200, 509981904896,
6455296, 8192, 8192, 8192,
4096, 8192, 8192, 8192,
8192, 8192, 8192, 4096,
8192, 8192, 8192, 8192,
8192, 12288, 8192, 12555550720,
8192, 8192, 401322278912, 135168,
28672, 8192, 288980849664]).all())

system = 'global'
storagepoint = 'gdata'
dp = db.getstorage(project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 17)
assert((dp.iloc[0,:].values == [4209602560., 1891963632025.60009765625, 2846720.,
101591390617.5999908447265625, 1329627922432.,
7364434976768.]).all())
dp = db.getstorage(gdata_project, year, quarter, system, storagepoint, namefield='user')
assert(len(dp) == 32)
#assert((dp.iloc[0,:].values == [4209602560., 1891963632025.60009765625, 2846720.,
# 101591390617.5999908447265625, 1329627922432.,
# 7364434976768.]).all())
### if gid_available is false, the ordering of this array will change
if gid_available:
assert((dp.iloc[0,:].values == [ 378253836288, 139264, 19640320, 1785151488,
8122368, 77824, 227143680, 4096,
4096, 6244943798272, 34355089817600, 4096,
338371608576, 2428928, 9422868480, 24785586880512,
4096, 4096, 5369627049984, 12756226048,
620734619648, 1225300582400, 195948544, 31176721473536,
8375311802368, 919977984, 45187072, 561438191616,
2595426304, 197181440, 832443146240, 1370082246656,
5154693120, 40789091491840, 234704838656, 73728,
5154811904, 146066354176, 9107041492992, 1369709383680,
192512, 2779975680, 3248198533120, 1551341441024,
1953792, 22146326528, 15386636288, 4096,
704050032640, 4243776442368, 93698002944, 8605416587264,
814183866368, 28672, 52714979328, 84480143360,
15162470400, 185345052672, 65202511872, 342954827776,
4096, 268537856]).all())
else:
assert((dp.iloc[0,:].values == [ 9107041492992, 4096, 1369709383680, 1225300582400,
146066354176, 77824, 268537856, 4096,
8122368, 139264, 227143680, 4096,
12756226048, 6244943798272, 5154693120, 45187072,
2428928, 31176721473536, 378253836288, 1370082246656,
620734619648, 814183866368, 24785586880512, 8605416587264,
197181440, 338371608576, 40789091491840, 3248198533120,
185345052672, 2779975680, 4096, 832443146240,
65202511872, 704050032640, 195948544, 15386636288,
192512, 2595426304, 5154811904, 22146326528,
9422868480, 561438191616, 15162470400, 1551341441024,
234704838656, 342954827776, 73728, 93698002944,
1953792, 28672, 4243776442368, 5369627049984,
919977984, 19640320, 84480143360, 4096,
4096, 8375311802368, 4096, 1785151488,
34355089817600, 52714979328]).all())

# import pytest
# pytest.set_trace()
Expand Down