From c854b5be40fb7a1225eb23cdd5014b47940e6306 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Wed, 8 Dec 2021 15:57:50 -0500 Subject: [PATCH 01/46] Update Ansible - Updates deployment playbooks for prod and staging to handle the possibility of a `secrets.py` file. - Updates vars and vault files with new variable pointers and encryption. - Adds a new template file that will be rendered as `secrets.py` in the root of the indicators directory. --- ansible/ansible-deploy-staging.yaml | 12 + ansible/ansible-deploy.yaml | 12 + .../templates/claims_hosp-secrets-prod.py.j2 | 11 + ansible/vars.yaml | 5 + ansible/vault.yaml | 457 +++++++++--------- 5 files changed, 273 insertions(+), 224 deletions(-) create mode 100755 ansible/templates/claims_hosp-secrets-prod.py.j2 diff --git a/ansible/ansible-deploy-staging.yaml b/ansible/ansible-deploy-staging.yaml index 3056d79f2..f08839800 100644 --- a/ansible/ansible-deploy-staging.yaml +++ b/ansible/ansible-deploy-staging.yaml @@ -27,6 +27,10 @@ local_action: stat path="templates/{{ indicator }}-params-prod.json.j2" register: template + - name: Check to see if we have a secrets template to send. + local_action: stat path="templates/{{ indicator }}-secrets-prod.py.j2" + register: template + - name: Set production params file. copy: src: files/{{ indicator }}-params-prod.json @@ -42,3 +46,11 @@ owner: "{{ runtime_user }}" group: "{{ runtime_user }}" when: template.stat.exists + + - name: Set production secrets template. 
+ template: + src: templates/{{ indicator }}-secrets-prod.py.j2 + dest: "{{ indicators_runtime_dir }}/{{ indicator }}/secrets.py" + owner: "{{ runtime_user }}" + group: "{{ runtime_user }}" + when: template.stat.exists diff --git a/ansible/ansible-deploy.yaml b/ansible/ansible-deploy.yaml index f35aa40f8..9863639ed 100644 --- a/ansible/ansible-deploy.yaml +++ b/ansible/ansible-deploy.yaml @@ -27,6 +27,10 @@ local_action: stat path="templates/{{ indicator }}-params-prod.json.j2" register: template + - name: Check to see if we have a secrets template to send. + local_action: stat path="templates/{{ indicator }}-secrets-prod.py.j2" + register: template + - name: Set production params file. copy: src: files/{{ indicator }}-params-prod.json @@ -42,3 +46,11 @@ owner: "{{ runtime_user }}" group: "{{ runtime_user }}" when: template.stat.exists + + - name: Set production secrets template. + template: + src: templates/{{ indicator }}-secrets-prod.py.j2 + dest: "{{ indicators_runtime_dir }}/{{ indicator }}/secrets.py" + owner: "{{ runtime_user }}" + group: "{{ runtime_user }}" + when: template.stat.exists diff --git a/ansible/templates/claims_hosp-secrets-prod.py.j2 b/ansible/templates/claims_hosp-secrets-prod.py.j2 new file mode 100755 index 000000000..b00b147d7 --- /dev/null +++ b/ansible/templates/claims_hosp-secrets-prod.py.j2 @@ -0,0 +1,11 @@ +class claims: + HOST = 'ftp.delphi.cmu.edu' + USER = {{ claims_hosp_ftp_user }} + PASS = {{ claims_hosp_ftp_password }} + PORT = 2222 + + +class covidcast: + HOST = "delphi.midas.cs.cmu.edu" + USER = {{ claims_hosp_midas_user }} + PASS = {{ claims_hosp_midas_password }} diff --git a/ansible/vars.yaml b/ansible/vars.yaml index eaeff437d..c44844e8e 100644 --- a/ansible/vars.yaml +++ b/ansible/vars.yaml @@ -26,6 +26,11 @@ changehc_sftp_host: "{{ vault_changehc_sftp_host }}" changehc_sftp_port: "{{ vault_changehc_sftp_port }}" changehc_sftp_user: "{{ vault_changehc_sftp_user }}" changehc_sftp_password: "{{ vault_changehc_sftp_password 
}}" +# claims_hosp +claims_hosp_ftp_user: "{{ vault_claims_hosp_ftp_user }}" +claims_hosp_ftp_password: "{{ vault_claims_hosp_ftp_password }}" +claims_hosp_midas_user: "{{ vault_claims_hosp_midas_user }}" +claims_hosp_midas_password: "{{ vault_claims_hosp_midas_password }}" # NCHS nchs_mortality_token: "{{ vault_nchs_mortality_token }}" # SirCAL diff --git a/ansible/vault.yaml b/ansible/vault.yaml index 950d0ed51..ed2916927 100644 --- a/ansible/vault.yaml +++ b/ansible/vault.yaml @@ -1,225 +1,234 @@ $ANSIBLE_VAULT;1.1;AES256 -63316430313534303165663533363834353939653931383036363236343034666463366362303263 -3732646465653464336239613162343439323734636430660a393334323166376563653037303336 -63303663346462653963326361313238633263663133383965393431613139333432323139636361 -3130343634663234390aa666666633932306336353766396630 +64363066303830616536373936313666643435656639656538646334366433613363313165303832 +3335313031323032330a656464343664656235353536386630633331343633353263623637356264 +63636139653566366263333539656632316433333266666263356661663233613166616661336232 +62366137326264656531396434663766396233623562333562326633646666356465336435643338 +37316431343033316662663438376630343837616137313564376638313966363838666434373538 +63613834316439353033613235346634383834343132393934636365386339353039653663323161 +62613237393232643738333534396263633963653938336638376436663439366263373063303861 +61316132663638656436623862643834366236623130653864643766663631336361373236323264 +34383362393764643165343832643835336534653433313436363631363161313436363463616435 +30323534333265393030343032346531643330303362383562636566623964633266666261343533 +38643037333166316663303363633464643531633930313834646134303362643463653564333062 +39356431363937646637613439616136626231626366643364633637383338613232656165643832 +61373563653734313035633263616665373630623130396537616366636134613731376637363966 +63323430303466663433333861666435303435353135396238343538363136353035646435623136 
+38636436623638613135313736366435323230343035383765636463666533303738633063656661 +36303831386662366365616434373833306465383562323930613236663031313165326633356434 +39313562623338376635326134636438323963646437353933336238346364623830333364326230 +61316661636163366361613464376165373666356236623135396631326637643735616461346264 +37333433343332633136663530616237356639396535653334386138643135643032383337333366 +33393335656139393538386266386432306532346638623530336239636264316236336637303636 +35363938643730313539323163366462373764613536346262666133383131626663653965343366 +31646162303832376362663432313965333433323766343162383936646430366664313437396534 +32373164343533613239393062636338643064636432323036666366653137633538363434376563 +63353139666462366235666436613865323632306663323339636664386437383561336435616533 +33353337666231663739316435663133616137653039363637346435336234373234313266303765 +64323030353735353764666466313362626637643064386138326130363064653764633137646538 +38343566643831383361336337616535393134366333306431373864633930383037343238326532 +38626238626465663561313661366362623238323934363639643033646133396661366533346433 +36613931376234646464333865376339396362616463313637353565383061626431633665396139 +66646333313438353432613435363231663265356535663634613066636161356466336334383835 +32383738313036643739623838656662366635396237393834373062383664336537373638313965 +38343836306635626632626266666334396364653331363932316562373632353434323337363034 +31306539383838663737363639316137386234343538633561386633366632393236373662636238 +66323261643535376533666539626266663132386435366230646263396138623065386137646638 +39616539363165613630336664373966643064303834646437616131633937653335383565376462 +30653162313730366431356533633166353631393732386532666632623666646230666166303464 +66326663353037373164316332386139313135363962623030643635336464356562383530313537 +63363964343738356632653531373930656431343763343435623364303430323866303730366263 
+38346236616362356333326536366330653032323733313230363137656465653537396539613939 +30646463303438316433306266633738383837376233653833313565336633386236306363613739 +65616162666662636531396266396561373832366233623362356330353831393866366662656235 +38346533396334326632666663326161653832373366313633373965636663616334336466383465 +35363661333562393331366166663036383562666532623136333462336663353836333464316230 +32336638336132663936656361646431646434323634336535636661643164343737343861333430 +66393565623761643930323132643232653265346130303832613730373862636134616261623139 +63363062383032393738363538393664333436376436353764346133666163656431346331656266 +32323530316438396338633938306162383661353638373566386230323230343837333930383837 +37343265383734313966613765646165316133373665656631636139613331346530643466346166 +62613739326563396662393835353630653538343235363863326466353065353936343262303435 +65336133616130373366653336613832326363366532326465366238383061653832393739313235 +34663431316261666634356234623963373636313132663739333838373634663331633337653039 +36396539386636323936373631303339356339393435623731303763653236323766616464303938 +37633034343734633063653364393765396466366131613435343030393638306563373263646263 +35396333636464366364313435643630663638633333333537373930323830373831323432383061 +62313932653737323134663661643531333862646638303639393835646536653830353735636130 +62653631333530633833323832353831316366646162373235363232333231643631646562316163 +66613835666339643237646330343262366235336435343130313566656531356233393836613035 +38366639323334663731623334616234356562373761613535386230336263313337353662656561 +62366635333061643561653530633034366631333230613633656632633532326233663366636263 +66613632666164353830326361373163663666643835336164653135623563306439643530353737 +34323736396361336466353233366461663437616265396361323237663861396136353864383133 +66343937363164396364326665656261343662386631353532636562613332653636396539376161 
+61393430336662323563366531396233393635626432353037373865643039323635663062666236 +39333265363363393231303939386338386532393766303330303562393230393865633263623832 +33643531383633346362626533646130383262653032386563396238643761363130313065633231 +39643039643264636262346366663566343966333963343932633034363861383830343731623230 +34396362626364306636666138623861653161646564393963316263663866636338396530636237 +62643561323236383863623432326563306533363566623963353537346430313835313338383165 +31626636353765356361383639643337303233343064346430346639303663306565353565323162 +65306533656466373863333637663039353537326536623962303866363233323939386561343137 +37373634323263333863333339366434623436643666316232353136613230613438313662666266 +61643432656332323661323732363537383537623661626530353364663237643463356266323261 +61333865336461363133633032343930313031383934336465363036303636393332336265373539 +66303434333232316562363064646330636565333737663238373938326135623837616664643330 +65323539653464346334613462323162666534353530383532623565336234643434303337326233 +66343932623036633466393637623333383263616434313437663231623036343466663232333262 +64396333346364663636643831323538373633376239396165353166626233656234343632623337 +33393663653139663937383832333032316130393666663261663631653435346566363363646164 +37353064633031383536626163383834366637643561653832633466346466663732383961626130 +39653164643732336362326531626130346432623932656535353964393837363961656230343064 +38613335303838306164663932396236376366343162306437323030626132373464363464376363 +31346663623563366535353165343736316336323437623430313233323435396130386662666138 +39363732373266666261643830346237316330633438323430333437393537626264343839366637 +61636537323663663130323334633364633738353161373239393434336461383062346231613932 +33313461646233356464313836626230613731626536653137366465613532623762313533343062 +61306137613336653162383134306464616462623331303836306339326330653362393732356538 
+35353731373733353634356666353362336632333166383262663565383636613231643131323666 +62663134316462613665386263386461346535663839633066653132653264373963353339376662 +63363834343266663066343163366563373766623037616566663533303035666665653162316536 +32346437633964666464653239373933636363366334646634313563386633643336333362356464 +39613732376439393333643066326166623861653962656630396336306262663766323061663664 +33393234666663336533626331396161646337383032653532386534333762613335333035353237 +37333864396365633734623963663539653565613838643633343032386239613936633631646238 +37396262313834376166626237656530353232333635633135306638363434653866386463333630 +64376336383730663365656365613235373661363962646361333463613836393566326331623438 +35633662343937393739623031393437616434386462323033626563383164613838373066346338 +34616262363563373361353136613761633637366465663339373863316539623838343139303363 +31616232383038363539346465353734323465303835353465326533386437333034643466373338 +36343362326237383432616634363338383963303530613538303731613132633637633530366565 +63393466613665353435643734323932316361373032653933393230666137633130653637343236 +35393864633662346266383935393066633137363766343431616238613766613132366638653638 +61353833613834666132613834393237333663396666326639353235353563323034613831306330 +35376661393164333461646362313562316638383930353439333865353930373237373230393830 +33373134336238653863663532656632303638343134613566336332616537333334396236313135 +37633266383762646637303533653336303666326431656230366630616261343365303238663762 +31383264616134346432313033623461376163353534346439346639356637353861373762663838 +36386535623162353232616164326264653531343633343766306135363364623164363531356131 +30666565316337663162343032623662663939613265346330363431323938326532376161373034 +64353731653734636438393465663963393137353864643366326630666630643635626637333536 +66666139343134326137633664663830333465326137326438663833353731623831613864626136 
+31626363396662333832663931383031616637393639616434383364393930383566323336663234 +30643162393130343134653736313830616234353064663736333738623339313761303036316639 +61326130343664633732656166383337353234373531346561363130663962313638383231353437 +31316239643263653230613538316337383265343735313531376237333330386362613539313665 +37333431383062336365636231373638313336363135626538393937613865353433393639666663 +38653431363666303632636131643961373437366565636363323330323239656530316138316666 +34333232636534643535346131633539646663636261663738333138396363336332666262353830 +34393534653439663239343162396236643033316561343939663239366233643830363463656163 +33363662386266396462386537316233356136396662616536383263643863663133623564656261 +32393437393438663563613530656439663231346563363063646163366335633637383637653631 +36613663393865373337313735623230613939396539623462663938633432376334306166386562 +31396333653862336336643662323266333566363764336363383335303731313930613237646436 +35336231653263363630663136613332356530393866333538633231363033643738373664623232 +32366261333634386636353336303537353362376463616630306235313436646264653733343933 +37316130326232653335323863326133383664663166653362613661346262626436646634633566 +64633530666434326439303633643030313435633831623364643566343366396262313063656366 +64316531663538336636636533376332323763663638643239356263306662613132633233623138 +64623461623865343736333661303934656630353535643465616635303264633465353438613532 +30313938333266383233376431623761373934373765363739336135346232313032373239313734 +33313838393136656131333465653833346164373735633762643363663337386537326665653736 +30386439316266636334303261363734323965363864313030613539363764663935656233323966 +33373731666537326266623264636536633433373663313233613330623337336331353833336565 +34356438666135633138333538386266323166393737366230313136306263383935636565623037 +64333936356362666665623263636535666235356233616664663335333833613139356337383337 
+35386137383461313963333763376339646138646161383735616563306134633863313363633730 +30656538663837666338646636333261366566353764623439333765396365633361303865303439 +39353836326664386664393662376138303161653765663637643566633239396265633730323830 +36616639396232346561393533623664313437643635353531373232613231346130303966303433 +37356437353166643461663265313334343334623038343139383066343362653230343331663864 +62356335366563356536343165616362326536653233353537303739373837393132333036633863 +65363732313933666237333933323763616634396366613766303931373161313132363135393663 +31383232646562323832323531303639383838633734336563666464653833666466313532346362 +64656463326462623562613139666532313866343834663239653037306331346431613135363431 +37623063366163646435313939663562656436653565383961333362626663373533343836626436 +61343461343235383861366163303764313437646263636335643165303064643663633739346131 +33383536306234356439646133653634653336633138626132366630323732613462306166313863 +31633931363035653461623965303964626339666633636137393566343339323833653764386535 +33623834616164663261343763373062623335383966323932323833393165363630316131613930 +36336132663537333266323865313439656435306137303339363964633462356363613539643061 +34323764306363366536353738303665303063333939363165666365303561383739303566313633 +38633833633362313165396132646337383663333363393335333234303362646232616534303130 +37633638356636646231656335333532353138653630633831643231303564383166626237323337 +37663639643233303539303735366362663362663465356636623439336130356234353931363336 +37653735633034316539643433643836393031343162633764643738326234396535643063366430 +65333966333961303162376531616633333933336165313732303464646637623066343737303034 +65663533643232376164333563333464346136363237313731663030613263653631383836363238 +61613037623765383265643934336363646564313937336532626231343964393764386561376236 +61303832323336663962313030353838386338343738636431363438373166303830353638663761 
+33626261363263386163636336336133636565303532303466623538616335623564636630313835 +36323336326437626431643964646265303134376661396164643739663435356161306239346265 +32616334626363653064316232326339663030653065373634626436626665306338343663366136 +64343634616536353965626666323232396137643462626366363962373130313638303961323334 +65376139636638636333653337333465333463666463366661316539363131373364653734613065 +34633861653239393864363031343835396161633231303666656533376462393063353839346363 +32373633353962663938623930356162313032643830363436656366326334396364336261626637 +34303835363662356563633764326439373837346337646131303930646638646531313537373666 +30643536383532623231646165303330393330383335616633646666386463326130663432363236 +65346664326165383264383362373333666338316438396439623063613636663836633330353132 +65316233366166633733663765366566373437326536386461376338616438653335303961336430 +35336230653962363462323863353835633033313235313561356261613263353433336535366230 +36326335646536663965343764643438326133313463333336656636373132643637653263303037 +65303031366138663863393464656436643964656462383462313562306131616139623738393636 +63363935363931616661626662653863613866306537633533663465313961336137373437336363 +63316132336230316661316233343538383661396338393735383136316639663033393538623131 +35643063303239623339313639623962376364363730613963363631336135383066313537633865 +66643838636231363832646532303065313863316134343066613161646336336237646530343432 +39346139373030646631393635623736343739613362666433303765366431653436393965643739 +38363865313830663736383866653530383739363063373930383264633763353430653434356664 +35643162616466343837653130373566636530393137653165346630643865336634333662666430 +38626333353232386438653331613737376365326535346563306164386434336132363865346263 +36633030373235353732363939316238663935393537666535393661323062396635353435303636 +37393631633864316262306630356566366465363836376361616237393062393065663735653966 
+30363534646331386330613030376238356631373466333931363030643730326238356265663162 +62363563613564303731353231633637393133326162653638633430396561653236326362376238 +65633633626262323033386262396433323332303230373838363933336564386165373534643961 +38346130646466633265346262623639653135636332396332353562373233373138656433646663 +32613832393334623232663336313538656163303738366632656638376134646565626636343536 +33366563313263343936353634303238316437646664316563386130306639376363326362303330 +31633234623638663566336132383738393730306436666134333637336434316332353962663837 +38656432623063333863626136316261636365656462323239613564643163326336306564306166 +64336135366634643064373234613336623438666539646565383835396166306337626136623566 +32663032633739653435313539353434663865613636316437393034333830623739633162313864 +30353031636536383239376432646530633538623563646234363462386635323765656337646430 +38323337363632343236303834623535613432363366306564656539653937646262343132626564 +32366136373164303639373862666161373532643934366165643362326534373331393537376339 +66386561626531373438616662343861396362643431633462306562343964636638643338363963 +66623231343766303364323632613434373864353566643962643564323465303566313235313164 +38613438643461343631316265386432323762313132353930396234633132306433353064333733 +36373430383930646632356331306134303533643865363961336335323565376330313666393731 +64383430626536353932343337663930333735356634666534316438303334356131313461366239 +61353539343138303332353530396137356265356130656332366337653139396434393533633762 +31616139613139656631616332376339623630636235333434373930633938323361386334383730 +32666139653663643938333431656336633132313035616232613037313265646261316464336162 +33633731303539306530616561353533643132366165336465326430313565386465313935383036 +33393835323139313732396339313336623736363664666561353764633266336566356230643263 +62636232623233333630343364356361666564396236396562356631353737663437386634353138 
+35666134373133656339303634646539663131326631666232386164613034646336353266323839 +32343361353932333431303132366536356230323566623135626662393664633435613834363433 +33313963656230653137323464343536313835393938343830656639663932363763653432386661 +65323164656663643438666539636366643136653433313238653835336166366336646236663262 +62653532306534313966313935656336303837323838656134323035626164373533333065343566 +38666439613837636135323636373762363432336134323761383930643735376562333565353862 +34343666333838663133306265383338363333313431613564663033353436663262633536373465 +34633265613035326434626563373164663365626561346334643534376565333463393534643638 +64653962383233303266333566326237616234346565636366373264356337623832383034316333 +62613135343438306235653863326532626466373364616435383466653032633038323435316566 +62636431616361333337623739653631653831333135353266383931663836663764343063613863 +66323338303433613437643338653863633862313133303263326663343737346638373333663436 +61383865313232633864306332323165653366623337323162316431386161636535623862356561 +38393438376164353163313334613965663635376563313661373530383334646465333236613035 +38653265363364663365653535306661646136343332393039383130636361643537363264633663 +33333535623637373732303738316438323461376366316238613131666332393265333031336231 +32323030343764636565666539616339643435303164623535336239663337633735323461366466 +66636663396266333965643935636662623233623861616439663564613430333235343834313965 +30356538376665363232666634393331393636363064613363333366336664303434373331336133 +36366337366534326634303238643334626462663336343834316632303333636230333961643132 +61363032616563346530323763346561383330373137343638363334313331623563363336366635 +65323234373034393931636665343831636639363038643732366432626434373539313538363733 +65653734356131313066633436363331376165316234373632613830633931346235633736656335 +37633963376262336231636566336266366163316265633631396163343762323866656639663430 
+33303032383332313231313364633535633861353531633433356363613239623864 From c3227c4849eb90d3712d2e9074623ca4adb3156e Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Wed, 8 Dec 2021 16:03:13 -0500 Subject: [PATCH 02/46] Add `hosp_claims` automation rig - Adds new Python and shell scripts to automate `hosp_claims` indicator. - Adds a symlink for `secrets.py`. --- ansible/vault.yaml | 467 +++++++++--------- claims_hosp/HospClaims/automate/README.md | 12 + .../HospClaims/automate/agg_claims_drops.py | 116 +++++ .../automate/download_claims_ftp_files.py | 110 +++++ .../HospClaims/automate/ftp_to_covidcast.py | 62 +++ .../automate/get_latest_claims_name.py | 42 ++ .../automate/hosp_claims_master_script.sh | 72 +++ .../automate/hosp_claims_regen_script.sh | 68 +++ .../HospClaims/automate/regen_old_issue.py | 47 ++ .../HospClaims/automate/sanity_checks.py | 289 +++++++++++ claims_hosp/HospClaims/automate/secrets.py | 1 + .../HospClaims/automate/update_json.py | 60 +++ 12 files changed, 1113 insertions(+), 233 deletions(-) create mode 100644 claims_hosp/HospClaims/automate/README.md create mode 100644 claims_hosp/HospClaims/automate/agg_claims_drops.py create mode 100644 claims_hosp/HospClaims/automate/download_claims_ftp_files.py create mode 100644 claims_hosp/HospClaims/automate/ftp_to_covidcast.py create mode 100644 claims_hosp/HospClaims/automate/get_latest_claims_name.py create mode 100755 claims_hosp/HospClaims/automate/hosp_claims_master_script.sh create mode 100755 claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh create mode 100755 claims_hosp/HospClaims/automate/regen_old_issue.py create mode 100644 claims_hosp/HospClaims/automate/sanity_checks.py create mode 120000 claims_hosp/HospClaims/automate/secrets.py create mode 100644 claims_hosp/HospClaims/automate/update_json.py diff --git a/ansible/vault.yaml b/ansible/vault.yaml index ed2916927..914b39f87 100644 --- a/ansible/vault.yaml +++ b/ansible/vault.yaml @@ -1,234 +1,235 @@ $ANSIBLE_VAULT;1.1;AES256 
def agg_and_write(data_path, force=True):
    """Normalize the column names of every drop file in a folder, in place.

    Each regular file directly inside ``data_path`` is read as CSV, the
    lower-case header variants are renamed to their canonical spelling, the
    frame is validated, and the result is written back to the same path
    (pandas infers compression from the file extension on read and write).

    Example files in the folder:
        ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz
        ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz

    Args:
        data_path: path to the folder with the drop files.
        force: kept for backward compatibility with existing callers and the
            ``--force`` CLI flag. It is not consulted: the output path is the
            input path, so every file is always rewritten.

    Raises:
        AssertionError: if a file contains duplicated
            (ServiceDate, PatCountyFIPS, Pat HRR Name, PatAgeGroup) rows, or
            does not have exactly 10 columns.
    """
    # Lower-case header variants seen in some drops -> canonical column names.
    # DataFrame.rename silently ignores keys that are not present.
    canonical_names = {
        "servicedate": "ServiceDate",
        "patCountyFIPS": "PatCountyFIPS",
        "patHRRname": "Pat HRR Name",
        "patAgeGroup": "PatAgeGroup",
        "patHRRid": "Pat HRR ID",
    }
    unique_key = ["ServiceDate", "PatCountyFIPS", "Pat HRR Name", "PatAgeGroup"]

    # Sorted so the processing (and log) order is deterministic.
    for f in sorted(Path(data_path).glob("*")):
        if not f.is_file():
            # Sub-directories cannot be parsed as CSV; leave them alone.
            continue

        # FIPS codes must stay strings so leading zeros survive the round trip.
        dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str,
                                    "patCountyFIPS": str})
        dfs = dfs.rename(columns=canonical_names)

        # Raised explicitly (rather than via `assert`) so the checks still run
        # under `python -O`, while keeping the exception type callers see.
        if dfs.duplicated(subset=unique_key).any():
            raise AssertionError("Duplication across drops!")
        if dfs.shape[1] != 10:
            raise AssertionError("Wrong number of columns")

        dfs.to_csv(f, index=False)
        print(f"Wrote {f}")
def print_callback(filename, bytes_so_far, bytes_total):
    """Print transfer progress for ``filename`` at 25% steps.

    Intended as a progress callback for an SFTP transfer (bound to a file
    name with ``functools.partial``). A line is printed whenever the
    whole-number percentage transferred is a multiple of 25.

    Args:
        filename: name shown in the progress message.
        bytes_so_far: number of bytes transferred so far.
        bytes_total: total number of bytes to transfer.
    """
    if bytes_total <= 0:
        # An empty file has nothing left to transfer; report it as complete
        # instead of dividing by zero.
        rough_percent_transferred = 100
    else:
        # Integer arithmetic keeps the percentage exact for large files.
        rough_percent_transferred = (100 * bytes_so_far) // bytes_total

    if (rough_percent_transferred % 25) == 0:
        print(f'{filename} transfer: {rough_percent_transferred}%')
@click.command()
@click.argument("out_path")
def download(out_path):
    """Download the last 24 hours of inpatient drops from the claims SFTP server.

    Files in ``/hosp/receiving`` are selected by the timestamp embedded in
    their name (see ``get_timestamp``), not by their modification time. Only
    names containing ``INPATIENT`` are fetched; each is stored under
    ``out_path`` with its date flipped by ``flip_YYYYMMDD_to_DDMMYYYY``.
    Files that already exist locally are skipped.

    Args:
        out_path: local directory the files are written to.

    Raises:
        AssertionError: if more files were dropped in the last 24 hours than
            the expected maximum.
    """
    current_time = datetime.datetime.now()
    one_day = datetime.timedelta(days=1)
    print(f"current time is {current_time}")

    # open client
    # NOTE(review): AllowAnythingPolicy accepts any unknown host key, so the
    # server identity is not verified -- consider pinning the host key.
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(AllowAnythingPolicy())

    # try/finally so the connection is released even when listing, validation
    # or a transfer fails part-way through.
    try:
        client.connect(claims.HOST,
                       username=claims.USER, password=claims.PASS, port=claims.PORT)
        sftp = client.open_sftp()
        sftp.chdir('/hosp/receiving')

        # go through files in receiving dir; names without a parseable
        # timestamp come back as 1900-01-01 and fall outside the window
        files_to_download = []
        for fileattr in sftp.listdir_attr():
            file_age = current_time - get_timestamp(fileattr.filename)
            if datetime.timedelta(0) < file_age <= one_day:
                files_to_download.append(fileattr.filename)

        # make sure we don't download more than the 3 chunked drops (2x a day)
        # for OP and the 1 chunk (2x a day) for IP - 01/07/21, *2 for multiple
        # day drops. Raised explicitly so the check survives `python -O`.
        if len(files_to_download) > 2 * ((3 * 2) + 2):
            raise AssertionError("more files dropped than expected")

        filepaths_to_download = {}
        for file in files_to_download:
            if "INPATIENT" not in file:
                continue
            flipped_file = flip_YYYYMMDD_to_DDMMYYYY(file)
            full_path = path.join(out_path, flipped_file)
            if path.exists(full_path):
                print(f"{flipped_file} exists, skipping")
            else:
                filepaths_to_download[file] = full_path

        # download!
        for infile, outfile in filepaths_to_download.items():
            callback_for_filename = functools.partial(print_callback, infile)
            sftp.get(infile, outfile, callback=callback_for_filename)
    finally:
        client.close()
+ + # upload signal without se + sftp.chdir("/common/covidcast/receiving/hospital-admissions") + for i, file in enumerate(files_to_upload): + assert ( + datetime.datetime.fromtimestamp(os.path.getmtime(file)).date() == today + ), f"uploading old file {file}" + + sftp.put(file, file.name) + if (i % 61) == 0: + print(f"Finished {i} out of {len(files_to_upload)}") + + print(f"Successfully uploaded the hospital-admissions claims signal") + client.close() + + +if __name__ == "__main__": + upload() diff --git a/claims_hosp/HospClaims/automate/get_latest_claims_name.py b/claims_hosp/HospClaims/automate/get_latest_claims_name.py new file mode 100644 index 000000000..9dadbe5dc --- /dev/null +++ b/claims_hosp/HospClaims/automate/get_latest_claims_name.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +"""Return the latest drop.""" + +# standard +import datetime +from pathlib import Path + +# third party +import click + + +@click.command() +@click.argument("dir_path") +def get_latest_filename(dir_path): + current_date = datetime.datetime.now() + files = list(Path(dir_path).glob("*")) + + latest_timestamp = datetime.datetime(1900, 1, 1) + latest_filename = None + for file in files: + split_name = file.name.split("_") + if len(split_name) == 5: + ddmmyyyy = split_name[3] + hhmm = ''.join(filter(str.isdigit, split_name[4])) + timestamp = datetime.datetime.strptime(''.join([ddmmyyyy, hhmm]), + "%d%m%Y%H%M") + if timestamp > latest_timestamp: + if timestamp <= current_date: + latest_timestamp = timestamp + latest_filename = file + + assert current_date.date() == latest_timestamp.date(), "no drop for today" + + # write to stdout for shell script to use + print(latest_filename) + + # return for other uses + return latest_filename + + +if __name__ == "__main__": + get_latest_filename() diff --git a/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh b/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh new file mode 100755 index 000000000..e3230a5c2 --- /dev/null +++ 
b/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh @@ -0,0 +1,72 @@ +#!/bin/sh +set -o errexit +#set -o nounset +set -o pipefail + +BASE="/home/indicators/runtime/claims_hosp/HospClaims" + +AUTO_DIR="$BASE/automate" +HOSP_CLAIMS_PKG_DIR="/home/indicators/runtime/claims_hosp" +CLAIMS_DIR="$BASE/claims_data" +GEO_DIR="/common/covidcast/covid-19/geographical_scope" +CURRENT_dmY=$(date '+%d%m%Y') +CURRENT_Ymd=$(date '+%Y%m%d') +CURRENT_YmdHM=$(date '+%Y%m%d_%H%M') +RECEIVING_DIR="$BASE/receiving/results_$CURRENT_YmdHM" +RECEIVING_SE_DIR="$BASE/receiving/results_se_$CURRENT_YmdHM" + +# pull latest data +echo "downloading drops" +cd "$AUTO_DIR" || exit +python3 download_claims_ftp_files.py "$CLAIMS_DIR" + +# aggregate data +echo "aggregating drops" +python3 agg_claims_drops.py "$CLAIMS_DIR" + +# find the latest files (these have timestamps) +echo "finding today's latest claims drop" +claims_file=$(python3 get_latest_claims_name.py "$CLAIMS_DIR") + +# make receiving directories +mkdir "$RECEIVING_DIR" + +# generate the sensor +cd "$HOSP_CLAIMS_PKG_DIR" || exit + +source env/bin/activate + +python $AUTO_DIR/update_json.py \ + "$claims_file" \ + "$GEO_DIR" \ + "$HOSP_PKG_DIR" \ + "$RECEIVING_DIR" + +python -m delphi_claims_hosp + +deactivate + +sanity_check() { + geo=$1 + cd "$AUTO_DIR" || exit + python3 sanity_checks.py "$RECEIVING_DIR" "$geo" +} + +echo "running sanity checks" +sanity_check state +sanity_check msa +sanity_check hrr +sanity_check county + +# plot states without se +cd "$AUTO_DIR" || exit +python3 sanity_checks.py "$RECEIVING_DIR" state -p + +# upload files to covidcast +python3 ftp_to_covidcast.py "$RECEIVING_DIR" + +# delete raw data +rm "$CLAIMS_DIR"/*.csv.gz + +# delete signal files +rm -r "$RECEIVING_DIR" diff --git a/claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh b/claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh new file mode 100755 index 000000000..2111f7b7e --- /dev/null +++ 
b/claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh @@ -0,0 +1,68 @@ +#!/bin/sh +set -o errexit +#set -o nounset +set -o pipefail + +BASE="/home/indicators/runtime/claims_hosp/HospClaims" +AUTO_DIR="$BASE/automate" +HOSP_CLAIMS_PKG_DIR="/home/indicators/runtime/claims_hosp" +CLAIMS_DIR="$BASE/claims_data" +GEO_DIR="/common/covidcast/covid-19/geographical_scope" +RECEIVING_DIR="$1" + +# pull latest data +echo "downloading drops" +cd "$AUTO_DIR" || exit +python3 download_claims_ftp_files.py "$CLAIMS_DIR" + +# find the latest files (these have timestamps) +echo "finding today's latest claims drop" +claims_file=$(python3 get_latest_claims_name.py "$CLAIMS_DIR") + +# only keep latest file +cd "$CLAIMS_DIR" || exit +claims_filename=$(basename "$claims_file") +echo "$claims_filename" +mv "$claims_filename" .. +cd .. +rm -f "$CLAIMS_DIR"/*.csv.gz +mv "$claims_filename" "$CLAIMS_DIR" + +# aggregate data +cd "$AUTO_DIR" || exit +echo "aggregating drops" +python3 agg_claims_drops.py "$CLAIMS_DIR" + +# generate the sensor +cd "$HOSP_CLAIMS_PKG_DIR" || exit + +source env/bin/activate + +python $AUTO_DIR/update_json.py \ + "$claims_file" \ + "$GEO_DIR" \ + "$HOSP_PKG_DIR" \ + "$RECEIVING_DIR" + +python -m delphi_claims_hosp + +deactivate + +sanity_check() { + geo=$1 + cd "$AUTO_DIR" || exit + python3 sanity_checks.py "$RECEIVING_DIR" "$geo" +} + +echo "running sanity checks" +sanity_check state +sanity_check msa +sanity_check hrr +sanity_check county + +# plot states without se +#cd "$AUTO_DIR" || exit +#python3 sanity_checks.py "$RECEIVING_DIR" state -p + +# delete raw data +rm "$CLAIMS_DIR"/*.csv.gz diff --git a/claims_hosp/HospClaims/automate/regen_old_issue.py b/claims_hosp/HospClaims/automate/regen_old_issue.py new file mode 100755 index 000000000..838d4b24d --- /dev/null +++ b/claims_hosp/HospClaims/automate/regen_old_issue.py @@ -0,0 +1,47 @@ +from datetime import datetime, timedelta +import os +import logging + + +def regen(issue_date: datetime): + fake_date = 
datetime.strftime(issue_date, '%Y%m%d') + fake_datetime = datetime.strftime(issue_date, '%Y-%m-%d %H:%M:%S') + + out_dir = f"/home/maria/Delphi/HospClaims/regen/issue_{fake_date}" + out_dir_no_se = out_dir + "/hospital-admissions" + #if os.path.isdir(out_dir_no_se) and len(os.listdir(out_dir_no_se)) > 0: + # logging.info(f"files in output dir, skipping {issue_date}") + # return False + + os.makedirs(out_dir_no_se, exist_ok=True) + os.system( + f"faketime '{fake_datetime}' /home/maria/Delphi/HospClaims/automate/hosp_claims_regen_script.sh {out_dir_no_se}") + + logging.info(str(issue_date.date())) + + +def main(): + hour = 23 + + start_date = datetime(2021, 6, 12, hour) + end_date = datetime(2021, 6, 13, hour) + #start_date = datetime(2020, 6, 2, hour) + #end_date = datetime(2020, 8, 4, hour) + n_dates = (end_date - start_date).days + 1 + date_range = [start_date + timedelta(days=a) for a in range(n_dates)] + + logging.basicConfig(level=logging.DEBUG, filename="out.log", + filemode="a+", + format="%(asctime)-15s %(levelname)-8s %(message)s") + + #date_range = [datetime(2020, 6, 21, hour)] + for date in date_range: + try: + regen(date) + except Exception as e: + logging.info(e) + continue + + +if __name__ == "__main__": + main() diff --git a/claims_hosp/HospClaims/automate/sanity_checks.py b/claims_hosp/HospClaims/automate/sanity_checks.py new file mode 100644 index 000000000..f2b03fe65 --- /dev/null +++ b/claims_hosp/HospClaims/automate/sanity_checks.py @@ -0,0 +1,289 @@ +"""Sanity check results from generating DV estimates. 
+ +Author: Maria Jahja +Created: 2020-05-12 + +Plotting code modified from: http://blog.marmakoide.org/?p=94 +""" + +# standard packages +import logging +import sys +from collections import defaultdict +from datetime import datetime, timedelta +from pathlib import Path + +# third party +import click +import matplotlib.dates as mpld +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from matplotlib.backends.backend_pdf import PdfPages + +# first party +EPIDATA_DIR = Path.home() / "Delphi/delphi-epidata/src/client" +FIPS_DIR = Path.home() / "Delphi/covid-19/doctor-visits/maria/data/fips_full.csv" +sys.path.append(str(EPIDATA_DIR)) +from delphi_epidata import Epidata + + +class EMRHospChecks: + DATE_FORMAT = mpld.DateFormatter('%m-%d') + + def __init__(self, data_path, level, se): + self.level = level + self.data = self.get_data(data_path, level, se) + self.locs = list(sorted(set(self.data["adj"]["val"].keys()) | \ + set(self.data["nadj"]["val"].keys()))) + self.se = se + + # read in geo file for fips + self.geo = pd.read_csv(FIPS_DIR, dtype={"FIPS": int}) + self.geo.drop_duplicates('FIPS', inplace=True) + + @staticmethod + def get_data(data_path, level, se): + """ + Compile data values and dates for given data_path and geographic level + + Args: + data_path: path to the data files + level: geographic level to pull + se: bool if se's are included in the file + + Returns: + dictionary with data + """ + + def extract(all_files, all_dates): + """Extract data from the files.""" + res = {"val": defaultdict(list), + "se": defaultdict(list), + "dates": defaultdict(list)} + for f, d in zip(all_files, all_dates): + df = pd.read_csv(open(f, "rb"), dtype={"geo_id": str}).to_numpy() + for row in df: + geo = row[0] + res["val"][geo].append(row[1]) + res["se"][geo].append(row[2]) + res["dates"][geo].append(d) + return res + + data = {"adj_files": [], "nadj_files": [], "dates": []} + for f in sorted(data_path.glob("*")): + name = f.name.split("_") + if 
f.suffix == ".csv" and name[1] == level: + name_idx = -2 if se else 3 + if name[name_idx] == "adj": + data["adj_files"].append(f) + else: + data["nadj_files"].append(f) + data["dates"].append(name[0]) + + # extract data + data["dates"] = sorted(list(set(data["dates"]))) + data["adj"] = extract(data["adj_files"], data["dates"]) + data["nadj"] = extract(data["nadj_files"], data["dates"]) + + # convert dates + data["dates"] = pd.to_datetime(data["dates"]) + data["first_date"] = data["dates"].min() + data["last_date"] = data["dates"].max() + data["first_plot_date"] = data["last_date"] - timedelta(days=30) + data["epidata_date_range"] = Epidata.range( + str(data["first_plot_date"].date()).replace('-', ''), + str(data["last_date"].date()).replace('-', '')) + + return data + + def check_se_na(self): + """ + Checks that all SE are reported as 'NA' due to + privacy concerns from the company. + + Returns: + true if pass, false otherwise + """ + + for kind in ["adj", "nadj"]: + for geo, ses in self.data[kind]["se"].items(): + for se in ses: + if not np.isnan(se): + logging.error(f"{geo}, {se} not nan") + return False + return True + + def check_range(self): + """ + Checks that all percentages are within [0, 100]. 
+ + Returns: + true if pass, false otherwise + """ + for kind in ["adj", "nadj"]: + for geo, vals in self.data[kind]["val"].items(): + for val in vals: + if not (0 <= val <= 100): + logging.error(f"{geo}, {val} not in [0, 100]") + return False + return True + + def check_quantity(self): + """Checks how many geographies were generated.""" + n_geos = {} + logging.info(f"geographies generated for {self.level}") + for kind in ["adj", "nadj"]: + for geo, vals in self.data[kind]['val'].items(): + n_geos[geo] = len(vals) + + min_geo = np.min([v for k, v in n_geos.items()]) + max_geo = np.max([v for k, v in n_geos.items()]) + avg_geo = np.mean([v for k, v in n_geos.items()]) + std_geo = np.std([v for k, v in n_geos.items()]) + logging.info(f"\t{kind}" + f"\nmin:\t{min_geo}\nmax:\t{max_geo}" + f"\navg:\t{avg_geo:.2f}\nstd:\t{std_geo:.2f}") + + def get_filled_df(self, loc, kind): + df = pd.DataFrame({"val": self.data[kind]["val"][loc]}, + index=pd.to_datetime(self.data[kind]["dates"][loc])) + + if self.data["first_plot_date"] not in df.index: + df = df.append( + pd.DataFrame({"val": np.nan}, index=[self.data["first_plot_date"]])) + if self.data["last_date"] not in df.index: + df = df.append(pd.DataFrame({"val": np.nan}, index=[self.data["last_date"]])) + df.sort_index(inplace=True) + df = df.asfreq('D', fill_value=np.nan) + return df[df.index > self.data["first_plot_date"]] + + def get_epidata_df(self, loc, kind): + epi_kind = "smoothed_adj_covid19_from_claims" if kind == "adj" else "smoothed_covid19_from_claims" + if self.level == "msa": + loc = int(float(loc)) + + rows = Epidata.covidcast("hospital-admissions", epi_kind, "day", + self.level, self.data["epidata_date_range"], loc) + vals = [] + obs_dates = [] + for row in rows['epidata']: + vals.append(row['value']) + obs_dates.append(row['time_value']) + + obs_dates = [datetime.strptime(str(d), "%Y%m%d") for d in obs_dates] + df = pd.DataFrame({'date': obs_dates, 'val': vals}) + df = df.set_index('date') + return df + + def 
get_county_name(self, fips_code): + """Return name of a county given it's fips code.""" + loc = self.geo[self.geo["FIPS"] == fips_code] + if len(loc) == 0: + return fips_code + return f'{loc["Name"].iloc[0]} County, {loc["State"].iloc[0]}' + + def plot(self, outname): + """ Create PDF plots of the generated values by location. + + Args: + outname: name for the output pdf file + """ + + # start pdf document + pdf_pages = PdfPages(f'{outname}-{self.level}-hosp-claims-plots.pdf') + n_plot = len(self.locs) + n_plots_per_page = 25 + + # init plotting axis and counter + fig, axs = None, None + j = 0 + + for i, loc in enumerate(self.locs): + + # start new page if needed + if i % n_plots_per_page == 0: + fig, axs = plt.subplots(5, 5, figsize=(10, 10), sharex=True) + axs = axs.ravel() + j = 0 + + # plot + adj_ts = self.get_filled_df(loc, "adj") + axs[j].plot(adj_ts.index, adj_ts["val"], label="New (Adj)", color="blue") + + if not self.se: + nadj_ts = self.get_filled_df(loc, "nadj") + axs[j].plot(nadj_ts.index, nadj_ts["val"], label="New", color="green") + + # current data. left unlabeled to clear clutter, but colors correspond to + # the "new" lines. 
only plot first 52 cases (it's rather slow to run otherwise) + if self.level == "state" or \ + ((self.level == "county") and (loc in ["53033", "36061"])): + try: + epi_adj_ts = self.get_epidata_df(loc, "adj") + axs[j].plot(epi_adj_ts.index, epi_adj_ts["val"], + color="lightskyblue", linestyle="--") + if not self.se: + epd_nadj_ts = self.get_epidata_df(loc, "nadj") + axs[j].plot(epd_nadj_ts.index, epd_nadj_ts["val"], + color="lightgreen", linestyle="--") + except: + logging.warning(f"could not retrieve {loc} in epidata, skipping") + + # set title + if self.level == "county": + axs[j].set_title(self.get_county_name(int(loc)), fontsize=10) + else: + axs[j].set_title(loc) + + # set legend and format + if i == 0 or j == 0: + axs[j].legend() + + axs[j].xaxis.set_major_formatter(self.DATE_FORMAT) + axs[j].tick_params(axis='both', which='major', labelsize=5, labelrotation=90) + + # close the page if needed + if (i + 1) % n_plots_per_page == 0 or (i + 1) == n_plot: + plt.tight_layout() + pdf_pages.savefig(fig) + plt.close() + j += 1 + + pdf_pages.close() + logging.info(f"plotted to '{outname}-{self.level}-hosp-claims-plots.pdf'") + + +def run(respath, geo, se, plot): + """Run sanity checks and produce plots. 
+ + Args: + respath: path to result csvs + geo: geo level, one of state, msa, hrr, county + se: boolean whether data includes se or not + plot: boolean whether to plot or not + """ + assert geo in ["state", "msa", "hrr", "county"], f"{geo} is invalid" + + ehc = EMRHospChecks(Path(respath), geo, se) + assert ehc.check_range(), "range failed" + if not se: + assert ehc.check_se_na(), "se is all na failed" + ehc.check_quantity() + if plot: + ehc.plot(str(datetime.today().date())) + logging.info("finished checks") + + +@click.command() +@click.argument('respath') +@click.argument('geo') +@click.option('--se', is_flag=True, default=False) +@click.option('--plot', '-p', is_flag=True, default=False) +def run_cli(respath, geo, se, plot): + logging.basicConfig(level=logging.INFO) + run(respath, geo, se, plot) + + +if __name__ == "__main__": + run_cli() diff --git a/claims_hosp/HospClaims/automate/secrets.py b/claims_hosp/HospClaims/automate/secrets.py new file mode 120000 index 000000000..4ae593b93 --- /dev/null +++ b/claims_hosp/HospClaims/automate/secrets.py @@ -0,0 +1 @@ +../../secrets.py \ No newline at end of file diff --git a/claims_hosp/HospClaims/automate/update_json.py b/claims_hosp/HospClaims/automate/update_json.py new file mode 100644 index 000000000..9ae028570 --- /dev/null +++ b/claims_hosp/HospClaims/automate/update_json.py @@ -0,0 +1,60 @@ +"""Change the json template to run the package.""" + +import argparse +import json +from pathlib import Path + + +def get_hosp(edi_file, geo_dir, out_dir, receiving_dir, se): + """Output the json.params file needed to run the emr_hosp package. 
+ + Args: + edi_file: Path to EDI file (claims) + geo_dir: Dir containing the geo_map files + out_dir: Output dir to put the json file + receiving_dir: Output dir to put the hosp estimates + se: Boolean to output SEs or not + """ + + if se: + weekday = [True] + else: + weekday = [True, False] + + data = { + "common": { + "export_dir": receiving_dir, + "log_exceptions": False + }, + "indicator": { + "input_file": edi_file, + "start_date": None, + "end_date": None, + "drop_date": None, + "n_backfill_days": 70, + "n_waiting_days": 3, + "write_se": se, + "obfuscated_prefix": "wip_henear", + "parallel": True, + "geos": ["state", "msa", "hrr", "county", "hhs", "nation"], + #"geos": ['hhs', 'nation'], + "weekday": weekday + } + } + print(f"Using {edi_file}") + + with open(Path(out_dir) / 'params.json', 'w') as outfile: + json.dump(data, outfile, indent=4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('edi_file') + parser.add_argument('geo_dir') + parser.add_argument('out_dir') + parser.add_argument('receiving_dir') + parser.add_argument('--se', action="store_true") + + args = parser.parse_args() + get_hosp(args.edi_file, args.geo_dir, + args.out_dir, args.receiving_dir, args.se) From 095f792cba1fec75cb630584e79d699c79a0f645 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Thu, 9 Dec 2021 15:25:48 -0500 Subject: [PATCH 03/46] Re-add new secrets after fixing merge conflict with main --- ansible/vault.yaml | 460 +++++++++++++++++++++++---------------------- 1 file changed, 235 insertions(+), 225 deletions(-) diff --git a/ansible/vault.yaml b/ansible/vault.yaml index 6e26c86e7..cede7133f 100644 --- a/ansible/vault.yaml +++ b/ansible/vault.yaml @@ -1,226 +1,236 @@ $ANSIBLE_VAULT;1.1;AES256 -37363765653062643761336537303863333332346238656661646635613262616135333839346662 -3533306333623537356332366433636363643261656133620a663637636661663766376665313639 -61353639386239633430383864646533613364633765636664616134333138376266346462383463 
-3232336263326536350a663637386366636434303465623937633237323638303430346162643261 -38343738333737353637306161373031366563646333666536663535663930323032323465316232 -64663738616334343866386536356136393732656334383737326661613235313937666637383966 -34313364316563613764383063373463663664353966636665663766323937656237326664363037 -39653333613732356535656366306161643266323938393362393261653733383435383335356531 -32663333616438393635633464336431616366363066646463663937383834303165646361646530 -35653839303939623932373532663332353662643134303830383765336132343831393939353538 -32626335613362303063326466343061353532333832333264383538336362383737633738323762 -61633237616238643765363630366433396134626535343463313431383037313764643535353835 -65306336666432656331333839303136373535323265373531363537353733303062373636323033 -63303333306437326431616635646263666166666339353632313134636363646362633434623639 -39346433376436306433373939626239363732376232636337373234373534646464313535333837 -32666538626538316534383531316531333865323262373232643761643035633961386530376235 -30633361333239376463313933343939353630323539666131373761636230623064306531376562 -36393036386137626166363433626330653431636131323836656433636533323937363265616337 -33306163626438636564353838366132623730353931376537323264323564313666623964306335 -65383537303563623732613862366431326530313364386562336136666435663130636136626435 -62336531626537393932376436633664363261316639386237323461343236396633363933643637 -31303534653138336533303566633963333534303964363962626264363062623562386164343564 -64313733666336383733613738666537623238303935353830393530353637363266656465363731 -34616437343262333861396361313730346533646364356461626232376362396639353135643332 -34656532393038656362616665343864393033613637646633343763613564636535643461396661 -36353636313639636661613131366433333866663263303362333665353630656462646663306335 -39653431393864373934393132323333623736663231323233323366313932656161363930353062 
-34313138316134313533643934353237333236623438653734366439616230383264343834343462 -36356538653763656235316463653638666635616638303739316463343262646337313661326330 -32373439386133646134343039303161643863323736613466623538626234323866306265343034 -38656339313136393037386364663263393264376436626639393935386363663862633330613238 -30306563376133613762623365316337363966356633383832666134663034616636633866346264 -65613539613331386337633731373261336161346532633264373632376163386333653161666432 -34386533343238303431363236303062393564303438633633313330323564353733323163353935 -64656238663735343839666430366334336363363262353732333035663832353262653065613632 -39316261646366626138333639343263363966653965323265373266363438323733363461333433 -62396533313139333834323431323162353336643733373662373335616138333665383536326232 -34363235643731613663386361353533333563323934396362323738663534376632356635303065 -65366366616137656430626233653538656135623166623333623638303936313231393461393139 -36623330386332653932613761353738636531643437353663633463356537383431373764653930 -64636232613163343665653037663538326163313665383632633930323937383133653232366361 -37356331373762626638343439343136303738353062386435663636346165653963333761396533 -63353565366561653839353136306333313366613738353837616237623162653363393066623061 -63613536333061613638376333313463636666396236353337323363323263383133386166316362 -64343538646334356637663936373261633733303337353533393364326465343133663064336366 -34656463326333643531333064313763343738666466333535616232613539623438363961626332 -33313164613433346566633861666462343939663834366636326536613234326339646330376534 -62306637623233313739393630623033383366616239333036316132643730326234383135396332 -30303133303561623534363339303762663566366664616433313966336532373135633536613333 -38343934626230646334343062356431656466663665663466326331383237343439313333303561 -64306330383363306537336466623934363563363535326161653630303461343461663365343063 
-33386263306338323862326336376234363061366530303436633366366466663464623235366265 -63383438383933656264323162373262323532303431363535663965356366323663396566396234 -33313535313133303866636266326535316466383839653662396531616361613561393864383239 -33636264383964616331363866633862666630346331393630356464643833333938653865393832 -35393334373737626233333466333835386263343039613362633333653538303633646162306163 -37663962646133326438396630626132613266663731363436616536616162373439663537336431 -30376636323637616437376431356135363562376530303333343465366533623331663139616538 -61356233633431663464336265323964313138393335633063636565383066616339313936393534 -30643032633836376362616362333536316139323238366230653636663539653666306337313833 -66313164346332333539383034333235323335323062633465353237613161643338356562373634 -31353665363336323462666335306435623938356539613231356535323138653431646364393861 -32626233653932383831313035336438643635613264346266393334303530396562383961336530 -64663430376561663232616663343531656538643166363261333337303836623264623838303065 -66393765383462623661633862346261323464346665613961363833306666656263633733303230 -30353532613965316538643435313234323264353963656566653761323830336231363037646132 -35356130653038346664306266643261666163313964353035393466646536646165666361346231 -65613239396235353064366231353939323062623065306163643530626262326566356331333465 -36386134666135396165366132636263383537313535613663663833396538613039343064376334 -64323364613431613962396334653165363937313265663864643630343538666237623062393633 -35306265313962616362616561643633653239356338653030653362623932666264626166366136 -37616661643464373837656237366430343561653533393530366238616537333061353366623833 -34346335646436633430316563636466323466663433613230376636326232306633393462653162 -37383638646234363764666364616635333039636133326561323765353939363732366434626639 -66666565623464653561363534316565633939666261376137623966326338643937383732613931 
-35623131643937346461373835366436386264636333333165666639646338346632653035313238 -39666632616262663232306637646239303436366662333264383231323565363164383561303039 -63313964663564313335623639656563613136323666663930646332356538313839323463633235 -64613966633237323638366236636338633236396565303063363864666639303737336663646233 -64363737646564613331353534396432383433653263643730323337306566336631333736326466 -30366339373161316438636434626137653130363932303134663162373466666362373132303331 -61643933663531663335316137636436613361623233356430333430343833626162383266626239 -62383836313438343733623762303166333836623736346233356565353136396138376366343663 -35663134666136633639333031353066666135343661656536393536363866393562653061343439 -34633231396666663234303431323532336262363637376436363430636436656266363266616137 -31353635383833623862373333306562623966376235643935323538353939376633313437663665 -36363134313931633730626431393664643936333532346166333530613637626336333466643233 -33316564393162383536366531623538343961376166653630393062323134646136393339393937 -36383432663362623932623865386330663239663563333965373238353763343036636236346335 -61346330383639353961613161666364616234656638616465343462363138383338613034363938 -38383966313132663931613838323339363565323264336431333263653765323132636237373764 -38303434353862333465393334623635323334333132386464386666306630376537373762623133 -39346531326331353838666434323735313434333239643938333763313166343131656336316439 -32333463343338643562623634653737393333346432376530356333316338636436643236616663 -31383336373262353062623431323061643861396333656464623635336334663836353732376639 -62636536336265363631636262333461366366393137666538346239353364616236326366636435 -63323336333735326338303263333935386261353735643662303064616564393233326635303939 -33636530386438316466313138323439373632303439633065393261393865396265653338623963 -39353935663861333165343930643865383166373264373534353166633633653338303830333464 
-32623838353838386238303461376435333237653963343539356434646532323032353733633464 -39336132323666323533316433376137656463616436383035366630396637333662326535623265 -34663961323530656135616233626138646665373539356266346565363932616133343066316338 -33663663373732666431333963313464336635303138346364303462376366353066323938333331 -35373863353536656462643238373936306365653361326539366462616230626165323331333262 -37356365623934303062666361636563306266356239666264326463343038393636303166353636 -31383835326337343635313330303962666636363166666237663565393235643436316464616464 -32363738666630333232326462313234663734333335653933636236613932633938353265326561 -34636632373336363061316662646132373462636337303233373733353030623637663561366637 -31366632623563323337626265373832386466386230343039356234363136383533373437343866 -32323234356139393935616334623061613431336266353937636636373138613335346265373261 -34393137323232326332373535343930643764666331373937333765353738303964666131643663 -61333136623834383030343431613863353634626631396637643630373737666635643435316131 -36306130316434356534613433626334323831666136383065633632633962313238373935613336 -33323933336235333562333264626531666436313331393362393532363936323362613633626464 -38326561623333316432663466376336323764663336336563383232666434636332646634363463 -61393432326638316664613638323466656235363137303562376436653439623931363766313631 -63663032313436386666303463363265313262623366616332373165646631333539313933663533 -64346533383661346563666132363163663762343664656232336436646434663236613436626461 -37643937626366643235323062353632376137343334303235383337346565333039346132633462 -66303733386336376666353735396238653930363833306133313736653233663436623462636161 -36613136626365663936336338626139323631646462316335613163633030336165613730383765 -61666134346234366633353264353338333734353239643632636434333863356131616230303862 -39373931353165393135343164613561393430343763333330353736326266313439656338366464 
-63633234643635383262613533306434633064393039393431326464353638623235663537633665 -38663165393765303464306662363532363231383265393134303136626632346434623562363438 -61646165383537353337373739623866306133646461376261323565306530613136376461343134 -35363862376534383764616665373061393436616165666531373930616130326166396335646136 -34303833336236613537343033636563386139393432633234303838343230613638633336623435 -63653735353365346438663933663039653232356332363836653864383634616664396362613036 -62656532356363366234383735313464633432666437623866333231373838353861386336663035 -32303930396433323432366535656436346538346363363839373532396462393864393731643132 -63633036623461616431303132623264346330613962656161323465643961346461393861646433 -34323030643763653239656131353666633334633134376230656533396564666131613038336464 -62356264383661376133336566636431636635353063353561333266373437383164336366393762 -36656532363335316266393265643961336135666434373364653938326634656631613535383938 -38646135303136646639386230386135636634656135373832393736623363653234313932623034 -66633430366561386165396433306566663566613333333865646435383363323334373663663364 -64633865646365666233326264363236623330633762376632633965653935653533663333356134 -34646264303966613837666435343661383935386662386433343661653866313535613138613166 -62386633373561303437333162613830343539623034356563376362323134313635633033313034 -64373463326464313438656364383262653465343238636164326337373664653865616363656230 -35343630313732636433636336346332356366626234303163366238353164313563616162663430 -30353736646563646332393632333939613335393761323763316335346166623665623334363561 -31333639396337383133336431376366333535353566333665353238303765313763393930393930 -65633435366564616161343632356231353033326365313232363636393530356132636163353731 -36383563323938633063313931313636366433383735326538383533633033323133386135623939 -37363530343437353035626464326136313863633335303735333839616161633236303662633930 
-31356363656433303431643531643230646634303363336230313838383931373365383539376331 -39316630373662633533303936623035663635343963393134636630633165636164343265333531 -66633265633133303463616637373832353463646233356636303736346436373934346536353765 -35613564663332663333343235643462653137653232633565363338356630666161393765306132 -39353035396361356266613535653531613565343932306262666531613731353434343861343632 -34623439353839626531326161303230336661616335656365646366623763386235393062303333 -61646261396430376134613631663566623832396437363331653062333566653236383862616631 -31356332356434393834366637366539316466363466333362643830616439303034646538616134 -64356638633531663537386164663030386435646639626664326137383663346666316536376434 -64313063633864396533323831333866336162333963666639653161633762616234363334663539 -66343262316135353261306161323062343166386662633635633936663337356238643933306530 -34643335396564363530366330346466633364613365333230613161343233333331393963656537 -30636538303563666339356362303962316466323937373130373835623961656539396364666435 -61373933363438323333636666646262393935663563323734636535313935393531363364343763 -61313132376236616666353364383636666461323038383332643131326536333466633365363039 -39386537643466636333653862373637666337366363323962633334373434323263616430393530 -34633363353639306664396461373739343331356132643861306636326631376132336430336236 -38646339363163623336326164363133636539613134303635303833316632376162656632373163 -65346231663538316534393238633364343631383438663539323633396632376465643630323232 -33386266353065653766393937366430306635363734323634383931346330366533383234646434 -63376232653434643164616366623230653637623434646130663432636630663364396161623362 -37313564383232383365616539653936383636653439343239303731666136306530626664306466 -39343761666464356435643437666338653037653332666463653336303136396632396235356538 -63623236626236613734633633396137313264376139656163306238653830306665393065366536 
-38393238633262656638636661666537343638643538633538663730316565393336353562366135 -32336331303765313033643439363034613633313831643832303438346238316239653337653663 -63613038306332323334613062303361363865346134653765323837313135346561383931303530 -32346438633134393563363730313931373431346639633437313038303632663234383334303734 -38386532393831653763356534336135306339653265386532346530623236323933396138313066 -31343162616233333639656665336133343439356161336166333731626437613961663233313237 -37383433633936623733383565633131633433366533303739613862313765396634663832313830 -38333831643730653135326432353531663936613061313337376365386437383838663662346232 -31363563373433373862353439633330626330633838663131653633663933333066623366373465 -39323331303638376461326331643663663833646263383238313664303431613831623637653835 -62326461633762356365383665326162323536386631623738643061353066343461393530393661 -64333261383834353734633031363635366532333338376337643263343531353530343233356531 -39663439326638633361306463393430336666386334616239363933656563333565383963636464 -37656266366539663934396330666464666430376139303235643938663865386130383063623466 -34383564366230376534613232363231333438386534616634633639323035386639383635636535 -33326532326638333833353761626334646663373666396566653337323764353662316235323834 -61643434393039313731323334353631333261343339343130633438333430313231363163336265 -30356539663131353831343631353931663064373064383735383138626165323563326339663534 -61306330343163356532373062643232663631376238313939643062363332613934636131373832 -61613936333366363936623631623432636563326638653264323164643366396361326631623736 -37636665336633653161393362353565383037326130626366653265656561386266643035383865 -35386662376466626466633762316238306234646336336630323562353437623164373031663732 -39323430326663336137643532613962666537363832333034323635386462626635323664333265 -37393833353937366364653066333330386162646664363833336162643237363165323432326664 
-36336236306437356234356535663632353531346533613430326430616134313963353139393936 -66353636656135323635356663666465343834363037386132376432373639333564316336353766 -31653766326332663236643932613933393562363437613530613236666364393335336131336163 -65636234326131363264666236656562306539306336626366333463386465383064343265646432 -39353538653563303662396461383630383262636132353036363630393666326637346239363435 -37396132663963633232393163326437363331643539353061383232323238326462316132363033 -62323563363665336334383066373866636238663337316534363061333965653834633166383334 -37376564633939383733303636366639626239613230306238393064643136633966343735386561 -37393633626633653935396531623735666564633761326437383532386366303662636366386337 -39333764326661343035373039646336633539643434626166656232393433633365373332373562 -61303766646165313262626163373134363034356236633764616562366566333836663266616336 -65333030303861643961386430323965356464336536666163643034326664353566613861393338 -66383335326130643963336232636230376436386238643964636630613063376538393963363461 -31613430383363353037366363393666646635306662626362373865323036383636623531623166 -66626135313232376562666335653662376136313362653133623731663537636335353130313230 -63343665346564663530363930373066626437313131336631373033666334346637646165623964 -39623164613039393533656430343331396262633431323038323966616437666265666239326433 -33666366393833373964326530626137633763663337363665306133373930366362343633316162 -61383538666266613835623364306133333037353134633133346264333439376463636436383330 -65396231393438363731613732663631386162373439326634663133396533363432356334616533 -34353166306632613063646339666636323366653262363230323561366431393730663439326232 -61336130363533366431333437653736326236323461353530363237613166316436656262356339 -33613134323937323732346461306563343762333936613131353364353132623237303433393636 -65656565323737366265383964613730613132663562383534396633303739346462633761303437 
-37656536663537353738326663623862623161356536643733343361313737613635656633626536 -37633964326334356139643431303933306634383139663863613635353462616335653330326132 -32313161613835333039306465363436393433346334333339303661383431313631343162643462 -32396163313535653930346236396337393531313061656432623439613637323366393939343332 -33376639343333343133396533353034643733366564643738376263653435623739333066306133 -6434 \ No newline at end of file +34613064323664616266326436376330366432336666656438346663383165363865363966343266 +6636656139316131663836633137613730393836666437330a343930613438623366393130653964 +61366435633630353363333631326566663865376462326231643930336435336132346233663934 +3135643864366232380arom b70e0bb8b72824c3fa3681c27dacd01eba73c54f Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Thu, 9 Dec 2021 16:29:53 -0500 Subject: [PATCH 04/46] Register template and template-secrets in case we have both --- ansible/ansible-deploy-staging.yaml | 4 ++-- ansible/ansible-deploy.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/ansible-deploy-staging.yaml b/ansible/ansible-deploy-staging.yaml index f08839800..d26ff10cf 100644 --- a/ansible/ansible-deploy-staging.yaml +++ b/ansible/ansible-deploy-staging.yaml @@ -29,7 +29,7 @@ - name: Check to see if we have a secrets template to send. local_action: stat path="templates/{{ indicator }}-secrets-prod.py.j2" - register: template + register: template-secrets - name: Set production params file. copy: @@ -53,4 +53,4 @@ dest: "{{ indicators_runtime_dir }}/{{ indicator }}/secrets.py" owner: "{{ runtime_user }}" group: "{{ runtime_user }}" - when: template.stat.exists + when: template-secrets.stat.exists diff --git a/ansible/ansible-deploy.yaml b/ansible/ansible-deploy.yaml index 9863639ed..952347021 100644 --- a/ansible/ansible-deploy.yaml +++ b/ansible/ansible-deploy.yaml @@ -29,7 +29,7 @@ - name: Check to see if we have a secrets template to send. 
local_action: stat path="templates/{{ indicator }}-secrets-prod.py.j2" - register: template + register: template-secrets - name: Set production params file. copy: @@ -53,4 +53,4 @@ dest: "{{ indicators_runtime_dir }}/{{ indicator }}/secrets.py" owner: "{{ runtime_user }}" group: "{{ runtime_user }}" - when: template.stat.exists + when: template-secrets.stat.exists From ab1c2b851e6fd9a3e5631d506d47de6200e244f2 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Fri, 10 Dec 2021 07:32:29 -0500 Subject: [PATCH 05/46] Update claims_hosp/HospClaims/automate/update_json.py Co-authored-by: Katie Mazaitis --- claims_hosp/HospClaims/automate/update_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/HospClaims/automate/update_json.py b/claims_hosp/HospClaims/automate/update_json.py index 9ae028570..093cee1c6 100644 --- a/claims_hosp/HospClaims/automate/update_json.py +++ b/claims_hosp/HospClaims/automate/update_json.py @@ -6,7 +6,7 @@ def get_hosp(edi_file, geo_dir, out_dir, receiving_dir, se): - """Output the json.params file needed to run the emr_hosp package. + """Output the json.params file needed to run the claims_hosp package. 
Args: edi_file: Path to EDI file (claims) From 419303848954c36897c660f7a38889b36e56f99b Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Tue, 3 May 2022 09:39:33 -0400 Subject: [PATCH 06/46] Remove unused lines --- .../HospClaims/automate/agg_claims_drops.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/claims_hosp/HospClaims/automate/agg_claims_drops.py b/claims_hosp/HospClaims/automate/agg_claims_drops.py index 28ba97c1a..6fe56dfb3 100644 --- a/claims_hosp/HospClaims/automate/agg_claims_drops.py +++ b/claims_hosp/HospClaims/automate/agg_claims_drops.py @@ -63,47 +63,6 @@ def agg_and_write(data_path, force=True): dfs.to_csv(out_path, index=False) print(f"Wrote {out_path}") - # matches = defaultdict(list) - # for i, f in enumerate(files): - # drop_datetime = f.name.split("_")[3:] - # if len(drop_datetime) > 2: # there is a group number - # group, date, time = drop_datetime - # dateid = date + time - # matches[dateid].append(i) - # - # for match, file_idxs in matches.items(): - # # check if file exists before writing - # out_name = files[file_idxs][0].name.split("_") - # out_name = '_'.join(out_name[:3] + out_name[4:]) - # out_path = files[file_idxs][0].parents[0] / out_name - # if out_path.exists() and not force: - # # print(f"{out_path} exists, skipping") - # continue - # - # dfs = [pd.read_csv(files[i], dtype={"PatCountyFIPS": str, - # "patCountyFIPS": str}) for i in file_idxs] - # n_rows = [a.shape[0] for a in dfs] - # dfs = pd.concat(dfs) - # if "servicedate" in dfs.columns: - # dfs.rename(columns={"servicedate": "ServiceDate"}, inplace=True) - # if "patCountyFIPS" in dfs.columns: - # dfs.rename(columns={"patCountyFIPS": "PatCountyFIPS"}, inplace=True) - # if "patHRRname" in dfs.columns: - # dfs.rename(columns={"patHRRname": "Pat HRR Name"}, inplace=True) - # if "patAgeGroup" in dfs.columns: - # dfs.rename(columns={"patAgeGroup": "PatAgeGroup"}, inplace=True) - # if "patHRRid" in dfs.columns: - # dfs.rename(columns={"patHRRid": "Pat HRR 
ID"}, inplace=True) - # - # assert np.sum( - # dfs.duplicated(subset=["ServiceDate", "PatCountyFIPS", - # "Pat HRR Name", "PatAgeGroup"])) == 0, \ - # "Duplication across drops!" - # assert dfs.shape[1] == 10, "Wrong number of columns" - # assert sum(n_rows) == dfs.shape[0], "Sum of rows is incorrect" - # - # safe_out(out_path, dfs, force) - @click.command() @click.argument('data_path') From 17b85d5825805f16be33f9a76dd06d1640165e34 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Tue, 3 May 2022 12:29:15 -0400 Subject: [PATCH 07/46] Update claims_hosp/HospClaims/automate/update_json.py Co-authored-by: Katie Mazaitis --- claims_hosp/HospClaims/automate/update_json.py | 1 - 1 file changed, 1 deletion(-) diff --git a/claims_hosp/HospClaims/automate/update_json.py b/claims_hosp/HospClaims/automate/update_json.py index 093cee1c6..3bdcf5b3b 100644 --- a/claims_hosp/HospClaims/automate/update_json.py +++ b/claims_hosp/HospClaims/automate/update_json.py @@ -37,7 +37,6 @@ def get_hosp(edi_file, geo_dir, out_dir, receiving_dir, se): "obfuscated_prefix": "wip_henear", "parallel": True, "geos": ["state", "msa", "hrr", "county", "hhs", "nation"], - #"geos": ['hhs', 'nation'], "weekday": weekday } } From a631d77f0264742b0ccb09aab185082ce0dc736f Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Tue, 3 May 2022 12:34:51 -0400 Subject: [PATCH 08/46] In light of #1419 being merged, we can change to 70 dates --- claims_hosp/HospClaims/automate/ftp_to_covidcast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/claims_hosp/HospClaims/automate/ftp_to_covidcast.py b/claims_hosp/HospClaims/automate/ftp_to_covidcast.py index 0333f027c..2356783c3 100644 --- a/claims_hosp/HospClaims/automate/ftp_to_covidcast.py +++ b/claims_hosp/HospClaims/automate/ftp_to_covidcast.py @@ -11,8 +11,8 @@ # first party from secrets import covidcast -NUM_FILES = 71*6*2 # expect (71 dates x 6 geos x 2 signals) -NUM_SE_FILES = 71*6*1 # expect (71 dates x 6 geos x 1 signals) +NUM_FILES = 
70*6*2 # expect (70 dates x 6 geos x 2 signals) +NUM_SE_FILES = 70*6*1 # expect (70 dates x 6 geos x 1 signals) class AllowAnythingPolicy(paramiko.MissingHostKeyPolicy): From 126f6e1ebd7dbf72b41d2b72c14084b00f039137 Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Tue, 3 May 2022 13:29:34 -0400 Subject: [PATCH 09/46] Remove unused line --- claims_hosp/HospClaims/automate/hosp_claims_master_script.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh b/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh index e3230a5c2..836709bf4 100755 --- a/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh +++ b/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh @@ -1,6 +1,5 @@ #!/bin/sh set -o errexit -#set -o nounset set -o pipefail BASE="/home/indicators/runtime/claims_hosp/HospClaims" From d9b087f593030a5791a5a57fd694c66cfe5e96a6 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 07:52:38 -0400 Subject: [PATCH 10/46] add scripts and modification required for pulling data --- .../delphi_claims_hosp/agg_claims_drops.py | 61 +++++++++ .../download_claims_ftp_files.py | 121 ++++++++++++++++++ .../get_latest_claims_name.py | 35 +++++ claims_hosp/delphi_claims_hosp/run.py | 19 ++- claims_hosp/params.json.template | 1 + claims_hosp/setup.py | 1 + 6 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 claims_hosp/delphi_claims_hosp/agg_claims_drops.py create mode 100644 claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py create mode 100644 claims_hosp/delphi_claims_hosp/get_latest_claims_name.py diff --git a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py new file mode 100644 index 000000000..5b3a2967e --- /dev/null +++ b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +"""Aggregates chunks of drops. 
+ +Drops are expected to be numbered as: + +../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz +../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz +... etc. +""" + +# standard +from pathlib import Path + +# third party +import numpy as np +import pandas as pd + + +def agg_and_write(data_path, logger): + """ + Aggregate drops given a folder path. Will output an aggregated version in the + same folder. Example below. + + Input files in folder: + ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz + ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz + + Will create: + ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_07052020_1456.csv.gz + + Args: + data_path: path to the folder with duplicated drops. + force: if aggregated file exists, whether to overwrite or not + """ + + files = np.array(list(Path(data_path).glob("*"))) + + for f in files: + out_path = f.parents[0] / f.name + dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, + "patCountyFIPS": str}) + if "servicedate" in dfs.columns: + dfs.rename(columns={"servicedate": "ServiceDate"}, inplace=True) + if "patCountyFIPS" in dfs.columns: + dfs.rename(columns={"patCountyFIPS": "PatCountyFIPS"}, inplace=True) + if "patHRRname" in dfs.columns: + dfs.rename(columns={"patHRRname": "Pat HRR Name"}, inplace=True) + if "patAgeGroup" in dfs.columns: + dfs.rename(columns={"patAgeGroup": "PatAgeGroup"}, inplace=True) + if "patHRRid" in dfs.columns: + dfs.rename(columns={"patHRRid": "Pat HRR ID"}, inplace=True) + + assert np.sum( + dfs.duplicated(subset=["ServiceDate", "PatCountyFIPS", + "Pat HRR Name", "PatAgeGroup"])) == 0, \ + "Duplication across drops!" 
+ assert dfs.shape[1] == 10, "Wrong number of columns" + + dfs.to_csv(out_path, index=False) + logger.info(f"Wrote {out_path}") diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py new file mode 100644 index 000000000..73132ce53 --- /dev/null +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +"""Downloads files modified in the last 24 hours from the delphi ftp server.""" + +# standard +import datetime +import functools +from os import path + +# third party +from secrets import claims +import paramiko + + +class AllowAnythingPolicy(paramiko.MissingHostKeyPolicy): + """ + Class for missing host key policy. + """ + def missing_host_key(self, client, hostname, key): + """ + Function for the missing host key. + """ + return + + +def print_callback(filename, bytes_so_far, bytes_total, logger): + """ + Print the callback information. + """ + rough_percent_transferred = int(100 * (bytes_so_far / bytes_total)) + if (rough_percent_transferred % 25) == 0: + logger.info(f'{filename} transfer: {rough_percent_transferred}%') + + +def get_timestamp(name): + """ + Get the reference date in datetime format. + """ + try: + split_name = name.split("_") + yyyymmdd = split_name[3] + hhmm = ''.join(filter(str.isdigit, split_name[4])) + timestamp = datetime.datetime.strptime(''.join([yyyymmdd, hhmm]), + "%Y%m%d%H%M") + except Exception: + timestamp = datetime.datetime(1900, 1, 1) + + return timestamp + + +def flip_MMDDYYYY_to_DDMMYYYY(name): + """ + Flip date from MMDDYYYY to DDMMYYYY. + """ + # flip date from MMDDYYYY to DDMMYYYY + split_name = name.split("_") + date = split_name[4] + flip_date = date[2:4] + date[:2] + date[4:] + split_name[4] = flip_date + name = '_'.join(split_name) + return name + + +def flip_YYYYMMDD_to_DDMMYYYY(name): + """ + Flip date from DDMMYYYY to MMDDYYYY. 
+ """ + split_name = name.split("_") + date = split_name[3] + flip_date = date[6:] + date[4:6] + date[:4] + split_name[3] = flip_date + name = '_'.join(split_name) + return name + + +def download(out_path, logger): + """ + The main function to pull the latest raw files. + """ + current_time = datetime.datetime.now() + seconds_in_day = 24 * 60 * 60 + logger.info(f"current time is {current_time}") + + # open client + client = paramiko.SSHClient() + client.set_missing_host_key_policy(AllowAnythingPolicy()) + + client.connect(claims.HOST, + username=claims.USER, password=claims.PASS, port=claims.PORT) + sftp = client.open_sftp() + sftp.chdir('/hosp/receiving') + + + # go through files in recieving dir + files_to_download = [] + for fileattr in sftp.listdir_attr(): + # file_time = datetime.datetime.fromtimestamp(fileattr.st_mtime) + file_time = get_timestamp(fileattr.filename) + time_diff_to_current_time = current_time - file_time + if 0 < time_diff_to_current_time.total_seconds() <= seconds_in_day: + files_to_download.append(fileattr.filename) + + # make sure we don't download more that the 3 chunked drops (2x a day) for OP + # and the 1 chunk (2x a day) for IP - 01/07/21, *2 for multiple day drops + assert len(files_to_download) <= 2 * ((3 * 2) + 2), "more files dropped than expected" + + filepaths_to_download = {} + for file in files_to_download: + flipped_file = flip_YYYYMMDD_to_DDMMYYYY(file) + if "INPATIENT" in file: + full_path = path.join(out_path, flipped_file) + if path.exists(full_path): + logger.info(f"{flipped_file} exists, skipping") + else: + filepaths_to_download[file] = full_path + + # download! 
+ for infile, outfile in filepaths_to_download.items(): + callback_for_filename = functools.partial(print_callback, infile, logger) + sftp.get(infile, outfile, callback=callback_for_filename) + + client.close() diff --git a/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py b/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py new file mode 100644 index 000000000..c7f0dcad5 --- /dev/null +++ b/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +"""Return the latest drop.""" + +# standard +import datetime +from pathlib import Path + +def get_latest_filename(dir_path, logger): + """ + Get the latest filename from the list of downloaded raw files. + """ + current_date = datetime.datetime.now() + files = list(Path(dir_path).glob("*")) + + latest_timestamp = datetime.datetime(1900, 1, 1) + latest_filename = None + for file in files: + split_name = file.name.split("_") + if len(split_name) == 5: + ddmmyyyy = split_name[3] + hhmm = ''.join(filter(str.isdigit, split_name[4])) + timestamp = datetime.datetime.strptime(''.join([ddmmyyyy, hhmm]), + "%d%m%Y%H%M") + if timestamp > latest_timestamp: + if timestamp <= current_date: + latest_timestamp = timestamp + latest_filename = file + + assert current_date.date() == latest_timestamp.date(), "no drop for today" + + # write to stdout for shell script to use + logger.info(latest_filename) + + # return for other uses + return latest_filename diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index 58cba1c56..4be8b71d9 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -15,6 +15,9 @@ # first party from .config import Config +from .download_claims_ftp_files import download +from .agg_claims_drops import agg_and_write +from .get_latest_claims_name import get_latest_filename from .update_indicator import ClaimsHospIndicatorUpdater @@ -31,7 +34,7 @@ def run_module(params): - "log_exceptions" 
(optional): bool, whether to log exceptions to file. - "log_filename" (optional): str, name of file to write logs - "indicator": - - "input_file": str, optional filenames to download. If null, + - "input_dir": str, directory to downloaded raw files. If null, defaults are set in retrieve_files(). - "start_date": str, YYYY-MM-DD format, first day to generate data for. - "end_date": str or null, YYYY-MM-DD format, last day to generate data for. @@ -53,11 +56,21 @@ def run_module(params): __name__, filename=params["common"].get("log_filename"), log_exceptions=params["common"].get("log_exceptions", True)) + # pull latest data + download(params["indicator"]["input_dir"], logger) + + # aggregate data + agg_and_write(params["indicator"]["input_dir"], logger) + + # find the latest files (these have timestamps) + claims_file = get_latest_filename(params["indicator"]["input_dir"], + params["indicator"]["write_se"], logger) + # handle range of estimates to produce # filename expected to have format: EDI_AGG_INPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz if params["indicator"]["drop_date"] is None: dropdate_dt = datetime.strptime( - Path(params["indicator"]["input_file"]).name.split("_")[3], "%d%m%Y") + Path(claims_file).name.split("_")[3], "%d%m%Y") else: dropdate_dt = datetime.strptime(params["indicator"]["drop_date"], "%Y-%m-%d") @@ -114,7 +127,7 @@ def run_module(params): signal_name ) updater.update_indicator( - params["indicator"]["input_file"], + claims_file, params["common"]["export_dir"], logger, ) diff --git a/claims_hosp/params.json.template b/claims_hosp/params.json.template index d6df27ed3..34e13a63d 100644 --- a/claims_hosp/params.json.template +++ b/claims_hosp/params.json.template @@ -5,6 +5,7 @@ }, "indicator": { "input_file": "./tests/test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz", + "input_dir": "./retrieve_files", "start_date": "2020-02-01", "end_date": null, "drop_date": null, diff --git a/claims_hosp/setup.py b/claims_hosp/setup.py index 
940e1d165..d7e46a13d 100644 --- a/claims_hosp/setup.py +++ b/claims_hosp/setup.py @@ -4,6 +4,7 @@ required = [ "numpy", "pandas", + "paramiko", "pydocstyle", "pytest", "pytest-cov", From 0e59cd1ba4627e74a7ba065fbafed786487d514e Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 08:01:40 -0400 Subject: [PATCH 11/46] rm downloaded files at the end --- claims_hosp/delphi_claims_hosp/run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index 4be8b71d9..d0ec4fc71 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -7,6 +7,7 @@ # standard packages import time +import os from datetime import datetime, timedelta from pathlib import Path @@ -133,6 +134,8 @@ def run_module(params): ) logger.info("finished updating", geo = geo) + os.system(f'rm -rf {params["indicator"]["input_dir"]}') + elapsed_time_in_seconds = round(time.time() - start_time, 2) logger.info("Completed indicator run", elapsed_time_in_seconds = elapsed_time_in_seconds) From c927e00b55e592b58450b49ac50b1268bdda271b Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 08:03:12 -0400 Subject: [PATCH 12/46] add logger info for it --- claims_hosp/delphi_claims_hosp/run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index d0ec4fc71..43621937d 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -133,8 +133,10 @@ def run_module(params): logger, ) logger.info("finished updating", geo = geo) - + + # Remove all the raw files os.system(f'rm -rf {params["indicator"]["input_dir"]}') + logger.info('Remove all the raw files.') elapsed_time_in_seconds = round(time.time() - start_time, 2) logger.info("Completed indicator run", From c69cc2f37c067cbe551d3de6719bc4cb10dcd8c5 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: 
Mon, 6 Jun 2022 18:56:11 -0400 Subject: [PATCH 13/46] fixed errors in pulling --- .../delphi_claims_hosp/agg_claims_drops.py | 2 ++ .../download_claims_ftp_files.py | 27 +++++-------------- claims_hosp/delphi_claims_hosp/run.py | 8 +++--- claims_hosp/params.json.template | 9 +++++-- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py index 5b3a2967e..accee88e3 100644 --- a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py +++ b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py @@ -37,6 +37,8 @@ def agg_and_write(data_path, logger): files = np.array(list(Path(data_path).glob("*"))) for f in files: + if ".csv.gz" not in str(f): + continue out_path = f.parents[0] / f.name dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, "patCountyFIPS": str}) diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index 73132ce53..b173d53a6 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -7,7 +7,6 @@ from os import path # third party -from secrets import claims import paramiko @@ -22,7 +21,7 @@ def missing_host_key(self, client, hostname, key): return -def print_callback(filename, bytes_so_far, bytes_total, logger): +def print_callback(filename, logger, bytes_so_far, bytes_total): """ Print the callback information. """ @@ -46,23 +45,9 @@ def get_timestamp(name): return timestamp - -def flip_MMDDYYYY_to_DDMMYYYY(name): - """ - Flip date from MMDDYYYY to DDMMYYYY. - """ - # flip date from MMDDYYYY to DDMMYYYY - split_name = name.split("_") - date = split_name[4] - flip_date = date[2:4] + date[:2] + date[4:] - split_name[4] = flip_date - name = '_'.join(split_name) - return name - - def flip_YYYYMMDD_to_DDMMYYYY(name): """ - Flip date from DDMMYYYY to MMDDYYYY. + Flip date from YYYYMMDD to MMDDYYYY. 
""" split_name = name.split("_") date = split_name[3] @@ -72,7 +57,7 @@ def flip_YYYYMMDD_to_DDMMYYYY(name): return name -def download(out_path, logger): +def download(ftp_credentials, out_path, logger): """ The main function to pull the latest raw files. """ @@ -84,8 +69,10 @@ def download(out_path, logger): client = paramiko.SSHClient() client.set_missing_host_key_policy(AllowAnythingPolicy()) - client.connect(claims.HOST, - username=claims.USER, password=claims.PASS, port=claims.PORT) + client.connect(ftp_credentials["host"], + username=ftp_credentials["user"], + password=ftp_credentials["pass"], + port=ftp_credentials["port"]) sftp = client.open_sftp() sftp.chdir('/hosp/receiving') diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index 43621937d..cd03311aa 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -58,14 +58,14 @@ def run_module(params): log_exceptions=params["common"].get("log_exceptions", True)) # pull latest data - download(params["indicator"]["input_dir"], logger) + download(params["indicator"]["ftp_credentials"], + params["indicator"]["input_dir"], logger) # aggregate data agg_and_write(params["indicator"]["input_dir"], logger) # find the latest files (these have timestamps) - claims_file = get_latest_filename(params["indicator"]["input_dir"], - params["indicator"]["write_se"], logger) + claims_file = get_latest_filename(params["indicator"]["input_dir"], logger) # handle range of estimates to produce # filename expected to have format: EDI_AGG_INPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz @@ -133,7 +133,7 @@ def run_module(params): logger, ) logger.info("finished updating", geo = geo) - + # Remove all the raw files os.system(f'rm -rf {params["indicator"]["input_dir"]}') logger.info('Remove all the raw files.') diff --git a/claims_hosp/params.json.template b/claims_hosp/params.json.template index 34e13a63d..e200fa8fc 100644 --- a/claims_hosp/params.json.template +++ 
b/claims_hosp/params.json.template @@ -4,7 +4,6 @@ "log_exceptions": false }, "indicator": { - "input_file": "./tests/test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz", "input_dir": "./retrieve_files", "start_date": "2020-02-01", "end_date": null, @@ -15,7 +14,13 @@ "obfuscated_prefix": "foo_obfuscated", "parallel": false, "geos": ["state", "msa", "hrr", "county"], - "weekday": [true, false] + "weekday": [true, false], + "ftp_credentials": { + "host": "", + "user": "", + "pass": "", + "port": 2222 + } }, "validation": { "common": { From 40411f84bb66c009535e808e586d90373ccda2b2 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 19:21:42 -0400 Subject: [PATCH 14/46] add unit tests --- .../tests/test_download_claims_ftp_files.py | 21 ++++++++++++++++++ .../tests/test_get_latest_claims_name.py | 22 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 claims_hosp/tests/test_download_claims_ftp_files.py create mode 100644 claims_hosp/tests/test_get_latest_claims_name.py diff --git a/claims_hosp/tests/test_download_claims_ftp_files.py b/claims_hosp/tests/test_download_claims_ftp_files.py new file mode 100644 index 000000000..a1f4df192 --- /dev/null +++ b/claims_hosp/tests/test_download_claims_ftp_files.py @@ -0,0 +1,21 @@ +# standard +import datetime + +# third party +import numpy as np + +# first party +from delphi_claims_hosp.download_claims_ftp_files import (flip_YYYYMMDD_to_DDMMYYYY, + get_timestamp) + + +class TestDownloadClaimsFtpFiles: + + def test_flip_YYYYMMDD_to_DDMMYYYY(self): + name = "SYNEDI_AGG_INPATIENT_20200611_1451CDT" + expected = "SYNEDI_AGG_INPATIENT_11062020_1451CDT" + assert(flip_YYYYMMDD_to_DDMMYYYY(name)==expected) + + def test_get_timestamp(self): + name = "SYNEDI_AGG_INPATIENT_20200611_1451CDT" + assert(get_timestamp(name).date() == datetime.date(2020, 6, 11)) diff --git a/claims_hosp/tests/test_get_latest_claims_name.py b/claims_hosp/tests/test_get_latest_claims_name.py new file mode 100644 index 
000000000..8a5ddfb54 --- /dev/null +++ b/claims_hosp/tests/test_get_latest_claims_name.py @@ -0,0 +1,22 @@ +# standard +import time + +# third party +import pytest + +from delphi_utils import get_structured_logger +from delphi_claims_hosp.get_latest_claims_name import get_latest_filename + + +class TestGetLatestFileName: + + start_time = time.time() + logger = get_structured_logger( + __name__, filename="test.log", + log_exceptions=True) + + def test_get_latest_claims_name(self): + dir_path = "./test_data/" + + with pytest.raises(AssertionError): + get_latest_filename(dir_path, self.logger) From b753b7ef4fa7c71023da4b720e379df6e6671af5 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 20:21:21 -0400 Subject: [PATCH 15/46] change the path to test.log --- claims_hosp/tests/test_get_latest_claims_name.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/tests/test_get_latest_claims_name.py b/claims_hosp/tests/test_get_latest_claims_name.py index 8a5ddfb54..627ebadc7 100644 --- a/claims_hosp/tests/test_get_latest_claims_name.py +++ b/claims_hosp/tests/test_get_latest_claims_name.py @@ -12,7 +12,7 @@ class TestGetLatestFileName: start_time = time.time() logger = get_structured_logger( - __name__, filename="test.log", + __name__, filename="./tests/test.log", log_exceptions=True) def test_get_latest_claims_name(self): From edf8e3f91bd9c26aae980aa8ba0b5cb4159169c4 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 20:21:41 -0400 Subject: [PATCH 16/46] fixed an error --- claims_hosp/delphi_claims_hosp/run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index cd03311aa..c7ed0e31a 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -135,7 +135,9 @@ def run_module(params): logger.info("finished updating", geo = geo) # Remove all the raw files - os.system(f'rm -rf 
{params["indicator"]["input_dir"]}') + for fn in os.listdir(params["indicator"]["input_dir"]): + if ".csv.gz" in fn: + os.system(f'rm {params["indicator"]["input_dir"]}/{fn}') logger.info('Remove all the raw files.') elapsed_time_in_seconds = round(time.time() - start_time, 2) From 2d4400bf57ba713f84e206630c205ffa49caab4d Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Mon, 6 Jun 2022 20:37:36 -0400 Subject: [PATCH 17/46] change the format of geo_id at hrr level from float to str --- claims_hosp/delphi_claims_hosp/update_indicator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/claims_hosp/delphi_claims_hosp/update_indicator.py b/claims_hosp/delphi_claims_hosp/update_indicator.py index b4169370d..2e4aaa8ca 100644 --- a/claims_hosp/delphi_claims_hosp/update_indicator.py +++ b/claims_hosp/delphi_claims_hosp/update_indicator.py @@ -113,6 +113,7 @@ def geo_reindex(self, data): new_code=self.geo) elif self.geo == "hrr": data_frame = data # data is already adjusted in aggregation step above + data_frame[self.geo] = data_frame[self.geo].astype(str).str.zfill(3) else: logging.error( "%s is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', nation'", From a7f9146b4ddb2e4be1411175b716b79583aa67a6 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 7 Jun 2022 11:14:15 -0400 Subject: [PATCH 18/46] fix hrr geo id type --- claims_hosp/delphi_claims_hosp/update_indicator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/update_indicator.py b/claims_hosp/delphi_claims_hosp/update_indicator.py index 2e4aaa8ca..cab92b551 100644 --- a/claims_hosp/delphi_claims_hosp/update_indicator.py +++ b/claims_hosp/delphi_claims_hosp/update_indicator.py @@ -150,7 +150,7 @@ def update_indicator(self, input_filepath, outpath, logger): # load data base_geo = Config.HRR_COL if self.geo == Config.HRR_COL else Config.FIPS_COL data = load_data(input_filepath, self.dropdate, base_geo) - data_frame = self.geo_reindex(data) + 
data_frame = self.geo_reindex(data) # handle if we need to adjust by weekday wd_params = Weekday.get_params( From a2fcfcb554ded60d0e863773202f7198a6b849dd Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 7 Jun 2022 11:14:30 -0400 Subject: [PATCH 19/46] add unit tests --- claims_hosp/tests/test_get_latest_claims_name.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/tests/test_get_latest_claims_name.py b/claims_hosp/tests/test_get_latest_claims_name.py index 627ebadc7..a64c39710 100644 --- a/claims_hosp/tests/test_get_latest_claims_name.py +++ b/claims_hosp/tests/test_get_latest_claims_name.py @@ -12,7 +12,7 @@ class TestGetLatestFileName: start_time = time.time() logger = get_structured_logger( - __name__, filename="./tests/test.log", + __name__, filename="./test.log", log_exceptions=True) def test_get_latest_claims_name(self): From 40fa62d4b3bfd5145abcdd4603baf0c9e4868335 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 7 Jun 2022 13:02:27 -0400 Subject: [PATCH 20/46] change back. 
Geo ids for hrr are still float numbers --- claims_hosp/delphi_claims_hosp/update_indicator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/update_indicator.py b/claims_hosp/delphi_claims_hosp/update_indicator.py index cab92b551..5522917fa 100644 --- a/claims_hosp/delphi_claims_hosp/update_indicator.py +++ b/claims_hosp/delphi_claims_hosp/update_indicator.py @@ -113,7 +113,6 @@ def geo_reindex(self, data): new_code=self.geo) elif self.geo == "hrr": data_frame = data # data is already adjusted in aggregation step above - data_frame[self.geo] = data_frame[self.geo].astype(str).str.zfill(3) else: logging.error( "%s is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', nation'", From 78e3c4fa66f2ef394b8d909badb89a4971db7e0a Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 7 Jun 2022 13:25:02 -0400 Subject: [PATCH 21/46] fix linting --- claims_hosp/.pylintrc | 4 ++- .../delphi_claims_hosp/agg_claims_drops.py | 7 +++-- .../download_claims_ftp_files.py | 29 ++++++------------- .../get_latest_claims_name.py | 4 +-- .../delphi_claims_hosp/update_indicator.py | 2 +- claims_hosp/retrieve_files/.gitignore | 0 .../tests/test_download_claims_ftp_files.py | 6 ++-- 7 files changed, 21 insertions(+), 31 deletions(-) create mode 100644 claims_hosp/retrieve_files/.gitignore diff --git a/claims_hosp/.pylintrc b/claims_hosp/.pylintrc index f30837c7e..8ba5e540a 100644 --- a/claims_hosp/.pylintrc +++ b/claims_hosp/.pylintrc @@ -7,7 +7,9 @@ disable=logging-format-interpolation, # Allow pytest functions to be part of a class. no-self-use, # Allow pytest classes to have one test. 
- too-few-public-methods + too-few-public-methods, + broad-except + [BASIC] diff --git a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py index accee88e3..45b57d03a 100644 --- a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py +++ b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py @@ -19,8 +19,9 @@ def agg_and_write(data_path, logger): """ - Aggregate drops given a folder path. Will output an aggregated version in the - same folder. Example below. + Aggregate drops given a folder path. + + Will output an aggregated version in the same folder. Example below. Input files in folder: ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz @@ -32,8 +33,8 @@ def agg_and_write(data_path, logger): Args: data_path: path to the folder with duplicated drops. force: if aggregated file exists, whether to overwrite or not - """ + """ files = np.array(list(Path(data_path).glob("*"))) for f in files: diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index b173d53a6..a97d7718a 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -11,29 +11,22 @@ class AllowAnythingPolicy(paramiko.MissingHostKeyPolicy): - """ - Class for missing host key policy. - """ + """Class for missing host key policy.""" + def missing_host_key(self, client, hostname, key): - """ - Function for the missing host key. - """ + """Check missing host key.""" return def print_callback(filename, logger, bytes_so_far, bytes_total): - """ - Print the callback information. - """ + """Print the callback information.""" rough_percent_transferred = int(100 * (bytes_so_far / bytes_total)) if (rough_percent_transferred % 25) == 0: logger.info(f'{filename} transfer: {rough_percent_transferred}%') def get_timestamp(name): - """ - Get the reference date in datetime format. 
- """ + """Get the reference date in datetime format.""" try: split_name = name.split("_") yyyymmdd = split_name[3] @@ -45,10 +38,8 @@ def get_timestamp(name): return timestamp -def flip_YYYYMMDD_to_DDMMYYYY(name): - """ - Flip date from YYYYMMDD to MMDDYYYY. - """ +def change_date_format(name): + """Flip date from YYYYMMDD to MMDDYYYY.""" split_name = name.split("_") date = split_name[3] flip_date = date[6:] + date[4:6] + date[:4] @@ -58,9 +49,7 @@ def flip_YYYYMMDD_to_DDMMYYYY(name): def download(ftp_credentials, out_path, logger): - """ - The main function to pull the latest raw files. - """ + """Pull the latest raw files.""" current_time = datetime.datetime.now() seconds_in_day = 24 * 60 * 60 logger.info(f"current time is {current_time}") @@ -92,7 +81,7 @@ def download(ftp_credentials, out_path, logger): filepaths_to_download = {} for file in files_to_download: - flipped_file = flip_YYYYMMDD_to_DDMMYYYY(file) + flipped_file = change_date_format(file) if "INPATIENT" in file: full_path = path.join(out_path, flipped_file) if path.exists(full_path): diff --git a/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py b/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py index c7f0dcad5..39b4808d0 100644 --- a/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py +++ b/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py @@ -6,9 +6,7 @@ from pathlib import Path def get_latest_filename(dir_path, logger): - """ - Get the latest filename from the list of downloaded raw files. 
- """ + """Get the latest filename from the list of downloaded raw files.""" current_date = datetime.datetime.now() files = list(Path(dir_path).glob("*")) diff --git a/claims_hosp/delphi_claims_hosp/update_indicator.py b/claims_hosp/delphi_claims_hosp/update_indicator.py index 5522917fa..b4169370d 100644 --- a/claims_hosp/delphi_claims_hosp/update_indicator.py +++ b/claims_hosp/delphi_claims_hosp/update_indicator.py @@ -149,7 +149,7 @@ def update_indicator(self, input_filepath, outpath, logger): # load data base_geo = Config.HRR_COL if self.geo == Config.HRR_COL else Config.FIPS_COL data = load_data(input_filepath, self.dropdate, base_geo) - data_frame = self.geo_reindex(data) + data_frame = self.geo_reindex(data) # handle if we need to adjust by weekday wd_params = Weekday.get_params( diff --git a/claims_hosp/retrieve_files/.gitignore b/claims_hosp/retrieve_files/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/claims_hosp/tests/test_download_claims_ftp_files.py b/claims_hosp/tests/test_download_claims_ftp_files.py index a1f4df192..95f967ca3 100644 --- a/claims_hosp/tests/test_download_claims_ftp_files.py +++ b/claims_hosp/tests/test_download_claims_ftp_files.py @@ -5,16 +5,16 @@ import numpy as np # first party -from delphi_claims_hosp.download_claims_ftp_files import (flip_YYYYMMDD_to_DDMMYYYY, +from delphi_claims_hosp.download_claims_ftp_files import (change_date_format, get_timestamp) class TestDownloadClaimsFtpFiles: - def test_flip_YYYYMMDD_to_DDMMYYYY(self): + def test_change_date_format(self): name = "SYNEDI_AGG_INPATIENT_20200611_1451CDT" expected = "SYNEDI_AGG_INPATIENT_11062020_1451CDT" - assert(flip_YYYYMMDD_to_DDMMYYYY(name)==expected) + assert(change_date_format(name)==expected) def test_get_timestamp(self): name = "SYNEDI_AGG_INPATIENT_20200611_1451CDT" From 93fdafa555d01afc635916104f822e83fa9532e3 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 8 Jun 2022 21:01:40 -0400 Subject: [PATCH 22/46] remove unused code --- 
claims_hosp/HospClaims/automate/README.md | 12 - .../HospClaims/automate/agg_claims_drops.py | 75 ----- .../automate/download_claims_ftp_files.py | 110 ------- .../HospClaims/automate/ftp_to_covidcast.py | 62 ---- .../automate/get_latest_claims_name.py | 42 --- .../automate/hosp_claims_master_script.sh | 71 ----- .../automate/hosp_claims_regen_script.sh | 68 ----- .../HospClaims/automate/regen_old_issue.py | 47 --- .../HospClaims/automate/sanity_checks.py | 289 ------------------ claims_hosp/HospClaims/automate/secrets.py | 1 - .../HospClaims/automate/update_json.py | 59 ---- 11 files changed, 836 deletions(-) delete mode 100644 claims_hosp/HospClaims/automate/README.md delete mode 100644 claims_hosp/HospClaims/automate/agg_claims_drops.py delete mode 100644 claims_hosp/HospClaims/automate/download_claims_ftp_files.py delete mode 100644 claims_hosp/HospClaims/automate/ftp_to_covidcast.py delete mode 100644 claims_hosp/HospClaims/automate/get_latest_claims_name.py delete mode 100755 claims_hosp/HospClaims/automate/hosp_claims_master_script.sh delete mode 100755 claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh delete mode 100755 claims_hosp/HospClaims/automate/regen_old_issue.py delete mode 100644 claims_hosp/HospClaims/automate/sanity_checks.py delete mode 120000 claims_hosp/HospClaims/automate/secrets.py delete mode 100644 claims_hosp/HospClaims/automate/update_json.py diff --git a/claims_hosp/HospClaims/automate/README.md b/claims_hosp/HospClaims/automate/README.md deleted file mode 100644 index 58ebf15b6..000000000 --- a/claims_hosp/HospClaims/automate/README.md +++ /dev/null @@ -1,12 +0,0 @@ -## Dependencies -- paramiko -- click -- numpy -- pandas -- pathlib -- imap_tools -- matplotlib -- requests - -Maria todo: - - remove all absolute path references \ No newline at end of file diff --git a/claims_hosp/HospClaims/automate/agg_claims_drops.py b/claims_hosp/HospClaims/automate/agg_claims_drops.py deleted file mode 100644 index 6fe56dfb3..000000000 --- 
a/claims_hosp/HospClaims/automate/agg_claims_drops.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 - -"""Aggregates chunks of drops. - -Drops are expected to be numbered as: - -../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz -../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz -... etc. -""" - -# standard -from collections import defaultdict -from pathlib import Path - -# third party -import click -import numpy as np -import pandas as pd - - -def agg_and_write(data_path, force=True): - """ - Aggregate drops given a folder path. Will output an aggregated version in the - same folder. Example below. - - Input files in folder: - ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz - ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz - - Will create: - ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_07052020_1456.csv.gz - - - Args: - data_path: path to the folder with duplicated drops. - force: if aggregated file exists, whether to overwrite or not - """ - - files = np.array(list(Path(data_path).glob("*"))) - - for f in files: - out_path = f.parents[0] / f.name - dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, - "patCountyFIPS": str}) - if "servicedate" in dfs.columns: - dfs.rename(columns={"servicedate": "ServiceDate"}, inplace=True) - if "patCountyFIPS" in dfs.columns: - dfs.rename(columns={"patCountyFIPS": "PatCountyFIPS"}, inplace=True) - if "patHRRname" in dfs.columns: - dfs.rename(columns={"patHRRname": "Pat HRR Name"}, inplace=True) - if "patAgeGroup" in dfs.columns: - dfs.rename(columns={"patAgeGroup": "PatAgeGroup"}, inplace=True) - if "patHRRid" in dfs.columns: - dfs.rename(columns={"patHRRid": "Pat HRR ID"}, inplace=True) - - assert np.sum( - dfs.duplicated(subset=["ServiceDate", "PatCountyFIPS", - "Pat HRR Name", "PatAgeGroup"])) == 0, \ - "Duplication across drops!" 
- assert dfs.shape[1] == 10, "Wrong number of columns" - - dfs.to_csv(out_path, index=False) - print(f"Wrote {out_path}") - - -@click.command() -@click.argument('data_path') -@click.option('--force', '-f', is_flag=True, default=False) -def run_cli(data_path, force): - agg_and_write(data_path, force=force) - - -if __name__ == "__main__": - run_cli() diff --git a/claims_hosp/HospClaims/automate/download_claims_ftp_files.py b/claims_hosp/HospClaims/automate/download_claims_ftp_files.py deleted file mode 100644 index 28baff0e8..000000000 --- a/claims_hosp/HospClaims/automate/download_claims_ftp_files.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 -"""Downloads files modified in the last 24 hours from the delphi ftp server.""" - -# standard -import datetime -import functools -import sys -from os import path - -# third party -import click -import paramiko - -# first party -from secrets import claims - - -class AllowAnythingPolicy(paramiko.MissingHostKeyPolicy): - def missing_host_key(self, client, hostname, key): - return - - -def print_callback(filename, bytes_so_far, bytes_total): - rough_percent_transferred = int(100 * (bytes_so_far / bytes_total)) - if (rough_percent_transferred % 25) == 0: - print(f'{filename} transfer: {rough_percent_transferred}%') - - -def get_timestamp(name): - try: - split_name = name.split("_") - yyyymmdd = split_name[3] - hhmm = ''.join(filter(str.isdigit, split_name[4])) - timestamp = datetime.datetime.strptime(''.join([yyyymmdd, hhmm]), - "%Y%m%d%H%M") - except Exception: - timestamp = datetime.datetime(1900, 1, 1) - - return timestamp - - -def flip_MMDDYYYY_to_DDMMYYYY(name): - # flip date from MMDDYYYY to DDMMYYYY - split_name = name.split("_") - date = split_name[4] - flip_date = date[2:4] + date[:2] + date[4:] - split_name[4] = flip_date - name = '_'.join(split_name) - return name - - -def flip_YYYYMMDD_to_DDMMYYYY(name): - split_name = name.split("_") - date = split_name[3] - flip_date = date[6:] + date[4:6] + date[:4] - 
split_name[3] = flip_date - name = '_'.join(split_name) - return name - - -@click.command() -@click.argument("out_path") -def download(out_path): - current_time = datetime.datetime.now() - seconds_in_day = 24 * 60 * 60 - print(f"current time is {current_time}") - - # open client - client = paramiko.SSHClient() - client.set_missing_host_key_policy(AllowAnythingPolicy()) - - client.connect(claims.HOST, - username=claims.USER, password=claims.PASS, port=claims.PORT) - sftp = client.open_sftp() - sftp.chdir('/hosp/receiving') - - - # go through files in recieving dir - files_to_download = [] - for fileattr in sftp.listdir_attr(): - # file_time = datetime.datetime.fromtimestamp(fileattr.st_mtime) - file_time = get_timestamp(fileattr.filename) - time_diff_to_current_time = current_time - file_time - if 0 < time_diff_to_current_time.total_seconds() <= seconds_in_day: - files_to_download.append(fileattr.filename) - - # make sure we don't download more that the 3 chunked drops (2x a day) for OP - # and the 1 chunk (2x a day) for IP - 01/07/21, *2 for multiple day drops - assert len(files_to_download) <= 2 * ((3 * 2) + 2), "more files dropped than expected" - - filepaths_to_download = {} - for file in files_to_download: - flipped_file = flip_YYYYMMDD_to_DDMMYYYY(file) - if "INPATIENT" in file: - full_path = path.join(out_path, flipped_file) - if path.exists(full_path): - print(f"{flipped_file} exists, skipping") - else: - filepaths_to_download[file] = full_path - - # download! 
- for infile, outfile in filepaths_to_download.items(): - callback_for_filename = functools.partial(print_callback, infile) - sftp.get(infile, outfile, callback=callback_for_filename) - - client.close() - - -if __name__ == "__main__": - download() diff --git a/claims_hosp/HospClaims/automate/ftp_to_covidcast.py b/claims_hosp/HospClaims/automate/ftp_to_covidcast.py deleted file mode 100644 index 2356783c3..000000000 --- a/claims_hosp/HospClaims/automate/ftp_to_covidcast.py +++ /dev/null @@ -1,62 +0,0 @@ -"""FTP created files over to Delphi Covidcast ingestion.""" -# standard -import datetime -import os -from pathlib import Path - -# third party -import click -import paramiko - -# first party -from secrets import covidcast - -NUM_FILES = 70*6*2 # expect (70 dates x 6 geos x 2 signals) -NUM_SE_FILES = 70*6*1 # expect (70 dates x 6 geos x 1 signals) - - -class AllowAnythingPolicy(paramiko.MissingHostKeyPolicy): - def missing_host_key(self, client, hostname, key): - return - - -@click.command() -@click.argument("local_receiving_dir") -def upload(local_receiving_dir): - """Upload files to the delphi covidcast ingestion folders - - Args: - local_receiving_dir: local dir containing the non-se signal files - - """ - today = datetime.datetime.now().date() - - # open client - client = paramiko.SSHClient() - client.set_missing_host_key_policy(AllowAnythingPolicy()) - client.connect(covidcast.HOST, username=covidcast.USER, password=covidcast.PASS) - sftp = client.open_sftp() - - files_to_upload = [] - for file in Path(local_receiving_dir).glob("*.csv"): - files_to_upload.append(file) - - assert len(files_to_upload) == NUM_FILES, "more files to upload than expected!" 
- - # upload signal without se - sftp.chdir("/common/covidcast/receiving/hospital-admissions") - for i, file in enumerate(files_to_upload): - assert ( - datetime.datetime.fromtimestamp(os.path.getmtime(file)).date() == today - ), f"uploading old file {file}" - - sftp.put(file, file.name) - if (i % 61) == 0: - print(f"Finished {i} out of {len(files_to_upload)}") - - print(f"Successfully uploaded the hospital-admissions claims signal") - client.close() - - -if __name__ == "__main__": - upload() diff --git a/claims_hosp/HospClaims/automate/get_latest_claims_name.py b/claims_hosp/HospClaims/automate/get_latest_claims_name.py deleted file mode 100644 index 9dadbe5dc..000000000 --- a/claims_hosp/HospClaims/automate/get_latest_claims_name.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -"""Return the latest drop.""" - -# standard -import datetime -from pathlib import Path - -# third party -import click - - -@click.command() -@click.argument("dir_path") -def get_latest_filename(dir_path): - current_date = datetime.datetime.now() - files = list(Path(dir_path).glob("*")) - - latest_timestamp = datetime.datetime(1900, 1, 1) - latest_filename = None - for file in files: - split_name = file.name.split("_") - if len(split_name) == 5: - ddmmyyyy = split_name[3] - hhmm = ''.join(filter(str.isdigit, split_name[4])) - timestamp = datetime.datetime.strptime(''.join([ddmmyyyy, hhmm]), - "%d%m%Y%H%M") - if timestamp > latest_timestamp: - if timestamp <= current_date: - latest_timestamp = timestamp - latest_filename = file - - assert current_date.date() == latest_timestamp.date(), "no drop for today" - - # write to stdout for shell script to use - print(latest_filename) - - # return for other uses - return latest_filename - - -if __name__ == "__main__": - get_latest_filename() diff --git a/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh b/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh deleted file mode 100755 index 836709bf4..000000000 --- 
a/claims_hosp/HospClaims/automate/hosp_claims_master_script.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/sh -set -o errexit -set -o pipefail - -BASE="/home/indicators/runtime/claims_hosp/HospClaims" - -AUTO_DIR="$BASE/automate" -HOSP_CLAIMS_PKG_DIR="/home/indicators/runtime/claims_hosp" -CLAIMS_DIR="$BASE/claims_data" -GEO_DIR="/common/covidcast/covid-19/geographical_scope" -CURRENT_dmY=$(date '+%d%m%Y') -CURRENT_Ymd=$(date '+%Y%m%d') -CURRENT_YmdHM=$(date '+%Y%m%d_%H%M') -RECEIVING_DIR="$BASE/receiving/results_$CURRENT_YmdHM" -RECEIVING_SE_DIR="$BASE/receiving/results_se_$CURRENT_YmdHM" - -# pull latest data -echo "downloading drops" -cd "$AUTO_DIR" || exit -python3 download_claims_ftp_files.py "$CLAIMS_DIR" - -# aggregate data -echo "aggregating drops" -python3 agg_claims_drops.py "$CLAIMS_DIR" - -# find the latest files (these have timestamps) -echo "finding today's latest claims drop" -claims_file=$(python3 get_latest_claims_name.py "$CLAIMS_DIR") - -# make receiving directories -mkdir "$RECEIVING_DIR" - -# generate the sensor -cd "$HOSP_CLAIMS_PKG_DIR" || exit - -source env/bin/activate - -python $AUTO_DIR/update_json.py \ - "$claims_file" \ - "$GEO_DIR" \ - "$HOSP_PKG_DIR" \ - "$RECEIVING_DIR" - -python -m delphi_claims_hosp - -deactivate - -sanity_check() { - geo=$1 - cd "$AUTO_DIR" || exit - python3 sanity_checks.py "$RECEIVING_DIR" "$geo" -} - -echo "running sanity checks" -sanity_check state -sanity_check msa -sanity_check hrr -sanity_check county - -# plot states without se -cd "$AUTO_DIR" || exit -python3 sanity_checks.py "$RECEIVING_DIR" state -p - -# upload files to covidcast -python3 ftp_to_covidcast.py "$RECEIVING_DIR" - -# delete raw data -rm "$CLAIMS_DIR"/*.csv.gz - -# delete signal files -rm -r "$RECEIVING_DIR" diff --git a/claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh b/claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh deleted file mode 100755 index 2111f7b7e..000000000 --- 
a/claims_hosp/HospClaims/automate/hosp_claims_regen_script.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/sh -set -o errexit -#set -o nounset -set -o pipefail - -BASE="/home/indicators/runtime/claims_hosp/HospClaims" -AUTO_DIR="$BASE/automate" -HOSP_CLAIMS_PKG_DIR="/home/indicators/runtime/claims_hosp" -CLAIMS_DIR="$BASE/claims_data" -GEO_DIR="/common/covidcast/covid-19/geographical_scope" -RECEIVING_DIR="$1" - -# pull latest data -echo "downloading drops" -cd "$AUTO_DIR" || exit -python3 download_claims_ftp_files.py "$CLAIMS_DIR" - -# find the latest files (these have timestamps) -echo "finding today's latest claims drop" -claims_file=$(python3 get_latest_claims_name.py "$CLAIMS_DIR") - -# only keep latest file -cd "$CLAIMS_DIR" || exit -claims_filename=$(basename "$claims_file") -echo "$claims_filename" -mv "$claims_filename" .. -cd .. -rm -f "$CLAIMS_DIR"/*.csv.gz -mv "$claims_filename" "$CLAIMS_DIR" - -# aggregate data -cd "$AUTO_DIR" || exit -echo "aggregating drops" -python3 agg_claims_drops.py "$CLAIMS_DIR" - -# generate the sensor -cd "$HOSP_CLAIMS_PKG_DIR" || exit - -source env/bin/activate - -python $AUTO_DIR/update_json.py \ - "$claims_file" \ - "$GEO_DIR" \ - "$HOSP_PKG_DIR" \ - "$RECEIVING_DIR" - -python -m delphi_claims_hosp - -deactivate - -sanity_check() { - geo=$1 - cd "$AUTO_DIR" || exit - python3 sanity_checks.py "$RECEIVING_DIR" "$geo" -} - -echo "running sanity checks" -sanity_check state -sanity_check msa -sanity_check hrr -sanity_check county - -# plot states without se -#cd "$AUTO_DIR" || exit -#python3 sanity_checks.py "$RECEIVING_DIR" state -p - -# delete raw data -rm "$CLAIMS_DIR"/*.csv.gz diff --git a/claims_hosp/HospClaims/automate/regen_old_issue.py b/claims_hosp/HospClaims/automate/regen_old_issue.py deleted file mode 100755 index 838d4b24d..000000000 --- a/claims_hosp/HospClaims/automate/regen_old_issue.py +++ /dev/null @@ -1,47 +0,0 @@ -from datetime import datetime, timedelta -import os -import logging - - -def regen(issue_date: 
datetime): - fake_date = datetime.strftime(issue_date, '%Y%m%d') - fake_datetime = datetime.strftime(issue_date, '%Y-%m-%d %H:%M:%S') - - out_dir = f"/home/maria/Delphi/HospClaims/regen/issue_{fake_date}" - out_dir_no_se = out_dir + "/hospital-admissions" - #if os.path.isdir(out_dir_no_se) and len(os.listdir(out_dir_no_se)) > 0: - # logging.info(f"files in output dir, skipping {issue_date}") - # return False - - os.makedirs(out_dir_no_se, exist_ok=True) - os.system( - f"faketime '{fake_datetime}' /home/maria/Delphi/HospClaims/automate/hosp_claims_regen_script.sh {out_dir_no_se}") - - logging.info(str(issue_date.date())) - - -def main(): - hour = 23 - - start_date = datetime(2021, 6, 12, hour) - end_date = datetime(2021, 6, 13, hour) - #start_date = datetime(2020, 6, 2, hour) - #end_date = datetime(2020, 8, 4, hour) - n_dates = (end_date - start_date).days + 1 - date_range = [start_date + timedelta(days=a) for a in range(n_dates)] - - logging.basicConfig(level=logging.DEBUG, filename="out.log", - filemode="a+", - format="%(asctime)-15s %(levelname)-8s %(message)s") - - #date_range = [datetime(2020, 6, 21, hour)] - for date in date_range: - try: - regen(date) - except Exception as e: - logging.info(e) - continue - - -if __name__ == "__main__": - main() diff --git a/claims_hosp/HospClaims/automate/sanity_checks.py b/claims_hosp/HospClaims/automate/sanity_checks.py deleted file mode 100644 index f2b03fe65..000000000 --- a/claims_hosp/HospClaims/automate/sanity_checks.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Sanity check results from generating DV estimates. 
- -Author: Maria Jahja -Created: 2020-05-12 - -Plotting code modified from: http://blog.marmakoide.org/?p=94 -""" - -# standard packages -import logging -import sys -from collections import defaultdict -from datetime import datetime, timedelta -from pathlib import Path - -# third party -import click -import matplotlib.dates as mpld -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from matplotlib.backends.backend_pdf import PdfPages - -# first party -EPIDATA_DIR = Path.home() / "Delphi/delphi-epidata/src/client" -FIPS_DIR = Path.home() / "Delphi/covid-19/doctor-visits/maria/data/fips_full.csv" -sys.path.append(str(EPIDATA_DIR)) -from delphi_epidata import Epidata - - -class EMRHospChecks: - DATE_FORMAT = mpld.DateFormatter('%m-%d') - - def __init__(self, data_path, level, se): - self.level = level - self.data = self.get_data(data_path, level, se) - self.locs = list(sorted(set(self.data["adj"]["val"].keys()) | \ - set(self.data["nadj"]["val"].keys()))) - self.se = se - - # read in geo file for fips - self.geo = pd.read_csv(FIPS_DIR, dtype={"FIPS": int}) - self.geo.drop_duplicates('FIPS', inplace=True) - - @staticmethod - def get_data(data_path, level, se): - """ - Compile data values and dates for given data_path and geographic level - - Args: - data_path: path to the data files - level: geographic level to pull - se: bool if se's are included in the file - - Returns: - dictionary with data - """ - - def extract(all_files, all_dates): - """Extract data from the files.""" - res = {"val": defaultdict(list), - "se": defaultdict(list), - "dates": defaultdict(list)} - for f, d in zip(all_files, all_dates): - df = pd.read_csv(open(f, "rb"), dtype={"geo_id": str}).to_numpy() - for row in df: - geo = row[0] - res["val"][geo].append(row[1]) - res["se"][geo].append(row[2]) - res["dates"][geo].append(d) - return res - - data = {"adj_files": [], "nadj_files": [], "dates": []} - for f in sorted(data_path.glob("*")): - name = f.name.split("_") - if 
f.suffix == ".csv" and name[1] == level: - name_idx = -2 if se else 3 - if name[name_idx] == "adj": - data["adj_files"].append(f) - else: - data["nadj_files"].append(f) - data["dates"].append(name[0]) - - # extract data - data["dates"] = sorted(list(set(data["dates"]))) - data["adj"] = extract(data["adj_files"], data["dates"]) - data["nadj"] = extract(data["nadj_files"], data["dates"]) - - # convert dates - data["dates"] = pd.to_datetime(data["dates"]) - data["first_date"] = data["dates"].min() - data["last_date"] = data["dates"].max() - data["first_plot_date"] = data["last_date"] - timedelta(days=30) - data["epidata_date_range"] = Epidata.range( - str(data["first_plot_date"].date()).replace('-', ''), - str(data["last_date"].date()).replace('-', '')) - - return data - - def check_se_na(self): - """ - Checks that all SE are reported as 'NA' due to - privacy concerns from the company. - - Returns: - true if pass, false otherwise - """ - - for kind in ["adj", "nadj"]: - for geo, ses in self.data[kind]["se"].items(): - for se in ses: - if not np.isnan(se): - logging.error(f"{geo}, {se} not nan") - return False - return True - - def check_range(self): - """ - Checks that all percentages are within [0, 100]. 
- - Returns: - true if pass, false otherwise - """ - for kind in ["adj", "nadj"]: - for geo, vals in self.data[kind]["val"].items(): - for val in vals: - if not (0 <= val <= 100): - logging.error(f"{geo}, {val} not in [0, 100]") - return False - return True - - def check_quantity(self): - """Checks how many geographies were generated.""" - n_geos = {} - logging.info(f"geographies generated for {self.level}") - for kind in ["adj", "nadj"]: - for geo, vals in self.data[kind]['val'].items(): - n_geos[geo] = len(vals) - - min_geo = np.min([v for k, v in n_geos.items()]) - max_geo = np.max([v for k, v in n_geos.items()]) - avg_geo = np.mean([v for k, v in n_geos.items()]) - std_geo = np.std([v for k, v in n_geos.items()]) - logging.info(f"\t{kind}" - f"\nmin:\t{min_geo}\nmax:\t{max_geo}" - f"\navg:\t{avg_geo:.2f}\nstd:\t{std_geo:.2f}") - - def get_filled_df(self, loc, kind): - df = pd.DataFrame({"val": self.data[kind]["val"][loc]}, - index=pd.to_datetime(self.data[kind]["dates"][loc])) - - if self.data["first_plot_date"] not in df.index: - df = df.append( - pd.DataFrame({"val": np.nan}, index=[self.data["first_plot_date"]])) - if self.data["last_date"] not in df.index: - df = df.append(pd.DataFrame({"val": np.nan}, index=[self.data["last_date"]])) - df.sort_index(inplace=True) - df = df.asfreq('D', fill_value=np.nan) - return df[df.index > self.data["first_plot_date"]] - - def get_epidata_df(self, loc, kind): - epi_kind = "smoothed_adj_covid19_from_claims" if kind == "adj" else "smoothed_covid19_from_claims" - if self.level == "msa": - loc = int(float(loc)) - - rows = Epidata.covidcast("hospital-admissions", epi_kind, "day", - self.level, self.data["epidata_date_range"], loc) - vals = [] - obs_dates = [] - for row in rows['epidata']: - vals.append(row['value']) - obs_dates.append(row['time_value']) - - obs_dates = [datetime.strptime(str(d), "%Y%m%d") for d in obs_dates] - df = pd.DataFrame({'date': obs_dates, 'val': vals}) - df = df.set_index('date') - return df - - def 
get_county_name(self, fips_code): - """Return name of a county given it's fips code.""" - loc = self.geo[self.geo["FIPS"] == fips_code] - if len(loc) == 0: - return fips_code - return f'{loc["Name"].iloc[0]} County, {loc["State"].iloc[0]}' - - def plot(self, outname): - """ Create PDF plots of the generated values by location. - - Args: - outname: name for the output pdf file - """ - - # start pdf document - pdf_pages = PdfPages(f'{outname}-{self.level}-hosp-claims-plots.pdf') - n_plot = len(self.locs) - n_plots_per_page = 25 - - # init plotting axis and counter - fig, axs = None, None - j = 0 - - for i, loc in enumerate(self.locs): - - # start new page if needed - if i % n_plots_per_page == 0: - fig, axs = plt.subplots(5, 5, figsize=(10, 10), sharex=True) - axs = axs.ravel() - j = 0 - - # plot - adj_ts = self.get_filled_df(loc, "adj") - axs[j].plot(adj_ts.index, adj_ts["val"], label="New (Adj)", color="blue") - - if not self.se: - nadj_ts = self.get_filled_df(loc, "nadj") - axs[j].plot(nadj_ts.index, nadj_ts["val"], label="New", color="green") - - # current data. left unlabeled to clear clutter, but colors correspond to - # the "new" lines. 
only plot first 52 cases (it's rather slow to run otherwise) - if self.level == "state" or \ - ((self.level == "county") and (loc in ["53033", "36061"])): - try: - epi_adj_ts = self.get_epidata_df(loc, "adj") - axs[j].plot(epi_adj_ts.index, epi_adj_ts["val"], - color="lightskyblue", linestyle="--") - if not self.se: - epd_nadj_ts = self.get_epidata_df(loc, "nadj") - axs[j].plot(epd_nadj_ts.index, epd_nadj_ts["val"], - color="lightgreen", linestyle="--") - except: - logging.warning(f"could not retrieve {loc} in epidata, skipping") - - # set title - if self.level == "county": - axs[j].set_title(self.get_county_name(int(loc)), fontsize=10) - else: - axs[j].set_title(loc) - - # set legend and format - if i == 0 or j == 0: - axs[j].legend() - - axs[j].xaxis.set_major_formatter(self.DATE_FORMAT) - axs[j].tick_params(axis='both', which='major', labelsize=5, labelrotation=90) - - # close the page if needed - if (i + 1) % n_plots_per_page == 0 or (i + 1) == n_plot: - plt.tight_layout() - pdf_pages.savefig(fig) - plt.close() - j += 1 - - pdf_pages.close() - logging.info(f"plotted to '{outname}-{self.level}-hosp-claims-plots.pdf'") - - -def run(respath, geo, se, plot): - """Run sanity checks and produce plots. 
- - Args: - respath: path to result csvs - geo: geo level, one of state, msa, hrr, county - se: boolean whether data includes se or not - plot: boolean whether to plot or not - """ - assert geo in ["state", "msa", "hrr", "county"], f"{geo} is invalid" - - ehc = EMRHospChecks(Path(respath), geo, se) - assert ehc.check_range(), "range failed" - if not se: - assert ehc.check_se_na(), "se is all na failed" - ehc.check_quantity() - if plot: - ehc.plot(str(datetime.today().date())) - logging.info("finished checks") - - -@click.command() -@click.argument('respath') -@click.argument('geo') -@click.option('--se', is_flag=True, default=False) -@click.option('--plot', '-p', is_flag=True, default=False) -def run_cli(respath, geo, se, plot): - logging.basicConfig(level=logging.INFO) - run(respath, geo, se, plot) - - -if __name__ == "__main__": - run_cli() diff --git a/claims_hosp/HospClaims/automate/secrets.py b/claims_hosp/HospClaims/automate/secrets.py deleted file mode 120000 index 4ae593b93..000000000 --- a/claims_hosp/HospClaims/automate/secrets.py +++ /dev/null @@ -1 +0,0 @@ -../../secrets.py \ No newline at end of file diff --git a/claims_hosp/HospClaims/automate/update_json.py b/claims_hosp/HospClaims/automate/update_json.py deleted file mode 100644 index 3bdcf5b3b..000000000 --- a/claims_hosp/HospClaims/automate/update_json.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Change the json template to run the package.""" - -import argparse -import json -from pathlib import Path - - -def get_hosp(edi_file, geo_dir, out_dir, receiving_dir, se): - """Output the json.params file needed to run the claims_hosp package. 
- - Args: - edi_file: Path to EDI file (claims) - geo_dir: Dir containing the geo_map files - out_dir: Output dir to put the json file - receiving_dir: Output dir to put the hosp estimates - se: Boolean to output SEs or not - """ - - if se: - weekday = [True] - else: - weekday = [True, False] - - data = { - "common": { - "export_dir": receiving_dir, - "log_exceptions": False - }, - "indicator": { - "input_file": edi_file, - "start_date": None, - "end_date": None, - "drop_date": None, - "n_backfill_days": 70, - "n_waiting_days": 3, - "write_se": se, - "obfuscated_prefix": "wip_henear", - "parallel": True, - "geos": ["state", "msa", "hrr", "county", "hhs", "nation"], - "weekday": weekday - } - } - print(f"Using {edi_file}") - - with open(Path(out_dir) / 'params.json', 'w') as outfile: - json.dump(data, outfile, indent=4) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('edi_file') - parser.add_argument('geo_dir') - parser.add_argument('out_dir') - parser.add_argument('receiving_dir') - parser.add_argument('--se', action="store_true") - - args = parser.parse_args() - get_hosp(args.edi_file, args.geo_dir, - args.out_dir, args.receiving_dir, args.se) From 42d72173a1060748abf00f57105b8d61b36d8be5 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Tue, 14 Jun 2022 12:05:42 -0400 Subject: [PATCH 23/46] Use os.remove to clean raw files Co-authored-by: Katie Mazaitis --- claims_hosp/delphi_claims_hosp/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index c7ed0e31a..7817ef4f1 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -137,7 +137,7 @@ def run_module(params): # Remove all the raw files for fn in os.listdir(params["indicator"]["input_dir"]): if ".csv.gz" in fn: - os.system(f'rm {params["indicator"]["input_dir"]}/{fn}') + 
os.remove(f'{params["indicator"]["input_dir"]}/{fn}') logger.info('Remove all the raw files.') elapsed_time_in_seconds = round(time.time() - start_time, 2) From a011ffa11029eb5e3be239df5826661634fc08af Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Tue, 14 Jun 2022 12:06:13 -0400 Subject: [PATCH 24/46] Use mock for the logger in unit tests Co-authored-by: Katie Mazaitis --- claims_hosp/tests/test_get_latest_claims_name.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/tests/test_get_latest_claims_name.py b/claims_hosp/tests/test_get_latest_claims_name.py index a64c39710..6ecf06e6c 100644 --- a/claims_hosp/tests/test_get_latest_claims_name.py +++ b/claims_hosp/tests/test_get_latest_claims_name.py @@ -11,7 +11,7 @@ class TestGetLatestFileName: start_time = time.time() - logger = get_structured_logger( + logger = unittest.mock.Mock() __name__, filename="./test.log", log_exceptions=True) From 06875d7c90f56ba54e1d43fec35756305348af2a Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 12:19:36 -0400 Subject: [PATCH 25/46] include filename with errors --- claims_hosp/delphi_claims_hosp/agg_claims_drops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py index 45b57d03a..bb9618204 100644 --- a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py +++ b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py @@ -38,7 +38,8 @@ def agg_and_write(data_path, logger): files = np.array(list(Path(data_path).glob("*"))) for f in files: - if ".csv.gz" not in str(f): + filename = str(f) + if ".csv.gz" not in filename: continue out_path = f.parents[0] / f.name dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, @@ -57,8 +58,8 @@ def agg_and_write(data_path, logger): assert np.sum( dfs.duplicated(subset=["ServiceDate", "PatCountyFIPS", "Pat HRR Name", "PatAgeGroup"])) == 0, 
\ - "Duplication across drops!" - assert dfs.shape[1] == 10, "Wrong number of columns" + f'Duplication across drops in {filename}!' + assert dfs.shape[1] == 10, f'Wrong number of columns in {filename}' dfs.to_csv(out_path, index=False) logger.info(f"Wrote {out_path}") From cdd0583bcd5b4b7a2095669eaa88492e6a28fa6a Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 12:22:01 -0400 Subject: [PATCH 26/46] delete commented-out code --- claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index a97d7718a..50eb6eee1 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -69,7 +69,6 @@ def download(ftp_credentials, out_path, logger): # go through files in recieving dir files_to_download = [] for fileattr in sftp.listdir_attr(): - # file_time = datetime.datetime.fromtimestamp(fileattr.st_mtime) file_time = get_timestamp(fileattr.filename) time_diff_to_current_time = current_time - file_time if 0 < time_diff_to_current_time.total_seconds() <= seconds_in_day: From 4f86e1eb1a3ddd380aefa21b1c0c996aa98b7e40 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Tue, 14 Jun 2022 12:23:49 -0400 Subject: [PATCH 27/46] Update logger info with variables Co-authored-by: Katie Mazaitis --- claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index 50eb6eee1..619fa8a1e 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -22,7 +22,7 @@ def print_callback(filename, logger, bytes_so_far, 
bytes_total): """Print the callback information.""" rough_percent_transferred = int(100 * (bytes_so_far / bytes_total)) if (rough_percent_transferred % 25) == 0: - logger.info(f'{filename} transfer: {rough_percent_transferred}%') + logger.info("Transfer in progress", filename=filename, percent=rough_percent_transferred) def get_timestamp(name): From 747d58029a834fe6f82604860067eef84d62b730 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 12:25:16 -0400 Subject: [PATCH 28/46] Remove continue but check file links with .csv.gz --- claims_hosp/delphi_claims_hosp/agg_claims_drops.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py index bb9618204..f5206f334 100644 --- a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py +++ b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py @@ -35,12 +35,10 @@ def agg_and_write(data_path, logger): force: if aggregated file exists, whether to overwrite or not """ - files = np.array(list(Path(data_path).glob("*"))) + files = np.array(list(Path(data_path).glob("*.csv.gz"))) for f in files: filename = str(f) - if ".csv.gz" not in filename: - continue out_path = f.parents[0] / f.name dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, "patCountyFIPS": str}) From c0482c3d01b5e9433353f1d1cd26d86eb62e9103 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 12:34:00 -0400 Subject: [PATCH 29/46] fix the error in switching to Mock in unittest.mock --- claims_hosp/tests/test_get_latest_claims_name.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/claims_hosp/tests/test_get_latest_claims_name.py b/claims_hosp/tests/test_get_latest_claims_name.py index 6ecf06e6c..2f3021af0 100644 --- a/claims_hosp/tests/test_get_latest_claims_name.py +++ b/claims_hosp/tests/test_get_latest_claims_name.py @@ -1,19 +1,18 @@ # standard import time +from unittest.mock import Mock # third 
party import pytest -from delphi_utils import get_structured_logger + from delphi_claims_hosp.get_latest_claims_name import get_latest_filename class TestGetLatestFileName: start_time = time.time() - logger = unittest.mock.Mock() - __name__, filename="./test.log", - log_exceptions=True) + logger = Mock() def test_get_latest_claims_name(self): dir_path = "./test_data/" From ce1f45e1e411afa4ae3b1e5b62b4b7419e317f57 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 12:34:46 -0400 Subject: [PATCH 30/46] remove unused variables --- claims_hosp/tests/test_get_latest_claims_name.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/claims_hosp/tests/test_get_latest_claims_name.py b/claims_hosp/tests/test_get_latest_claims_name.py index 2f3021af0..ded5c9718 100644 --- a/claims_hosp/tests/test_get_latest_claims_name.py +++ b/claims_hosp/tests/test_get_latest_claims_name.py @@ -10,8 +10,6 @@ class TestGetLatestFileName: - - start_time = time.time() logger = Mock() def test_get_latest_claims_name(self): From 2546ceae9fb457cea04977eb39238f55b2871032 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 12:35:18 -0400 Subject: [PATCH 31/46] remove unused messages --- claims_hosp/.pylintrc | 1 - 1 file changed, 1 deletion(-) diff --git a/claims_hosp/.pylintrc b/claims_hosp/.pylintrc index 8ba5e540a..ef44f2925 100644 --- a/claims_hosp/.pylintrc +++ b/claims_hosp/.pylintrc @@ -7,7 +7,6 @@ disable=logging-format-interpolation, # Allow pytest functions to be part of a class. no-self-use, # Allow pytest classes to have one test. 
- too-few-public-methods, broad-except From 7adf66f69a468c86538c8bb4f013ac821775ec9e Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 14 Jun 2022 13:17:20 -0400 Subject: [PATCH 32/46] add too-few-public-methods back to message control for linting --- claims_hosp/.pylintrc | 1 + 1 file changed, 1 insertion(+) diff --git a/claims_hosp/.pylintrc b/claims_hosp/.pylintrc index ef44f2925..8ba5e540a 100644 --- a/claims_hosp/.pylintrc +++ b/claims_hosp/.pylintrc @@ -7,6 +7,7 @@ disable=logging-format-interpolation, # Allow pytest functions to be part of a class. no-self-use, # Allow pytest classes to have one test. + too-few-public-methods, broad-except From 0c39be0df634c483314fb6daf493a041b419f156 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 15 Jun 2022 10:07:42 -0400 Subject: [PATCH 33/46] add logger info for files to download --- claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index 619fa8a1e..f195ce713 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -73,6 +73,7 @@ def download(ftp_credentials, out_path, logger): time_diff_to_current_time = current_time - file_time if 0 < time_diff_to_current_time.total_seconds() <= seconds_in_day: files_to_download.append(fileattr.filename) + logger.info("File to download", filename=fileattr.filename) # make sure we don't download more that the 3 chunked drops (2x a day) for OP # and the 1 chunk (2x a day) for IP - 01/07/21, *2 for multiple day drops @@ -84,7 +85,7 @@ def download(ftp_credentials, out_path, logger): if "INPATIENT" in file: full_path = path.join(out_path, flipped_file) if path.exists(full_path): - logger.info(f"{flipped_file} exists, skipping") + logger.info("Skip the existing file", filename=flipped_file) else: 
filepaths_to_download[file] = full_path From c77c8117f2e38cc892f4019a7c19ff9331ff9c93 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Wed, 15 Jun 2022 10:08:30 -0400 Subject: [PATCH 34/46] Update logger info for the latest claims file Co-authored-by: Katie Mazaitis --- claims_hosp/delphi_claims_hosp/get_latest_claims_name.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py b/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py index 39b4808d0..e417183c7 100644 --- a/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py +++ b/claims_hosp/delphi_claims_hosp/get_latest_claims_name.py @@ -26,8 +26,7 @@ def get_latest_filename(dir_path, logger): assert current_date.date() == latest_timestamp.date(), "no drop for today" - # write to stdout for shell script to use - logger.info(latest_filename) + logger.info("Latest claims file", filename=latest_filename) # return for other uses return latest_filename From 0be164cea585a8afe3fb12538546026591778ff8 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 15 Jun 2022 10:19:45 -0400 Subject: [PATCH 35/46] update the function for renmaing the raw drops --- .../delphi_claims_hosp/modify_claims_drops.py | 58 +++++++++++++++++++ claims_hosp/tests/test_modify_claims_drops.py | 15 +++++ 2 files changed, 73 insertions(+) create mode 100644 claims_hosp/delphi_claims_hosp/modify_claims_drops.py create mode 100644 claims_hosp/tests/test_modify_claims_drops.py diff --git a/claims_hosp/delphi_claims_hosp/modify_claims_drops.py b/claims_hosp/delphi_claims_hosp/modify_claims_drops.py new file mode 100644 index 000000000..6fcd88b85 --- /dev/null +++ b/claims_hosp/delphi_claims_hosp/modify_claims_drops.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +"""Modify the drops. 
+ +Drops are expected to be numbered as: + +../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz +../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz +... etc. +""" + +# standard +from pathlib import Path + +# third party +import numpy as np +import pandas as pd + + +def modify_and_write(data_path, logger, force=True): + """ + Modify drops given a folder path. + + Will rename necessary columns in the input files, and check the number of + columns and duplications. + + Args: + data_path: path to the folder with duplicated drops. + force: if aggregated file exists, whether to overwrite or not + + """ + files = np.array(list(Path(data_path).glob("*.csv.gz"))) + + for f in files: + filename = str(f) + out_path = f.parents[0] / f.name + dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, + "patCountyFIPS": str}) + if "servicedate" in dfs.columns: + dfs.rename(columns={"servicedate": "ServiceDate"}, inplace=True) + if "patCountyFIPS" in dfs.columns: + dfs.rename(columns={"patCountyFIPS": "PatCountyFIPS"}, inplace=True) + if "patHRRname" in dfs.columns: + dfs.rename(columns={"patHRRname": "Pat HRR Name"}, inplace=True) + if "patAgeGroup" in dfs.columns: + dfs.rename(columns={"patAgeGroup": "PatAgeGroup"}, inplace=True) + if "patHRRid" in dfs.columns: + dfs.rename(columns={"patHRRid": "Pat HRR ID"}, inplace=True) + + assert np.sum( + dfs.duplicated(subset=["ServiceDate", "PatCountyFIPS", + "Pat HRR Name", "PatAgeGroup"])) == 0, \ + f'Duplication across drops in {filename}!' 
+ assert dfs.shape[1] == 10, f'Wrong number of columns in {filename}' + + if force: + dfs.to_csv(out_path, index=False) + logger.info(f"Wrote {out_path}") diff --git a/claims_hosp/tests/test_modify_claims_drops.py b/claims_hosp/tests/test_modify_claims_drops.py new file mode 100644 index 000000000..f2fdfa4d2 --- /dev/null +++ b/claims_hosp/tests/test_modify_claims_drops.py @@ -0,0 +1,15 @@ +# standard +from unittest.mock import Mock + +# third party + +# first party +from delphi_claims_hosp.modify_claims_drops import (modify_and_write) + + +class TestDropsModification: + + def test_modify_and_write(self): + data_path = "./test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz" + logger = Mock() + modify_and_write(data_path, logger, force=False) From 2c9b21dafde761ccf6189befdb0756e6675e6086 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 15 Jun 2022 10:25:55 -0400 Subject: [PATCH 36/46] remove agg function --- .../delphi_claims_hosp/agg_claims_drops.py | 63 ------------------- 1 file changed, 63 deletions(-) delete mode 100644 claims_hosp/delphi_claims_hosp/agg_claims_drops.py diff --git a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py b/claims_hosp/delphi_claims_hosp/agg_claims_drops.py deleted file mode 100644 index f5206f334..000000000 --- a/claims_hosp/delphi_claims_hosp/agg_claims_drops.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 - -"""Aggregates chunks of drops. - -Drops are expected to be numbered as: - -../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz -../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz -... etc. -""" - -# standard -from pathlib import Path - -# third party -import numpy as np -import pandas as pd - - -def agg_and_write(data_path, logger): - """ - Aggregate drops given a folder path. - - Will output an aggregated version in the same folder. Example below. 
- - Input files in folder: - ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_1_07052020_1456.csv.gz - ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_2_07052020_1456.csv.gz - - Will create: - ../EDI_AGG_INPATIENT/EDI_AGG_INPATIENT_07052020_1456.csv.gz - - Args: - data_path: path to the folder with duplicated drops. - force: if aggregated file exists, whether to overwrite or not - - """ - files = np.array(list(Path(data_path).glob("*.csv.gz"))) - - for f in files: - filename = str(f) - out_path = f.parents[0] / f.name - dfs = pd.read_csv(f, dtype={"PatCountyFIPS": str, - "patCountyFIPS": str}) - if "servicedate" in dfs.columns: - dfs.rename(columns={"servicedate": "ServiceDate"}, inplace=True) - if "patCountyFIPS" in dfs.columns: - dfs.rename(columns={"patCountyFIPS": "PatCountyFIPS"}, inplace=True) - if "patHRRname" in dfs.columns: - dfs.rename(columns={"patHRRname": "Pat HRR Name"}, inplace=True) - if "patAgeGroup" in dfs.columns: - dfs.rename(columns={"patAgeGroup": "PatAgeGroup"}, inplace=True) - if "patHRRid" in dfs.columns: - dfs.rename(columns={"patHRRid": "Pat HRR ID"}, inplace=True) - - assert np.sum( - dfs.duplicated(subset=["ServiceDate", "PatCountyFIPS", - "Pat HRR Name", "PatAgeGroup"])) == 0, \ - f'Duplication across drops in {filename}!' 
- assert dfs.shape[1] == 10, f'Wrong number of columns in {filename}' - - dfs.to_csv(out_path, index=False) - logger.info(f"Wrote {out_path}") From 080ad4515a19e5af22dec5812af11a1acb74cfaf Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 15 Jun 2022 10:26:25 -0400 Subject: [PATCH 37/46] update import info for the modification function --- claims_hosp/delphi_claims_hosp/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index 7817ef4f1..9092f46dd 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -17,7 +17,7 @@ # first party from .config import Config from .download_claims_ftp_files import download -from .agg_claims_drops import agg_and_write +from .modify_claims_drops import modify_and_write from .get_latest_claims_name import get_latest_filename from .update_indicator import ClaimsHospIndicatorUpdater @@ -62,7 +62,7 @@ def run_module(params): params["indicator"]["input_dir"], logger) # aggregate data - agg_and_write(params["indicator"]["input_dir"], logger) + modify_and_write(params["indicator"]["input_dir"], logger) # find the latest files (these have timestamps) claims_file = get_latest_filename(params["indicator"]["input_dir"], logger) From 819b043698a7f86a51d27e417acf4f2e7579240f Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Wed, 15 Jun 2022 10:27:03 -0400 Subject: [PATCH 38/46] add j2 file for claims_hosp params --- .../templates/claims_hosp-params-prod.py.j2 | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 ansible/templates/claims_hosp-params-prod.py.j2 diff --git a/ansible/templates/claims_hosp-params-prod.py.j2 b/ansible/templates/claims_hosp-params-prod.py.j2 new file mode 100644 index 000000000..5b536ada1 --- /dev/null +++ b/ansible/templates/claims_hosp-params-prod.py.j2 @@ -0,0 +1,43 @@ +{ + "common": { + "export_dir": "./receiving", + "log_exceptions": false + }, 
+ "indicator": { + "input_dir": "./retrieve_files", + "start_date": "2020-02-01", + "end_date": null, + "drop_date": null, + "n_backfill_days": 70, + "n_waiting_days": 3, + "write_se": false, + "obfuscated_prefix": "foo_obfuscated", + "parallel": false, + "geos": ["state", "msa", "hrr", "county"], + "weekday": [true, false], + "ftp_credentials": { + "host": "{{ claims_hosp_ftp_host }}", + "user": "{{ claims_hosp_ftp_user }}", + "pass": "{{ claims_hosp_ftp_password }}", + "port": 2222 + } + }, + "validation": { + "common": { + "data_source": "hospital-admissions", + "span_length": 14, + "min_expected_lag": {"all": "3"}, + "max_expected_lag": {"all": "4"}, + "dry_run": true, + "suppressed_errors": [] + }, + "static": { + "minimum_sample_size": 5, + "missing_se_allowed": true, + "missing_sample_size_allowed": true + }, + "dynamic": { + "ref_window_size": 7 + } + } +} From cfb682c5da01bb08b77d57640330b9a1e5edfdd6 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Wed, 15 Jun 2022 14:47:29 -0400 Subject: [PATCH 39/46] Manually merge vault changes --- ansible/vault.yaml | 472 +++++++++++++++++++++++---------------------- 1 file changed, 237 insertions(+), 235 deletions(-) diff --git a/ansible/vault.yaml b/ansible/vault.yaml index cede7133f..8339482ba 100644 --- a/ansible/vault.yaml +++ b/ansible/vault.yaml @@ -1,236 +1,238 @@ $ANSIBLE_VAULT;1.1;AES256 -34613064323664616266326436376330366432336666656438346663383165363865363966343266 -6636656139316131663836633137613730393836666437330a343930613438623366393130653964 -61366435633630353363333631326566663865376462326231643930336435336132346233663934 -3135643864366232380aa643833396531656634346465666333 +31366434386538373830363161656636666435373663326263326366303932376132653265376139 +3632333431326631650arom 6f4976ce78984bddad02e56ba567f4c2dd3c2910 Mon Sep 17 00:00:00 2001 From: Jingjing Tang <31444565+jingjtang@users.noreply.github.com> Date: Thu, 16 Jun 2022 13:51:15 -0400 Subject: [PATCH 40/46] Update logger info 
Co-authored-by: Katie Mazaitis --- claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index f195ce713..7aa748d4f 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -52,7 +52,7 @@ def download(ftp_credentials, out_path, logger): """Pull the latest raw files.""" current_time = datetime.datetime.now() seconds_in_day = 24 * 60 * 60 - logger.info(f"current time is {current_time}") + logger.info("starting download", time=current_time) # open client client = paramiko.SSHClient() From 1c83ffde32c2dbc0551a1550c0f5ffd992b27f2a Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Thu, 16 Jun 2022 15:24:24 -0400 Subject: [PATCH 41/46] add min_max_dates, csv_export_count, etc to logger info --- claims_hosp/delphi_claims_hosp/run.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/claims_hosp/delphi_claims_hosp/run.py b/claims_hosp/delphi_claims_hosp/run.py index 9092f46dd..6c7405a36 100644 --- a/claims_hosp/delphi_claims_hosp/run.py +++ b/claims_hosp/delphi_claims_hosp/run.py @@ -102,6 +102,8 @@ def run_module(params): weekday = params["indicator"]["weekday"], write_se = params["indicator"]["write_se"]) + max_dates = [] + n_csv_export = [] # generate indicator csvs for geo in params["indicator"]["geos"]: for weekday in params["indicator"]["weekday"]: @@ -132,6 +134,8 @@ def run_module(params): params["common"]["export_dir"], logger, ) + max_dates.append(updater.output_dates[-1]) + n_csv_export.append(len(updater.output_dates)) logger.info("finished updating", geo = geo) # Remove all the raw files @@ -141,5 +145,12 @@ def run_module(params): logger.info('Remove all the raw files.') elapsed_time_in_seconds = round(time.time() - start_time, 2) + min_max_date = 
min(max_dates) + max_lag_in_days = (datetime.now() - min_max_date).days + csv_export_count = sum(n_csv_export) + formatted_min_max_date = min_max_date.strftime("%Y-%m-%d") logger.info("Completed indicator run", - elapsed_time_in_seconds = elapsed_time_in_seconds) + elapsed_time_in_seconds = elapsed_time_in_seconds, + csv_export_count = csv_export_count, + max_lag_in_days = max_lag_in_days, + oldest_final_export_date = formatted_min_max_date) From 50c93fb5ea4ecca89fc43690dd17b6d12173af35 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Fri, 17 Jun 2022 13:44:47 -0400 Subject: [PATCH 42/46] update the unit test for modification of the raw files --- .../delphi_claims_hosp/modify_claims_drops.py | 5 ++++- claims_hosp/tests/test_modify_claims_drops.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/modify_claims_drops.py b/claims_hosp/delphi_claims_hosp/modify_claims_drops.py index 6fcd88b85..f40236249 100644 --- a/claims_hosp/delphi_claims_hosp/modify_claims_drops.py +++ b/claims_hosp/delphi_claims_hosp/modify_claims_drops.py @@ -30,7 +30,7 @@ def modify_and_write(data_path, logger, force=True): """ files = np.array(list(Path(data_path).glob("*.csv.gz"))) - + dfs_list = [] for f in files: filename = str(f) out_path = f.parents[0] / f.name @@ -56,3 +56,6 @@ def modify_and_write(data_path, logger, force=True): if force: dfs.to_csv(out_path, index=False) logger.info(f"Wrote {out_path}") + else: + dfs_list.append(dfs) + return files, dfs_list diff --git a/claims_hosp/tests/test_modify_claims_drops.py b/claims_hosp/tests/test_modify_claims_drops.py index f2fdfa4d2..2905a3caa 100644 --- a/claims_hosp/tests/test_modify_claims_drops.py +++ b/claims_hosp/tests/test_modify_claims_drops.py @@ -1,15 +1,19 @@ # standard from unittest.mock import Mock +from pathlib import Path # third party - -# first party from delphi_claims_hosp.modify_claims_drops import (modify_and_write) class TestDropsModification: def 
test_modify_and_write(self): - data_path = "./test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz" + data_path = "./tests/test_data/" logger = Mock() - modify_and_write(data_path, logger, force=False) + files, dfs_list = modify_and_write(data_path, logger, force=False) + expected_colnames = ['PatCountyFIPS', 'Pat HRR Name', 'Pat HRR ID', 'PatAgeGroup'] + assert len(files) == 1 + assert len(dfs_list) == 1 + assert all(files == [Path('tests/test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz')]) + assert set(expected_colnames).issubset(set(dfs_list[0].columns)) From 300397ff16fbcc797b05351df49f9de0469c15b1 Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Fri, 17 Jun 2022 13:48:50 -0400 Subject: [PATCH 43/46] fix the error in the unit test --- claims_hosp/tests/test_modify_claims_drops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/claims_hosp/tests/test_modify_claims_drops.py b/claims_hosp/tests/test_modify_claims_drops.py index 2905a3caa..e4a6ba83f 100644 --- a/claims_hosp/tests/test_modify_claims_drops.py +++ b/claims_hosp/tests/test_modify_claims_drops.py @@ -9,11 +9,11 @@ class TestDropsModification: def test_modify_and_write(self): - data_path = "./tests/test_data/" + data_path = "./test_data/" logger = Mock() files, dfs_list = modify_and_write(data_path, logger, force=False) expected_colnames = ['PatCountyFIPS', 'Pat HRR Name', 'Pat HRR ID', 'PatAgeGroup'] assert len(files) == 1 assert len(dfs_list) == 1 - assert all(files == [Path('tests/test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz')]) + assert files[0] == Path('./test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz') assert set(expected_colnames).issubset(set(dfs_list[0].columns)) From 76b4db0fab385b2cba888dbc9b00c54e2f6dbd2e Mon Sep 17 00:00:00 2001 From: Jingjing Tang Date: Tue, 21 Jun 2022 19:51:39 -0400 Subject: [PATCH 44/46] rename the parameter for test_mode check --- claims_hosp/delphi_claims_hosp/modify_claims_drops.py | 10 +++++----- 
claims_hosp/tests/test_modify_claims_drops.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/modify_claims_drops.py b/claims_hosp/delphi_claims_hosp/modify_claims_drops.py index f40236249..0ab93ebcc 100644 --- a/claims_hosp/delphi_claims_hosp/modify_claims_drops.py +++ b/claims_hosp/delphi_claims_hosp/modify_claims_drops.py @@ -17,7 +17,7 @@ import pandas as pd -def modify_and_write(data_path, logger, force=True): +def modify_and_write(data_path, logger, test_mode=False): """ Modify drops given a folder path. @@ -26,7 +26,7 @@ def modify_and_write(data_path, logger, force=True): Args: data_path: path to the folder with duplicated drops. - force: if aggregated file exists, whether to overwrite or not + test_mode: Don't overwrite the drops if test_mode==True """ files = np.array(list(Path(data_path).glob("*.csv.gz"))) @@ -53,9 +53,9 @@ def modify_and_write(data_path, logger, force=True): f'Duplication across drops in {filename}!' assert dfs.shape[1] == 10, f'Wrong number of columns in {filename}' - if force: + if test_mode: + dfs_list.append(dfs) + else: dfs.to_csv(out_path, index=False) logger.info(f"Wrote {out_path}") - else: - dfs_list.append(dfs) return files, dfs_list diff --git a/claims_hosp/tests/test_modify_claims_drops.py b/claims_hosp/tests/test_modify_claims_drops.py index e4a6ba83f..91d3d048f 100644 --- a/claims_hosp/tests/test_modify_claims_drops.py +++ b/claims_hosp/tests/test_modify_claims_drops.py @@ -11,7 +11,7 @@ class TestDropsModification: def test_modify_and_write(self): data_path = "./test_data/" logger = Mock() - files, dfs_list = modify_and_write(data_path, logger, force=False) + files, dfs_list = modify_and_write(data_path, logger, test_mode=True) expected_colnames = ['PatCountyFIPS', 'Pat HRR Name', 'Pat HRR ID', 'PatAgeGroup'] assert len(files) == 1 assert len(dfs_list) == 1 From aef95115e303170dc6a805e8352ca8dee361685e Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Wed, 22 Jun 
2022 16:32:50 -0400 Subject: [PATCH 45/46] Remove try/except from get_timestamp by converting to re --- claims_hosp/.pylintrc | 1 - .../download_claims_ftp_files.py | 17 ++++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/claims_hosp/.pylintrc b/claims_hosp/.pylintrc index 8ba5e540a..7fc2f5c30 100644 --- a/claims_hosp/.pylintrc +++ b/claims_hosp/.pylintrc @@ -8,7 +8,6 @@ disable=logging-format-interpolation, no-self-use, # Allow pytest classes to have one test. too-few-public-methods, - broad-except [BASIC] diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index 7aa748d4f..7cb30c60e 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -5,6 +5,7 @@ import datetime import functools from os import path +import re # third party import paramiko @@ -24,19 +25,13 @@ def print_callback(filename, logger, bytes_so_far, bytes_total): if (rough_percent_transferred % 25) == 0: logger.info("Transfer in progress", filename=filename, percent=rough_percent_transferred) - +FILENAME_TIMESTAMP = re.compile(r".*EDI_AGG_INPATIENT_(?P<ymd>[0-9]*)_(?P<hm>[0-9]*)[^0-9]*") def get_timestamp(name): """Get the reference date in datetime format.""" - try: - split_name = name.split("_") - yyyymmdd = split_name[3] - hhmm = ''.join(filter(str.isdigit, split_name[4])) - timestamp = datetime.datetime.strptime(''.join([yyyymmdd, hhmm]), - "%Y%m%d%H%M") - except Exception: - timestamp = datetime.datetime(1900, 1, 1) - - return timestamp + m = FILENAME_TIMESTAMP.match(name) + if not m: + return datetime.datetime(1900, 1, 1) + return datetime.datetime.strptime(''.join(m.groups()), "%Y%m%d%H%M") def change_date_format(name): """Flip date from YYYYMMDD to MMDDYYYY.""" From 170f2e349c91c4806f4bc583af44169a6de33d64 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Fri, 24 Jun 2022 17:26:05 -0400 Subject:
[PATCH 46/46] Update expected file count for inpatient-only --- .../delphi_claims_hosp/download_claims_ftp_files.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py index 7cb30c60e..6c2a3f184 100644 --- a/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py +++ b/claims_hosp/delphi_claims_hosp/download_claims_ftp_files.py @@ -70,9 +70,10 @@ def download(ftp_credentials, out_path, logger): files_to_download.append(fileattr.filename) logger.info("File to download", filename=fileattr.filename) - # make sure we don't download more that the 3 chunked drops (2x a day) for OP - # and the 1 chunk (2x a day) for IP - 01/07/21, *2 for multiple day drops - assert len(files_to_download) <= 2 * ((3 * 2) + 2), "more files dropped than expected" + # make sure we don't download more than the 1 chunk (2x a day) drops for IP - 01/07/21, + # *2 for multiple day drops + assert len(files_to_download) <= 2 * (2), \ + f"more files dropped ({len(files_to_download)}) than expected (4)" filepaths_to_download = {} for file in files_to_download: