Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove the outbreakSpecifics field #257

Merged
merged 1 commit into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions data-serving/data-service/schemas/cases.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -276,21 +276,6 @@
}
}
},
"outbreakSpecifics": {
"bsonType": "object",
"additionalProperties": false,
"properties": {
"_id": {
"bsonType": "objectId"
},
"livesInWuhan": {
"bsonType": "bool"
},
"reportedMarketExposure": {
"bsonType": "bool"
}
}
},
"pathogens": {
"bsonType": "array",
"uniqueItems": true,
Expand Down Expand Up @@ -416,6 +401,9 @@
},
"travel_history_binary": {
"bsonType": "string"
},
"lives_in_Wuhan": {
"bsonType": "string"
}
}
}
Expand Down
6 changes: 0 additions & 6 deletions data-serving/data-service/src/model/case.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ import { DemographicsDocument, demographicsSchema } from './demographics';
import { DictionaryDocument, dictionarySchema } from './dictionary';
import { EventDocument, eventSchema } from './event';
import { LocationDocument, locationSchema } from './location';
import {
OutbreakSpecificsDocument,
outbreakSpecificsSchema,
} from './outbreak-specifics';
import { PathogenDocument, pathogenSchema } from './pathogen';
import {
RevisionMetadataDocument,
Expand Down Expand Up @@ -37,7 +33,6 @@ const caseSchema = new mongoose.Schema(
required: 'Must include revision metadata',
},
notes: String,
outbreakSpecifics: outbreakSpecificsSchema,
pathogens: [pathogenSchema],
sources: {
type: [sourceSchema],
Expand Down Expand Up @@ -75,7 +70,6 @@ type CaseDocument = mongoose.Document & {
location: LocationDocument;
revisionMetadata: RevisionMetadataDocument;
notes: string;
outbreakSpecifics: OutbreakSpecificsDocument;
pathogens: [PathogenDocument];
sources: [SourceDocument];
symptoms: DictionaryDocument;
Expand Down
11 changes: 0 additions & 11 deletions data-serving/data-service/src/model/outbreak-specifics.ts

This file was deleted.

18 changes: 8 additions & 10 deletions data-serving/data-service/test/model/data/case.full.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,6 @@
"url": "https://www.colorado.gov/pacific/cdphe/news/10-new-presumptive-positive-cases-colorado-cdphe-confirms-limited-community-spread-covid-19"
}
],
"outbreakSpecifics": {
"livesInWuhan": false,
"reportedMarketExposure": true
},
"pathogens": [
{
"name": "sars-cov-2",
Expand All @@ -131,13 +127,15 @@
"notes": "initial data entry"
},
"importedCase": {
"additionalInformation": "Contact of a confirmed case at work.",
"notesForDiscussion": "Other stuff from notes",
"geoResolution": "admin_2",
"ID": "xyz",
"additional_information": "Contact of a confirmed case at work.",
"notes_for_discussion": "Other stuff from notes",
"geo_resolution": "admin_2",
"symptoms": "severe pneumonia:dyspnea:weakness:some free-form symptoms:that don't match the symptom dictionary\"",
"chronicDiseaseBinary": true,
"chronicDisease": "hypertension:type 2 diabetes:coronary heart disease:lung cancer:some free-form chronic diseases:that don't match the chronic disease dictionary",
"chronic_disease_binary": "true",
"outcome": "discharge 2/12",
"adminId": "291"
"admin_id": "291",
"lives_in_Wuhan": "false",
"reported_market_exposure": "true"
}
}

This file was deleted.

This file was deleted.

23 changes: 0 additions & 23 deletions data-serving/data-service/test/model/outbreak-specifics.test.ts

This file was deleted.

8 changes: 3 additions & 5 deletions data-serving/samples/cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,6 @@
"url": "https://www.colorado.gov/pacific/cdphe/news/10-new-presumptive-positive-cases-colorado-cdphe-confirms-limited-community-spread-covid-19"
}
],
"outbreakSpecifics": {
"livesInWuhan": false,
"reportedMarketExposure": true
},
"pathogens": [
{
"name": "sars-cov-2",
Expand Down Expand Up @@ -194,7 +190,9 @@
"geo_resolution": "admin_2",
"chronic_disease_binary": "true",
"outcome": "discharge 2/12",
"admin_id": "291"
"admin_id": "291",
"lives_in_Wuhan": "false",
"reported_market_exposure": "true"
}
},
{
Expand Down
20 changes: 12 additions & 8 deletions data-serving/scripts/convert-data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ The following fields are lossy:

- `demographics.ageRange`: Some values are too large to be ages. Ex. row `002-23162` with age value `2073`.
- `events[name='onsetSymptoms']`: Some values are in an invalid format, ex. row `000-1-20073` with value `08.03.20202`
- `outbreakSpecifics.reportedMarketExposure`: Some values are not bools, ex. row `000-1-13167` has value
`exposed to people who come back from wuhan`
- `travelHistory.location`: This field is highly unstructured, and includes lists of locations, free-form text, and
locations of all (unmarked) granularity.
- `travelHistory.dateRange`: As with `events[name='onsetSymptoms']`, the date format varies.
Expand All @@ -45,22 +43,16 @@ The following fields are lossy:
The following fields are *not* lossy, although they require conversion to a new type:

- `sex`
- `outbreakSpecifics.livesInWuhan`
- `location.geometry.latitude`, `location.geometry.longitude`
- `events[name='admissionHospital']`, `events[name='confirmed']`, `events[name='deathOrDischarge']`

### Future improvements

- Improve disambiguation of `travelHistory.location`. For example, if the person lives in Florida and has traveled to
Georgia, it's more likely to be the state than the country.

- Add validation logic to all dates to ensure that they are between 12/2019 and today.

- If a date fails to parse/validate in the `mm/dd/yy` format, attempt to parse it in other formats, including
`dd/mm/yy`, `mm.dd.yy`, and `dd.mm.yy`.

- Take free-form text from `outbreakSpecifics.reportedMarketExposure` and add it to the notes field.

- Clean up the source data in the case of obvious errors in the logs, e.g. ages in the thousands or dates with one too
many or too few digits.

Expand Down Expand Up @@ -109,6 +101,18 @@ Fields that can't be converted include:
- `source.id` and `pathogens.sequenceSource.id`: Sources may have ids to link them to the new `sources` collection; it's
possible that we may be able to backfill this later once that dataset is developed and we can cross-reference by URL.

Fields that are not carrying over to the new schema, though they will be included in `importedCase`:

- Fields that were relevant early on in the outbreak, but aren't tracked any longer: `lives_in_Wuhan`,
`reported_market_exposure`

- Fields supplanted by new values: `ID`

- Non-normalized or redundant location fields, including `province`, `geo_resolution`, `location`, `admin3`,
`country_new`, `admin_id`

- Fields whose values can be imputed from other fields: `geo_resolution`, `chronic_disease_binary`

### Backfilled fields

We are backfilling fields including:
Expand Down
4 changes: 2 additions & 2 deletions data-serving/scripts/convert-data/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
''' The path to the geocoding script in the nCoV2019 repo. '''
GEOCODER_REPO_PATH = 'code/sheet_cleaner/geocoding'

# TODO(khmoran): Include 'outcome' once the curator UI transitions to using the
# TODO(khmoran): Exclude 'outcome' once the curator UI transitions to using the
# new events-based outcome field.
LOSSY_FIELDS = [
'ID', 'province', 'geo_resolution', 'date_onset_symptoms',
'date_admission_hospital', 'date_confirmation', 'travel_history_dates',
'travel_history_location', 'reported_market_exposure',
'chronic_disease_binary', 'outcome', 'location', 'admin3', 'country_new',
'admin_id', 'travel_history_binary'
'admin_id', 'travel_history_binary', 'lives_in_Wuhan'
]

'''
Expand Down
5 changes: 1 addition & 4 deletions data-serving/scripts/convert-data/convert_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
convert_demographics, convert_dictionary_field, convert_events,
convert_imported_case, convert_location, convert_revision_metadata_field,
convert_notes_field, convert_sources_field, convert_pathogens_field,
convert_outbreak_specifics, convert_travel_history)
convert_travel_history)
from typing import Any
from constants import (
DATA_CSV_FILENAME, DATA_GZIP_FILENAME, DATA_REPO_PATH, GEOCODER_DB_FILENAME,
Expand Down Expand Up @@ -151,9 +151,6 @@ def convert(infile: str, outfile: str, geocoder: Any,
json_case['pathogens'] = convert_pathogens_field(
csv_case['sequence_available'])

json_case['outbreakSpecifics'] = convert_outbreak_specifics(
csv_case['ID'], csv_case['reported_market_exposure'], csv_case['lives_in_Wuhan'])

json_case['travelHistory'] = convert_travel_history(
geocoder, csv_case['ID'],
csv_case['travel_history_dates'],
Expand Down
43 changes: 0 additions & 43 deletions data-serving/scripts/convert-data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,49 +402,6 @@ def convert_pathogens_field(sequence: str) -> List[Dict[str, Any]]:
}] if sources else None


def convert_outbreak_specifics(id: str, reported_market_exposure: str,
                               lives_in_wuhan: str) -> Dict[str, bool]:
    '''
    Converts the covid-19-specific fields into a new outbreakSpecifics
    object.

    Each raw value is run through parse_bool. A value that parses to None
    is omitted from the result; a value that raises ValueError is reported
    through log_error and likewise omitted, so one bad field does not
    prevent the other from being converted.

    Parameters:
      id: The id of the input row for logging a failed conversion.
      reported_market_exposure: The raw value of the
        'reported_market_exposure' column.
      lives_in_wuhan: The raw value of the 'lives_in_Wuhan' column.

    Returns:
      None: When neither field yields a boolean (presumably because the
        inputs are empty or unparseable).
      Dict[str, bool]: When at least one field converts. The dictionary is
        in the format (either key may be absent):
        {
          'reportedMarketExposure': bool,
          'livesInWuhan': bool
        }
    '''
    # (source CSV column, key in the output object, raw value). The source
    # column name is what gets logged on failure, so it must match the CSV
    # header exactly -- note the capital 'W' in 'lives_in_Wuhan'.
    fields = (
        ('reported_market_exposure', 'reportedMarketExposure',
         reported_market_exposure),
        ('lives_in_Wuhan', 'livesInWuhan', lives_in_wuhan),
    )

    outbreak_specifics = {}

    for csv_field, json_field, raw_value in fields:
        try:
            normalized = parse_bool(raw_value)
        except ValueError as e:
            log_error(
                id, csv_field, 'outbreakSpecifics.' + json_field,
                raw_value, e)
            continue

        # parse_bool signals "no value" with None; keep False, drop None.
        if normalized is not None:
            outbreak_specifics[json_field] = normalized

    # An empty dict collapses to None so callers see "no data" uniformly.
    return outbreak_specifics or None


def convert_travel_history(geocoder: Any, id: str, dates: str,
location: str) -> Dict[str, Any]:
'''
Expand Down