In [1]:
import pandas as pd

In [2]:
# Read in an csv file
df = pd.read_csv('report_guest_data_va.csv', header=1)

In [3]:
df.head()


Unnamed: 0,Organization Name,Organization Address,Organization City,Organization State,Organization ZIP,Jurisdiction,Organization Country,Phone,Status,Case Number,...,Primary,Suspected/Observed,Admission Comments,Rescue State,Rescue Juristiction,Rescue Address,Other Rescue Information,Latitude,Longitude,Elevation
0,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000095,...,Yes,Suspected,,ID,Boise County,,,43.612476,-116.216784,817.056641
1,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000096,...,Yes,Suspected,,TX,Brown County,,,28.778392,-89.033203,-403.770294
2,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000106,...,Yes,Suspected,,FL,Duval County/City of Jacksonville,,,39.749282,-84.194842,224.74234
3,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000110,...,No,Observed,,MN,Dakota County,,,,,
4,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000110,...,Yes,Suspected,,MN,Dakota County,,,,,


In [4]:
df.columns

Index(['Organization Name', 'Organization Address', 'Organization City',
       'Organization State', 'Organization ZIP', 'Jurisdiction',
       'Organization Country', 'Phone', 'Status', 'Case Number', 'Patient ID',
       'Other Idenifier', 'Class', 'Common Species Name', 'Species',
       'Subspecies', 'Genus', 'TSN Code', 'Gender', 'Patient Flag',
       'Date Found', 'Time Found', 'Date Admitted', 'Time Admitted',
       'Details of rescue', 'Care Given', 'Circumstances of Rescue', 'Primary',
       'Suspected/Observed', 'Admission Comments', 'Rescue State',
       'Rescue Juristiction', 'Rescue Address', 'Other Rescue Information',
       'Latitude', 'Longitude', 'Elevation'],
      dtype='object')

# Target Schema for Mapping


In [None]:
'Organization Name', 'Case Number', 'Patient ID', 'Common Species Name',
       'Date Admitted', 'Circumstances of Rescue', 'Rescue State',
       'Rescue Juristiction', 'Rescue Address', 'Other Rescue Information',
       'Latitude', 'Longitude', 'Elevation', 'Disposition'

## Drop Rows that have no Lat / Long information

In [5]:
# Count how many NaN values there are in the Latitude column
df['Latitude'].isnull().sum()

np.int64(168)

In [6]:
# Drop all rows with NaN values in the Latitude column
df = df.dropna(subset=['Latitude'])
assert df['Latitude'].isnull().sum() == 0

In [7]:
# Check that there are no rows with NaN values in the Longitude column
assert df['Longitude'].isnull().sum() == 0

## Inspecting Jurisdiction and Rescue State Info

In [18]:
df['Jurisdiction'].unique()

array(['Waynesboro (City)'], dtype=object)

In [19]:
df['Rescue State'].unique()

array(['ID', 'TX', 'FL', 'VA', 'MN', 'MD', 'CA', 'NJ', 'IL', 'OR', 'MI',
       'UT', 'IN', 'OH', 'WI', 'WA', 'VT', 'KA', 'NY', 'IA', 'AL', 'GA',
       'AK', 'CT', 'OK', 'ME', 'NC', 'NE', 'AR', 'KY', 'MO', 'PA', 'RI',
       'NV', 'AB', 'QC', 'BC', 'MB', 'MA', 'SC', 'NM', 'ND', 'MT', 'CO',
       'AZ'], dtype=object)

## Profiling Disposition

In [20]:
df['Care Given'].unique()

array([nan, 'offered water', 'Syringed water by mouth.', 'put in box',
       'Bottle fed pedialyte to rehydrate', 'none',
       'Put in box and covered', 'Put in empty ice chest',
       'usual first aid', 'water', 'Bowl of Water',
       'Covered and put in box', 'gave it hotdogs', 'nothing',
       'placed in box and transported to humane society',
       'fed water and chicken', 'no', 'FLUIDS', 'deer carcas', 'yes',
       'administered Sub Q Fluids, kept warm',
       'Applied helium gas to re-inflate', 'fed kitten milk',
       'Fed cows milk and cheerios', 'No', 'chicken legs', 'Put in a box',
       'offered water and bird seed', 'Offered water,kale,and apples',
       'fed water',
       "Painter's tape wrapped around the body, preventing duck from flapping",
       'kjhgkhg', 'fed caviar', 'eau', 'Transport',
       'Head Injury Protocols', 'sq fluids, metacam (1.5 mg/ml)', 'na',
       'half bottle of milk', 'put in box and driven directly to center',
       'none; brought 

## Profiling Patient ID and Case Number (duplicates?)

In [20]:
# Check the Case Number column for duplicates
assert df['Case Number'].duplicated().sum() == 0

AssertionError: 

In [25]:
# Print out a list of the duplicated Case Numbers
# df[df['Case Number'].duplicated(keep=False)][['Case Number', 'Patient ID']]
# df[df['Case Number'].duplicated(keep=False)]['Case Number']
duplicate_case_numbers = list(df[df['Case Number'].duplicated()]['Case Number'])
print(duplicate_case_numbers)
print(f"There are {len(duplicate_case_numbers)} duplicate case numbers")

['10-0000135', '11-0001425', '11-0007396', '11-0010623', '13-0000820', '13-0031922', '13-0031923', '15-0020320', '16-0075828', '19-0006124', '10-0000105', '17-0100773', '15-0014855']
There are 13 duplicate case numbers


In [30]:
df[df['Case Number'] == duplicate_case_numbers[0]]

Unnamed: 0,Organization Name,Organization Address,Organization City,Organization State,Organization ZIP,Jurisdiction,Organization Country,Phone,Status,Case Number,...,Primary,Suspected/Observed,Admission Comments,Rescue State,Rescue Juristiction,Rescue Address,Other Rescue Information,Latitude,Longitude,Elevation
11,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000135,...,No,Suspected,,NJ,Middlesex County,,,40.502189,-74.40659,38.416168
12,Visitor WILD-ONe Account,1800 South Delphine Ave PO Box 1557,Waynesboro,VA,22980,Waynesboro (City),United States,(540) 942-9453,Active,10-0000135,...,Yes,Suspected,,NJ,Middlesex County,,,40.502189,-74.40659,38.416168


KeyError: 11