# Elk ETL

In [1]:
#import libraries
import pandas as pd
import numpy as np
import re

#import functions
from functions import get_sample_id
from functions import panda_stripper

In [2]:
#read main table and strip
elk_table = pd.read_excel('data/Elk captures 21-22.xlsx')
elk_table = panda_stripper(elk_table)

#rename cols
elk_table.columns = ['sample_id', 'archive_id', 'collar_id', 'species', 'sex', 'capture_date', 'capture_unit', 'staging_area', 'age', 'comments']

## Extract and tranform lab results

### ehdv

 Epizootic Hemorrhagic Disease Virus tests come in from the PDFs in a very messy format, so we must deal with it. 


In [3]:
#read and strip table
ehdv_raw_df = pd.read_excel('data/elk_tables.xlsx', sheet_name='ehdv', usecols=[0,1])
ehdv_raw_df = panda_stripper(ehdv_raw_df)

ehdv_raw_df.head(15)

Unnamed: 0,test_and_result,animal
0,Test: Epizootic Hemorrhagic Disease Virus Type...,"Animal: 1, BCE1132/EL21-137"
1,Result Titer,
2,Positve >1280,
3,Test: Epizootic Hemorrhagic Disease Virus Type...,"Animal: 1, BCE1132/EL21-137"
4,Result Titer,
5,Negative <20,
6,Test: Epizootic Hemorrhagic Disease Virus Type...,"Animal: 1, BCE1132/EL21-137"
7,Result Titer,
8,Negative <20,
9,Test: Epizootic Hemorrhagic Disease Virus Type...,"Animal: 2, BCE1134/EL21-138"


In [4]:
#select every 3rd row starting with the 1st row
ehdv_df = ehdv_raw_df.iloc[0::3]
#reset index
ehdv_df = ehdv_df.reset_index(drop=True)
#rename cols
ehdv_df.columns = ['test', 'animal']

#extract result and titer from ehdv_raw_df (i.e., every 3rd row starting with 2nd row)
df = ehdv_raw_df.iloc[2::3]
del df['animal']
#reset index
df = df.reset_index(drop=True)
df.columns = ['result']

#merge
ehdv_df = pd.concat([ehdv_df, df], axis=1)

#get sample ids
ehdv_df['sample_id'] = ehdv_df['animal'].apply(lambda row: re.split(r"\s|/", row)[2][:7])
del ehdv_df['animal']

#split into results and values
ehdv_df['val'] = ehdv_df.result.apply(lambda row: row.split()[1])
ehdv_df['result'] = ehdv_df.result.apply(lambda row: row.split()[0])

#split ehdv tests
ehdv1_df = ehdv_df[ehdv_df.test == 'Test: Epizootic Hemorrhagic Disease Virus Type 1 (VN)']
ehdv2_df = ehdv_df[ehdv_df.test == 'Test: Epizootic Hemorrhagic Disease Virus Type 2 (VN)']
ehdv6_df = ehdv_df[ehdv_df.test == 'Test: Epizootic Hemorrhagic Disease Virus Type 6 (VN)']

#rename cols
ehdv1_df.columns = ['test', 'ehdv_type1_result', 'sample_id', 'ehdv_type1_val']
ehdv2_df.columns = ['test', 'ehdv_type2_result', 'sample_id', 'ehdv_type2_val']
ehdv6_df.columns = ['test', 'ehdv_type6_result', 'sample_id', 'ehdv_type6_val']

#drop unneeded cols
del ehdv1_df['test']
del ehdv2_df['test']
del ehdv6_df['test']

#merge the 3 dfs into single tidy ehdv df
ehdv_df = ehdv1_df.merge(ehdv2_df, on='sample_id', how='outer')
ehdv_df = ehdv_df.merge(ehdv6_df, on='sample_id', how='outer')

#merge into main table
elk_table = elk_table.merge(ehdv_df, on='sample_id', how='outer')

### bovine viral diarrhea virus

In [5]:
#read and trip table
bvd_df = pd.read_excel('data/elk_tables.xlsx', sheet_name='bv_diarrhea')
bvd_df = panda_stripper(bvd_df)

#get sample ids
bvd_df['sample_id'] = bvd_df['label'].apply(lambda row: re.split(r"\s|/", row)[1])

#drop label col
del bvd_df['label']

#rename cols
bvd_df.columns = ['bvd_result', 'sample_id']

#merge with main table
elk_table = elk_table.merge(bvd_df, on='sample_id', how='outer')


### pegrant

In [6]:
#read and strip table
preggers_df = pd.read_excel('data/elk_tables.xlsx', sheet_name='preg', usecols=[1,2,3])
preggers_df = panda_stripper(preggers_df)

#get sample ids
preggers_df['sample_id'] = preggers_df['animal_id'].apply(lambda row: row[:7])

#drop unneeded col
del preggers_df['animal_id']

#rename cols
preggers_df.columns = ['preg_OD_val', 'preg_result', 'sample_id']

#merge into main table
elk_table = elk_table.merge(preggers_df, on='sample_id', how='outer')

### bluetongue

In [7]:
#read and strip table
bt_df = pd.read_excel('data/elk_tables.xlsx', sheet_name='bluetongue')
bt_df = panda_stripper(bt_df)

#get sample id
bt_df['sample_id'] = bt_df.label.apply(lambda row: row[:7])

#drop unneeded col
del bt_df['label']

#rename
bt_df.columns = ['bluetongue_result', 'sample_id']

#merge with main table
elk_table = elk_table.merge(bt_df, on='sample_id', how='outer')

In [8]:
elk_table.head()

Unnamed: 0,sample_id,archive_id,collar_id,species,sex,capture_date,capture_unit,staging_area,age,comments,ehdv_type1_result,ehdv_type1_val,ehdv_type2_result,ehdv_type2_val,ehdv_type6_result,ehdv_type6_val,bvd_result,preg_OD_val,preg_result,bluetongue_result
0,BCE1132,EL21-137,51628.0,Elk,Female,2022-03-07,Book Cliffs,Bryson Canyon,23.5,,Negative,<20,Positve,>1280,Negative,<20,Negative,0.66,Pregnant,Detected
1,BCE1134,EL21-138,46687.0,Elk,Female,2022-03-07,Book Cliffs,Bryson Canyon,19.5,,Negative,<20,Negative,<20,Negative,<20,Negative,0.649,Pregnant,Detected
2,BCE1135,EL21-139,46752.0,Elk,Female,2022-03-07,Book Cliffs,Bryson Canyon,10.5,,Negative,<20,Positive,40,Negative,<20,Negative,0.087,Open,Not detected
3,BCE1136,EL21-140,51622.0,Elk,Female,2022-03-07,Book Cliffs,Bryson Canyon,13.5,,Negative,<20,Negative,<20,Negative,<20,Negative,0.621,Pregnant,Not detected
4,BCE1138,EL21-141,46700.0,Elk,Female,2022-03-07,Book Cliffs,Bryson Canyon,22.5,,Negative,<20,Negative,<20,Negative,<20,,0.612,Pregnant,Not detected


## Load

### Export to excel for office use:

In [9]:
elk_table.to_excel('data/finals/Elk 2021-2022 Lab Results.xlsx')