In [3]:
#import libraries
import pandas as pd
import numpy as np
import re

#define handy functions
def panda_stripper(df):
    '''
    Strips leading/trailing whitespace from the string values of a pandas dataframe, in place.
    INPUT: df
    Action: for every object-dtype column, strips each value that is a str;
            non-string values in those columns (numbers, None/NaN, dates, ...) are left untouched
    OUTPUT: the same df object, with the object-dtype columns overwritten
    '''
    text_columns = df.select_dtypes(['object']).columns
    if len(text_columns) == 0:
        # no object columns, so there is nothing to strip
        return df

    def strip_if_string(value):
        # Series.str.strip() would replace non-string elements with NaN,
        # so mixed columns (e.g. numbers next to text) are handled value by value instead
        return value.strip() if isinstance(value, str) else value

    # DataFrame.apply hands over one column (Series) at a time here
    df[text_columns] = df[text_columns].apply(lambda col: col.map(strip_if_string))
    return df

def get_sample_id(df, column):
    '''
    Extracts sample_id from a df column.
    INPUT: df, column name
    Expected format: "BCB111 / BIS21-027 :: Serum"
    Action: splits each string at the first whitespace character or comma and keeps the part before it;
            values that are not strings (e.g. NaN from a blank cell) are passed through unchanged
    OUTPUT: the same df, with a column named 'sample_id' added (or overwritten) holding the extracted value
    '''
    def first_token(value):
        if not isinstance(value, str):
            # re.split raises TypeError on NaN/None/numbers; keep such cells as they are
            return value
        # only the first piece is used, so stop splitting after one separator
        return re.split(r"\s|,", value, maxsplit=1)[0]

    df['sample_id'] = df[column].map(first_token)
    return df

def wildlife_merge(df1, df2):
    '''
    Outer-joins df2 onto df1 using the 'sample_id' column.
    INPUT: df1, df2 (both must contain a 'sample_id' column)
    Action: drops rows of df2 that repeat an already-seen sample_id (first occurrence is kept),
            then merges so that sample_ids found in either frame appear in the result
    OUTPUT: new merged df; df1 and df2 are not modified
    '''
    right_unique = df2.drop_duplicates(subset='sample_id')
    merged = df1.merge(right_unique, how='outer', on='sample_id')
    return merged



In [10]:
#read excel table and strip
deer_table = pd.read_excel('data/Muledeer_2021_22_comp_sample sheet.xlsx')
deer_table = panda_stripper(deer_table)
#rename columns (positional: relies on the sheet having exactly these 14 columns in this order)
deer_table.columns = ['sample_id', 'collar_id', 'species', 'sex', 'capture_date', 'capture_unit', 'staging_area', 'weight', 'age', 'capture_lat', 'capture_long', 'body_condition', 'lactation', 'comments']

#read and strip adenovirus table
adenovirus_df = pd.read_excel('data/deer_tables.xlsx', sheet_name='adenovirus')
adenovirus_df = panda_stripper(adenovirus_df)
#get sample id
adenovirus_df = get_sample_id(adenovirus_df, 'sample_id')
#combine result1 and result2 cols: take result1 where present, otherwise result2
#(combine_first tests for missing values properly; an `is not np.NaN` identity check only
# matches the exact np.nan object, and the np.NaN alias no longer exists in NumPy 2.0)
adenovirus_df['adenovirus_result'] = adenovirus_df['result1'].combine_first(adenovirus_df['result2'])
adenovirus_df = adenovirus_df.drop(columns=['result1', 'result2'])
#trim "Negative @" result: keep only the first word; missing or blank results are left as they are
adenovirus_df['adenovirus_result'] = adenovirus_df['adenovirus_result'].map(
    lambda result: result.split()[0] if isinstance(result, str) and result.strip() else result
)
#merge with deer_table
deer_table = wildlife_merge(deer_table, adenovirus_df)

#read and strip EHDV table
ehdv_df = pd.read_excel('data/deer_tables.xlsx', sheet_name='ehdv', usecols=[0,2])
ehdv_df = panda_stripper(ehdv_df)
#get sample_id and drop original col
ehdv_df = get_sample_id(ehdv_df, 'Animals::Specimens')
del ehdv_df['Animals::Specimens']
#rename cols (positional: the remaining result column first, then the sample_id column added above)
ehdv_df.columns = ['ehdv_result', 'sample_id']
#merge with deer_table
deer_table = wildlife_merge(deer_table, ehdv_df)

#read and strip bluetongue table
bluetongue_df = pd.read_excel('data/deer_tables.xlsx', sheet_name='bluetongue')
bluetongue_df = panda_stripper(bluetongue_df)
#get sample_id and drop original col
bluetongue_df = get_sample_id(bluetongue_df, 'specimen')
del bluetongue_df['specimen']
#rename cols (positional: assumes the sheet holds only 'specimen' plus one result column -- TODO confirm)
bluetongue_df.columns = ['bluetongue_result', 'sample_id']
#merge with deer_table
deer_table = wildlife_merge(deer_table, bluetongue_df)

#export to xlsx for office use
deer_table.to_excel('data/finals/Mule Deer 2021-2022 Lab Results.xlsx', index=False)
