In [None]:
import pandas as pd
import geopandas as gp
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from scipy.stats import binomtest

# Prepare veterinary data

This script prepares the veterinary and tuberculin batch data from UKFarmcare for the TB Diagnostics model.

## Load raw data

The data is supplied in Excel format, encrypted, with a sheet per year. Here we load individual year sheets that have been extracted as CSVs.
There are no batch numbers for 2015.

In [None]:
# columns to read from each yearly sheet; the batch column comes last so that
# cols[:-1] selects the columns available for years without batch numbers
cols = [
    'Date',
    'CPH',
    'Practice',
    'Batch Numbers',
]

In [None]:
# reader options shared by every yearly sheet: everything is read as text,
# except Date, which is parsed day-first
read_opts = dict(dtype=str, parse_dates=['Date'], dayfirst=True)

# 2015 is read without the batch column (cols[:-1])
data_2015 = pd.read_csv('/Data/TB_Diagnostics/TB_Vet_Data/TestData_2015.csv', usecols=cols[:-1], **read_opts)
data_2016 = pd.read_csv('/Data/TB_Diagnostics/TB_Vet_Data/TestData_2016.csv', usecols=cols, **read_opts)
data_2017 = pd.read_csv('/Data/TB_Diagnostics/TB_Vet_Data/TestData_2017.csv', usecols=cols, **read_opts)
data_2018 = pd.read_csv('/Data/TB_Diagnostics/TB_Vet_Data/TestData_2018.csv', usecols=cols, **read_opts)
data_2019 = pd.read_csv('/Data/TB_Diagnostics/TB_Vet_Data/TestData_2019.csv', usecols=cols, **read_opts)

## Concatenate all years

In [None]:
# stack the yearly frames in chronological order under a fresh 0..n-1 index
yearly_frames = [data_2015, data_2016, data_2017, data_2018, data_2019]
data = pd.concat(yearly_frames, ignore_index=True)

## Clean data

In [None]:
# A row is only dropped when it has neither a practice nor a batch entry;
# rows carrying at least one of the two are kept.
key_columns = ['Practice', 'Batch Numbers']
data = data.dropna(subset=key_columns, how='all')

In [None]:
# Strip the literal '*' marker from practice names (regex=False: '*' is not a pattern)
practice_clean = data['Practice'].str.replace('*', '', regex=False)
data = data.assign(Practice=practice_clean)

In [None]:
# Keep a single record per (Date, CPH) pair -- the first one encountered
data = data.drop_duplicates(subset=['Date', 'CPH'], keep='first')

In [None]:
# display the cleaned frame for a visual check
data

In [None]:
# display only the rows with no missing value in any column (data itself is not modified)
data.dropna(how='any')

In [None]:
# number of records per practice, most frequent first
data['Practice'].value_counts()

## Parse batch numbers

Unfortunately, batch numbers come in a bewildering array of formats. Generally, however, the Avian batch is the first six-digit number and the Bovine batch the second. Parse according to this rule, stripping away any other text.

In [None]:
# split out the batch numbers
# (here we're assuming the first six-digit number is avian, the second is bovine)
#
# Runs of exactly six digits are extracted directly from the free-text field;
# the lookarounds stop a longer number from being truncated to six digits.
# Splitting on whitespace with n=1 instead would leave everything after the
# first number in the second column, so a trailing space or any text after
# the bovine batch (e.g. "123456/654321.") would make the row fail a
# six-digit check and be dropped.
six_digit_runs = data['Batch Numbers'].str.findall(r'(?<![0-9])[0-9]{6}(?![0-9])')
batch_split = pd.DataFrame(
    {
        'BatchAvian': six_digit_runs.str[0],   # NaN when no six-digit number was found
        'BatchBovine': six_digit_runs.str[1],  # NaN when fewer than two were found
    },
    index=data.index,
).dropna()  # keep only rows where both batches could be identified

In [None]:
# Keep only rows where both parsed batches are exactly six digits long
avian_ok = batch_split.BatchAvian.str.contains('^[0-9]{6}$')
bovine_ok = batch_split.BatchBovine.str.contains('^[0-9]{6}$')
batch_split = batch_split[avian_ok & bovine_ok]

In [None]:
# Attach the parsed batches back onto the records by row index; records
# without a parsed batch pair get NaN in BatchAvian / BatchBovine.
data = data.join(batch_split, how='left')

In [None]:
# number of records per bovine batch, most frequent first
data['BatchBovine'].value_counts()

In [None]:
# number of records per avian batch, most frequent first
data['BatchAvian'].value_counts()

## Analysis

In [None]:
# records per practice as a (very wide) bar chart, most frequent first
data['Practice'].value_counts().plot(kind='bar', figsize=(60, 10))

In [None]:
# records per bovine batch as a (very wide) bar chart, most frequent first
data['BatchBovine'].value_counts().plot(kind='bar', figsize=(60, 10))

In [None]:
# records per avian batch as a (very wide) bar chart, most frequent first
data['BatchAvian'].value_counts().plot(kind='bar', figsize=(60, 10))

In [None]:
# Monthly number of records that have a practice / a batch entry.
#
# Both series are first aligned on one shared monthly index. pandas bar plots
# place bars by row position rather than by date, so plotting the two series
# separately onto the same axis would draw the batch counts (which only start
# in 2016, as the 2015 sheet has no batch column) under the wrong months.
# NOTE: 'M' (month end) is kept for compatibility with older pandas; recent
# pandas versions spell this alias 'ME'.
monthly_counts = pd.DataFrame({
    label: data.dropna(subset=[column]).Date.value_counts().sort_index().resample('M').sum()
    for label, column in (('Practice', 'Practice'), ('Batch', 'Batch Numbers'))
}).fillna(0)  # months present in only one series count as zero in the other

fig, ax = plt.subplots(figsize=(20,5))
for label, colour in (('Practice', 'red'), ('Batch', 'blue')):
    monthly_counts[label].plot.bar(ax=ax, color=colour, alpha=0.5, label=label)
ax.legend()

In [None]:
# display the frame again, now including the parsed BatchAvian / BatchBovine columns
data

## Raw output

In [None]:
# write the records without the raw free-text batch field (row index omitted)
raw_output = data.drop(columns=['Batch Numbers'])
raw_output.to_csv('/Data/TB_Diagnostics/vetData_nonCat.csv', index=False)

## Categorical encoding

Practice and Batch data are nominal, high-cardinality features, so we need to encode them down to at most 255 categories for Histogram-based GBT, preferably lower for better computational performance (this is traded off with predictive performance...).

One method to do this is Bayesian LeaveOneOut encoding [REF?], but this requires comparison to the target variable.
Another is Hashing, but this splits the feature into multiple features, losing explainability...

We choose here to take the 250 most frequent categories and an "other" category.

In [None]:
# Function to map categorical feature to an ID, grouping any beyond the top 250 into one ID and NaNs into one ID
#def map_feature_to_category_id(feature):
#    #function for limititing to 250
#    def top250(x): return x if x<250 else 250
#    size_order = list(feature.value_counts(dropna=False).index)
#    ids = list(map(top250,list(range(len(size_order)))))
#    index = {size_order[i]:ids[i] for i in range(len(size_order))}
#    return feature.apply(lambda x:index[x])

In [None]:
def map_feature_to_category_id(feature, n_top=250):
    """Encode a nominal feature as integer IDs ranked by frequency.

    The most frequent category gets ID 1, the next ID 2, and so on up to
    ``n_top``. Every less frequent category shares the single "other" ID
    ``n_top + 1``. Missing values stay missing.

    Parameters
    ----------
    feature : pandas.Series
        The categorical values to encode.
    n_top : int, default 250
        Number of most frequent categories that keep an ID of their own.

    Returns
    -------
    pandas.Series
        IDs in ``1..n_top + 1`` on the same index as ``feature``, with NaN
        wherever ``feature`` is NaN.
    """
    # value_counts() skips NaN and lists categories by descending frequency
    ranked_categories = feature.value_counts().index
    # The "other" ID must lie beyond n_top: capping ranks at n_top itself
    # would merge the n_top-th category into "other".
    other_id = n_top + 1
    mapping = {
        category: min(rank, other_id)
        for rank, category in enumerate(ranked_categories, start=1)
    }
    # Series.map yields NaN for values absent from the mapping, i.e. the NaNs
    return feature.map(mapping)

In [None]:
# Replace each high-cardinality column with its frequency-ranked category IDs
for column in ('Practice', 'BatchAvian', 'BatchBovine'):
    data[column] = map_feature_to_category_id(data[column])

## Categorical output

In [None]:
# write the encoded records without the raw free-text batch field (row index omitted)
encoded_output = data.drop(columns=['Batch Numbers'])
encoded_output.to_csv('/Data/TB_Diagnostics/vetData.csv', index=False)