In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from scipy.stats import binomtest
from scipy.stats import pearsonr
import geopandas as gp
from joblib import load

# Post-mapping analysis

~This uses the data mapped to top 250 categories, plus an 'other' category. Practice names are not retained in this data.~

Replaced by feature data with vet names for deeper analysis:

In [None]:
# Read the two practice/batch feature tables (each row carries a test result).
# An earlier revision read inputVars_NEW.csv with dtype=float instead.
csv_options = dict(parse_dates=['dateOfTest'], low_memory=False)
# Table used for the descriptive per-practice / per-batch analysis below
# (presumably the variant without category mapping, going by the file name - confirm)
inputVars = pd.read_csv('/Data/TB_Diagnostics/inputVars_noCat.csv', **csv_options)
# Table whose feature columns are fed to the saved model later in the notebook
inputVars_model = pd.read_csv('/Data/TB_Diagnostics/inputVars.csv', **csv_options)

In [None]:
# Separate the target column (y) from the remaining columns (X) for both tables,
# ready for the model validation further down.
target_col = 'confirmedBreakdown'
data_X = inputVars.drop(columns=[target_col])
data_y = inputVars[target_col].to_numpy().astype(bool)
data_X_model = inputVars_model.drop(columns=[target_col])
data_y_model = inputVars_model[target_col].to_numpy().astype(bool)

In [None]:
# Show the loaded feature table as the cell output (quick visual sanity check)
inputVars

In [None]:
# Proportion of positive tests by vet practice
# Reuse one groupby object for both aggregates instead of grouping twice.
tests_by_practice = inputVars.groupby('vetPractice')['resultOfTest']
practice_test_sum = tests_by_practice.sum()
practice_test_count = tests_by_practice.count()
# The plotted value is a per-practice proportion (positives / recorded tests),
# so the axis is labelled as a proportion rather than a count.
sb.histplot(practice_test_sum / practice_test_count)
plt.title('Proportion of positive tests by vet practice')
plt.xlabel('Proportion of positive tests')

In [None]:
# Proportion of positive tests by tuberculin batch (Bovine)
# Reuse one groupby object for both aggregates instead of grouping twice.
tests_by_batch = inputVars.groupby('batchBovine')['resultOfTest']
batch_test_sum = tests_by_batch.sum()
batch_test_count = tests_by_batch.count()
# The plotted value is a per-batch proportion (positives / recorded tests),
# so the axis is labelled as a proportion rather than a count.
sb.histplot(batch_test_sum / batch_test_count)
plt.title('Proportion of positive tests by tuberculin batch (Bovine)')
plt.xlabel('Proportion of positive tests')

In [None]:
# Share of "residual" tests per vet practice: a residual is a test whose result
# disagrees with the confirmed-breakdown flag (negative test with a breakdown,
# or positive test without one).
inputVars['residual'] = inputVars.resultOfTest.ne(inputVars.confirmedBreakdown)
residual_by_practice = inputVars.groupby('vetPractice')['residual']
practice_residual_sum = residual_by_practice.sum()
practice_residual_count = residual_by_practice.count()
sb.histplot(practice_residual_sum / practice_residual_count)
plt.xlabel('Proportion of residuals')
plt.title('Proportion of residuals by vet practice')

In [None]:
# Per-practice accuracy: fraction of tests whose result equals the
# confirmed-breakdown flag. The histogram is written out as a paper figure.
plt.rcParams['font.size'] = 18
inputVars['vet_acc'] = inputVars.resultOfTest.eq(inputVars.confirmedBreakdown)
acc_by_practice = inputVars.groupby('vetPractice')['vet_acc']
practice_acc_sum = acc_by_practice.sum()
practice_acc_count = acc_by_practice.count()
sb.histplot(practice_acc_sum / practice_acc_count)
plt.xlabel('Test accuracy')
plt.title('Accuracy by vet practice')
plt.savefig('../Paper/figs/vet_acc.pdf', bbox_inches='tight')

In [None]:
# Share of residual tests per tuberculin batch (Bovine).
# Reads the 'residual' column added to inputVars by the per-practice residual cell.
residual_by_batch = inputVars.groupby('batchBovine')['residual']
batch_residual_sum = residual_by_batch.sum()
batch_residual_count = residual_by_batch.count()
sb.histplot(batch_residual_sum / batch_residual_count)
plt.xlabel('Proportion of residuals')
plt.title('Proportion of residuals by tuberculin batch (Bovine)')

In [None]:
# Binomial test for vet practices
# For every practice, test whether its number of "correct" tests (result equals
# the confirmed-breakdown flag) is consistent with the overall success rate.
# binomtest is called with its default (two-sided) alternative.
is_correct = inputVars.resultOfTest == inputVars.confirmedBreakdown
expected_success = is_correct.sum() / len(inputVars)

# One grouped pass replaces a full-frame boolean filter per practice.
# sort=False keeps first-appearance order (same order as .dropna().unique()),
# and groupby skips rows whose vetPractice is missing.
correct_by_practice = is_correct.groupby(inputVars.vetPractice, sort=False)
successes_by_practice = correct_by_practice.sum()
trials_by_practice = correct_by_practice.size()

pvals_vet = [
    binomtest(int(successes), int(trials), expected_success).pvalue
    for successes, trials in zip(successes_by_practice, trials_by_practice)
]

In [None]:
# Histogram of the per-practice binomial-test p-values
n_bins = 20
sb.histplot(pvals_vet, bins=n_bins)

In [None]:
# Fraction of practices whose p-value falls below 0.05
np.mean(np.asarray(pvals_vet) < 0.05)

In [None]:
# Binomial test for tuberculin batches
# For every bovine batch, test whether its number of "correct" tests (result
# equals the confirmed-breakdown flag) is consistent with the overall success
# rate. binomtest is called with its default (two-sided) alternative.
is_correct = inputVars.resultOfTest == inputVars.confirmedBreakdown
expected_success = is_correct.sum() / len(inputVars)

# One grouped pass replaces a full-frame boolean filter per batch.
# sort=False keeps first-appearance order (same order as .dropna().unique()),
# and groupby skips rows whose batchBovine is missing.
correct_by_batch = is_correct.groupby(inputVars.batchBovine, sort=False)
successes_by_batch = correct_by_batch.sum()
trials_by_batch = correct_by_batch.size()

pvals_batch = [
    binomtest(int(successes), int(trials), expected_success).pvalue
    for successes, trials in zip(successes_by_batch, trials_by_batch)
]

In [None]:
# Histogram of the per-batch binomial-test p-values
n_bins = 20
sb.histplot(pvals_batch, bins=n_bins)

In [None]:
# Fraction of batches whose p-value falls below 0.05
np.mean(np.asarray(pvals_batch) < 0.05)

# Comparison of high/low performing practices

Compare practices to see whether high- and low-performing ones have distinct features:
* Size of practice (number of tests/yr)
* Size of herds managed
* Location?
* ??

In [None]:
# Add year of test
# dateOfTest is parsed as a datetime when the CSV is read, so the vectorised
# .dt accessor gives the year without a Python-level call per row.
inputVars['yearOfTest'] = inputVars['dateOfTest'].dt.year

In [None]:
# Practice accuracy
#  1 - (residual tests / recorded tests) per practice, i.e. the share of tests
#  whose result agrees with the confirmed-breakdown flag
residual_by_practice = inputVars.groupby(['vetPractice'])['residual']
practice_accuracy = 1 - (residual_by_practice.sum() / residual_by_practice.count())
practice_accuracy = practice_accuracy.rename("accuracy")

In [None]:
# Start the per-practice summary table with the accuracy column;
# further statistics are added as extra columns in the following cells
practice_stats = practice_accuracy.to_frame()

In [None]:
# Practice size, measured as the number of tests recorded for the practice
tests_per_practice = inputVars.groupby(['vetPractice'])['residual'].count()
practice_stats['numberOfTests'] = tests_per_practice

In [None]:
# Mean number of animals tested per test, per practice
# (used as the practice's mean herd size in the plots below)
mean_animals_per_practice = inputVars.groupby(['vetPractice'])['animalsTested'].mean()
practice_stats['meanFarmSize'] = mean_animals_per_practice

In [None]:
# Show the per-practice summary table built so far
practice_stats

In [None]:
# Drop the 'other' category, leaving only the top 250 practices by size
#practice_stats_top250 = practice_stats[practice_stats.index<250]

In [None]:
# Keep only practices with at least this many recorded tests
min_tests = 100
practice_stats_100tests = practice_stats.loc[practice_stats['numberOfTests'] >= min_tests]

In [None]:
# Accuracy against number of tests (practices with >= 100 tests only),
# drawn as a joint plot with a regression fit
sb.jointplot(data=practice_stats_100tests, x='numberOfTests', y='accuracy', kind='reg')

In [None]:
# Mean herd size against number of tests, for every practice (no minimum-test filter)
sb.jointplot(data=practice_stats, x='numberOfTests', y='meanFarmSize', kind='reg')

In [None]:
# Pearson correlation between number of tests and accuracy
# NOTE(review): computed over every practice in practice_stats, whereas the
# accuracy-vs-tests joint plot uses practice_stats_100tests - confirm which
# population is intended.
pearsonr(practice_stats['numberOfTests'], practice_stats['accuracy'])

In [None]:
# Accuracy against mean herd size (practices with >= 100 tests only);
# the figure is saved for the paper
plt.rcParams['font.size'] = 18
sb.jointplot(data=practice_stats_100tests, x='meanFarmSize', y='accuracy', kind='reg')
plt.xlabel('Mean herd size tested')
plt.ylabel('Vet diagnostic accuracy')
plt.savefig('../Paper/figs/vet_acc_herd_size.pdf', bbox_inches='tight')

In [None]:
# Pearson correlation between mean herd size and accuracy
# NOTE(review): computed over every practice in practice_stats, whereas the
# accuracy-vs-herd-size joint plot uses practice_stats_100tests - confirm which
# population is intended.
pearsonr(practice_stats['meanFarmSize'], practice_stats['accuracy'])

In [None]:
# Write the per-practice summary table to CSV (index = vetPractice)
practice_stats.to_csv(path_or_buf="/Data/TB_Diagnostics/vet_practice_stats.csv")

# Post model analysis

How does the model improve practice performance?
* Improvement in general of practice accuracy?
* Compare best/worst practices.
* Confusion matrices for best/worst.

In [None]:
# Load model
# joblib.load deserialises via pickle, which can execute arbitrary code:
# only load model files from a trusted source.
model = load('/Data/TB_Diagnostics/final_model.model')

In [None]:
# Convert dates to float.
# Use an explicit 64-bit integer as the intermediate type: plain `int` can map
# to a 32-bit integer on some platforms, which cannot hold datetime values.
data_X_model['dateOfTest'] = data_X_model['dateOfTest'].astype('int64').astype(float)
# Add the random feature column. A locally seeded generator makes the column
# (and every model-derived number below) reproducible across kernel restarts
# without touching NumPy's global random state.
rand_seed = 0
data_X_model['rand'] = np.random.default_rng(rand_seed).random(len(data_X_model))
# The frame is converted to a float matrix at prediction time (see the predict cell).

In [None]:
# Predict for every row of the model feature table
model_features = data_X_model.to_numpy()
predict_y = model.predict(model_features)

In [None]:
# Get model residuals (True where the prediction disagrees with the label).
# Predictions come from inputVars_model, so they are scored against the labels
# of that same table (data_y_model) rather than the labels of the other CSV.
predict_residual = predict_y != data_y_model
# The residuals are attached to inputVars positionally.
# NOTE(review): this assumes inputVars and inputVars_model hold the same tests
# in the same row order - confirm against how the two CSVs are produced.
assert len(predict_residual) == len(inputVars), "inputVars and inputVars_model differ in length"
inputVars['model_residual'] = predict_residual

In [None]:
# Model accuracy per practice: 1 - (model residuals / recorded tests)
model_residual_by_practice = inputVars.groupby(['vetPractice'])['model_residual']
practice_stats['model_accuracy'] = 1 - (model_residual_by_practice.sum() / model_residual_by_practice.count())

In [None]:
# Per-practice gain: model accuracy minus the practice's own test accuracy
practice_stats['accuracy_increase'] = practice_stats['model_accuracy'].sub(practice_stats['accuracy'])

In [None]:
# Drop the 'other' category, leaving only the top 250 practices by size
#practice_stats_top250 = practice_stats[practice_stats.index<250]

In [None]:
# Distribution of the per-practice accuracy gain (histogram with KDE overlay);
# the figure is saved for the paper
plt.rcParams['font.size'] = 18
sb.histplot(practice_stats['accuracy_increase'], kde=True)
plt.xlabel('Accuracy increase with model')
plt.ylabel('Number of practices')
plt.savefig('../Paper/figs/vet_improvement.pdf', bbox_inches='tight')

In [None]:
# Average accuracy gain across practices (unweighted mean over practices)
practice_stats['accuracy_increase'].mean()

In [None]:
# Model accuracy against mean herd size, for every practice
plt.rcParams['font.size'] = 18
sb.jointplot(data=practice_stats, x='meanFarmSize', y='model_accuracy', kind='reg')
plt.xlabel('Mean herd size tested')
plt.ylabel('Model accuracy')

In [None]:
# Pearson correlation between mean herd size and model accuracy (all practices)
pearsonr(practice_stats['meanFarmSize'], practice_stats['model_accuracy'])

In [None]:
# Accuracy gain from the model against mean herd size, for every practice;
# the figure is saved for the paper
sb.jointplot(data=practice_stats, x='meanFarmSize', y='accuracy_increase', kind='reg')
plt.ylabel('Practice accuracy increase')
plt.xlabel('Mean herd size tested')
plt.savefig('../Paper/figs/vet_acc_inc.pdf', bbox_inches='tight')

# Geospatial analysis

Is the geo distribution of tests with vet data similar to all tests?

Where do best/worst performing vets operate?

In [None]:
# Coordinate reference systems used in the geospatial section
wgs84 = 'epsg:4326'  # Lat.Long.
bng = 'epsg:27700'  # British National Grid

In [None]:
# UK base map: read the country outlines and reproject them to British National Grid
country_regions = gp.read_file('/Data/Shapefiles/bdline_essh_gb/Data/Supplementary_Country/country_region.shp')
uk_shp = country_regions.to_crs(bng)
# Quick visual check, left disabled: uk_shp.plot(color='white', edgecolor='black')
# England-only outline, used as the backdrop for the practice maps
eng_shp = uk_shp.loc[uk_shp['NAME'] == 'England']

In [None]:
# Build a GeoDataFrame of tests: one point per row from locationX/locationY,
# declared as British National Grid coordinates
test_points = gp.points_from_xy(inputVars.locationX, inputVars.locationY)
geo_data = gp.GeoDataFrame(inputVars, geometry=test_points, crs=bng)

In [None]:
# join practice stats
# Attach the per-practice statistics to every test row via its vetPractice key.
# Any practice-stat columns already present are dropped first, so re-running
# this cell does not raise an overlapping-columns error from join().
already_joined = [col for col in practice_stats.columns if col in geo_data.columns]
geo_data = geo_data.drop(columns=already_joined).join(practice_stats, on='vetPractice')

In [None]:
# Show the geo-referenced test table with the joined practice statistics
geo_data

In [None]:
# Kernel density of all test locations, drawn over the UK outline

base_map = uk_shp.to_crs(bng)
ax = base_map.plot(alpha=0.2, figsize=(10, 20))
sb.kdeplot(x=geo_data.locationX, y=geo_data.locationY, fill=True, ax=ax)

In [None]:
# Kernel density of test locations restricted to rows that have a vet practice

tests_with_vet = geo_data.dropna(subset=['vetPractice'])
base_map = uk_shp.to_crs(bng)
ax = base_map.plot(alpha=0.2, figsize=(10, 20))
sb.kdeplot(x=tests_with_vet.locationX, y=tests_with_vet.locationY, fill=True, ax=ax)

----
## Testing...
----

In [None]:
# Test locations over the England outline, coloured by vetPractice
practice_legend = {"label": "Top 250 Vets by tests conducted (0=largest)", "orientation": "horizontal"}
ax = eng_shp.plot(alpha=0.2, figsize=(10, 20))
geo_data.plot(ax=ax, column='vetPractice', markersize=1.0, legend=True, legend_kwds=practice_legend)
ax.set_axis_off()

In [None]:
# Coverage area per practice: merge each practice's test points and take the
# convex hull of the merged geometry
ax = eng_shp.plot(color='white', edgecolor='black', figsize=(10, 20))
practice_hulls = geo_data.dissolve('vetPractice').convex_hull
# NOTE(review): .iloc[:-1] discards the last practice after dissolve; presumably
# this removed the 'other' category of the earlier top-250 data - confirm it is
# still wanted now that practices are not mapped to categories.
vet_coverage = practice_hulls.reset_index().iloc[:-1]
vet_coverage.plot(ax=ax, column='vetPractice', alpha=0.2)
ax.set_axis_off()

In [None]:
# Test locations over the England outline, coloured by the accuracy of the
# practice that carried out the test
accuracy_legend = {"label": "Vet performance", "orientation": "horizontal"}
ax = eng_shp.plot(alpha=0.2, figsize=(10, 20))
geo_data.plot(ax=ax, column='accuracy', markersize=1.0, legend=True, legend_kwds=accuracy_legend)
ax.set_axis_off()

In [None]:
# convex hull of vet locations by accuracy
ax = eng_shp.plot(color='white', edgecolor='black', figsize=(10,20))
# vet_coverage carries a positional index after reset_index(), while
# practice_stats is indexed by vetPractice. Join on the vetPractice column so
# each hull receives the statistics of its own practice rather than whatever
# row happens to share its position number.
coverage_with_stats = vet_coverage.join(practice_stats, on='vetPractice')
coverage_with_stats.plot(ax=ax, column='accuracy', alpha=0.2, legend=True, legend_kwds={"label": "Vet accuracy", "orientation": "horizontal"})
ax.set_axis_off()

In [None]:
# Test locations over the England outline, coloured by the mean herd size of
# the practice that carried out the test (reversed viridis colour map)
herd_size_legend = {"label": "Mean herd size for vet", "orientation": "horizontal"}
ax = eng_shp.plot(alpha=0.2, figsize=(10, 20))
geo_data.plot(ax=ax, column='meanFarmSize', markersize=1.0, cmap='viridis_r', legend=True, legend_kwds=herd_size_legend)
ax.set_axis_off()