## Predicting Horse Colic Survival: A Machine Learning Approach for Early Intervention
### 6/7/23

X23 Group 15 Members:
- Andrew Kassis
- Christian Vargas
- Deniz Erisgen
- Tiffany Andersen

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 
# Photo source: Animal Clinic, LLC, "Colic Diagnosis" - https://www.animalclinicfortlupton.com/colic-diagnosis.html
Image(
    url="https://www.animalclinicfortlupton.com/uploads/5/7/6/2/57623911/colicrxhorse-with-gi-diagram_orig.jpg"
)

Objective: To develop a predictive model capable of estimating the probability of survival for horses affected by colic. This project aims to provide valuable insights to veterinarians and horse owners, enabling them to make well-informed decisions regarding treatment options and care. Furthermore, the objective involves conducting an in-depth analysis to identify the features or attributes that are most influential in accurately predicting colic survival outcomes.

### Initial Exploration

In [None]:
import warnings
# Ignore the warning
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Remote CSV locations: the main dataset and the separate test dataset
url = 'https://raw.githubusercontent.com/c-vargas/horse_colic_data/main/horse_colic.csv'
url2 = 'https://raw.githubusercontent.com/c-vargas/horse_colic_data/main/horse_test_data.csv'

# Read each CSV into its own DataFrame
df, test = pd.read_csv(url), pd.read_csv(url2)

In [None]:
# Report the (rows, columns) shape of both DataFrames
data_size, test_size = df.shape, test.shape
print("Data Size:", data_size)
print("Test data Size:", test_size)

In [None]:
# Preview the first 10 rows. Leaving the frame as the cell's last expression
# uses the notebook's rich table display instead of wrapped plain text.
df.head(10)

In [None]:
# Tally how many columns share each dtype
dtype_tally = df.dtypes.value_counts()
print(dtype_tally)

In [None]:
# Dtype of every column, listed by column name
column_types = df.dtypes
print("Column Types:", column_types, sep="\n")

In [None]:
# Per-column tally of missing entries (isna is the same check as isnull)
na_count = df.isna().sum()
print("NA Values:", na_count, sep="\n")

In [None]:
# Bar chart of missing values per feature; the feature(s) with the most
# missing values are drawn in red, all others in gray.
plt.figure(figsize=(10, 8))
# Compute the maximum once rather than once per bar
max_missing = na_count.max()
colors = ['red' if value == max_missing else 'gray' for value in na_count]
plt.bar(range(len(na_count)), na_count, color=colors)
plt.title('Missing Values per Feature', fontsize=16, fontweight='bold')
plt.xlabel('Features', fontsize=12)
plt.ylabel('Missing Values', fontsize=12)
# Label the ticks from na_count's own index so ticks and bars always line up,
# even if df's columns have changed since na_count was computed
plt.xticks(np.arange(len(na_count)), na_count.index.values, rotation='vertical', fontsize=10)
# Adjust the y-ticks properties
plt.yticks(fontsize=10)
# Write each count just above its bar
for i, value in enumerate(na_count):
    plt.text(i, value, str(value), ha='center', va='bottom', fontsize=10)

# Fit the rotated tick labels inside the figure
plt.tight_layout()
plt.show()

In [None]:
# Histogram of each numeric column in the DataFrame, one subplot per column.
# DataFrame.hist lays out its own grid of axes: handing it a single pre-made
# Axes makes pandas clear that figure, so any styling applied to the original
# Axes is lost. Let hist create the figure and style the axes it returns.
axes = df.hist(
    figsize=(16, 14),
    color='steelblue',
    edgecolor='white',
    alpha=0.7,
    grid=False,        # the y-only grid is added per subplot below
    xlabelsize=12,
    ylabelsize=12,
)
for hist_ax in np.ravel(axes):
    hist_ax.grid(axis='y', linestyle='--', alpha=0.5)
    hist_ax.spines['top'].set_visible(False)
    hist_ax.spines['right'].set_visible(False)
plt.show()

In [None]:
# Total number of missing cells: sum the per-column NA counts
per_column_missing = df.isna().sum()
missing_values = per_column_missing.sum()
print("Number of missing values in the dataset:", missing_values)

In [None]:
# Summary statistics, printed with two decimals, to spot outliers or
# unrealistic values
summary_stats = df.describe()
two_decimals = "{:.2f}".format
print(summary_stats.to_string(float_format=two_decimals))

In [None]:
# Print pandas' built-in summary of df (DataFrame.info): the columns with
# their non-null counts and dtypes
df.info()

In [None]:
# Number of rows (instances) in the dataset
data_length = df.shape[0]
print("Number of instances in the dataset:", data_length)

### Initial preprocessing and cleaning

In [None]:
# Delete the unnecessary hospital_number column.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
df.drop(columns=['hospital_number'], inplace=True, errors='ignore')

In [None]:
# Remove columns that are useless or irrelevant
# columns_to_drop = ['useless_column', 'irrelevant_column']
# df = df.drop(columns_to_drop, axis=1)

In [None]:
# Share of missing entries in each column, expressed as a percentage
missing_percentage = df.isna().mean().mul(100)
print("Missing Percentage:", missing_percentage, sep="\n")

In [None]:
# Remove columns in which more than `threshold` percent of values are missing
# (threshold at 50%)
threshold = 50
# Measure missingness on the current df rather than reusing a Series computed
# in an earlier cell: once columns have been dropped, that older Series no
# longer matches df.columns in length and the boolean selection below fails.
current_missing_pct = df.isnull().mean() * 100
columns_to_drop = df.columns[current_missing_pct > threshold]
df.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
# Remove other missing data:
# df = df.dropna()  # Remove rows with any missing values

### Exploration and Visualization

In [None]:
# Frequency of each value in the outcome column
outcome_counts = df['outcome'].value_counts()
print("\nCount values:", outcome_counts, sep="\n")

In [None]:
# Count plot of the outcome column with the count written above each bar
sns.set(style="darkgrid")
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='outcome', data=df)

# Annotate every bar with its height, offset 10 points above the bar top
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_top:.0f}', (bar_center, bar_top),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=12)

plt.title('Outcome Distribution', fontsize=16)
plt.xlabel('Outcome', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylim(0, 200)
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each outcome value (column accessed by key instead of attribute)
print(df['outcome'].value_counts())

In [None]:
# Figure showing that horses exhibiting the highest symptoms of extreme pain, severe pain, and depression ultimately died.
sns.set(style="whitegrid", font_scale=1.2)
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='outcome', hue='pain', palette='colorblind')
ax.set_xlabel('Outcome', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Distribution of Outcomes by Pain', fontsize=16)
ax.legend(title='Pain', loc='upper left')
# Add value labels to the bars. Bar heights may be floats, so format them as
# whole numbers (matching the labels on the outcome distribution plot) and
# skip any bar with no defined height instead of printing "nan".
for p in ax.patches:
    height = p.get_height()
    if np.isnan(height):
        continue
    ax.annotate(f'{height:.0f}', (p.get_x() + p.get_width() / 2, height), ha='center',
                va='bottom', fontsize=12, color='black')

plt.tight_layout()
plt.show()

In [None]:
# Distribution of a single column (age of horse) as a histogram
plt.figure(figsize=(4, 6))
plt.hist(df['age'], bins='auto', edgecolor='black')
plt.title('Histogram of Age')
plt.xlabel('Age', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Bar plot of value counts of a single categorical variable (surgery)
sns.set(style='whitegrid')
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='surgery', data=df, palette='dark')
plt.xlabel('Surgery?')
plt.ylabel('Count')
plt.title('Number of Horses Requiring Surgery')
# NOTE(review): the tick labels are assigned by position; confirm that the
# order in which countplot places the surgery categories is really No, Yes.
plt.xticks([0, 1], ['No', 'Yes'])

# Label each bar with its count. Bar heights may be floats, so format them as
# whole numbers (matching the labels on the outcome distribution plot) and
# skip any bar with no defined height.
for p in ax.patches:
    height = p.get_height()
    if np.isnan(height):
        continue
    ax.annotate(f'{height:.0f}', (p.get_x() + p.get_width() / 2, height), ha='center', va='bottom')
plt.show()

In [None]:
# One pulse histogram per outcome. Reported observation: horses that died had
# a pulse of around 70 bpm, while horses that lived had the highest pulse
# count at approximately 40 bpm.
# NOTE(review): this plot reads from `test`, whereas most exploration cells
# use `df` - confirm the data source is intentional.
sns.set(style="whitegrid")
facet_grid = sns.FacetGrid(data=test, col='outcome', margin_titles=True, height=4)
facet_grid.map(plt.hist, 'pulse', bins=10, color='skyblue', edgecolor='white')
facet_grid.set_axis_labels('Pulse', 'Count')
# Leave head-room above the panels for the figure-level title
plt.subplots_adjust(top=0.85)
facet_grid.fig.suptitle('Outcome vs Pulse', fontsize=16, y=1.05)
plt.show()

In [None]:
# Pivot table counting the occurrences of each peripheral pulse value by outcome
sns.set(style="whitegrid")
pivot_table = test.pivot_table(index='peripheral_pulse', columns='outcome', aggfunc='size', fill_value=0)
# Put the outcome columns in a fixed order. reindex (rather than plain column
# selection) inserts an all-zero column for any outcome absent from the data
# instead of raising KeyError, so the colors below always map to the same
# outcome.
pivot_table = pivot_table.reindex(columns=['died', 'euthanized', 'lived'], fill_value=0)
ax = pivot_table.plot(kind='bar', stacked=True, color=['#E74C3C', '#F39C12', '#27AE60'])
ax.set_title('Outcome by Peripheral Pulse')
ax.set_xlabel('Peripheral Pulse')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Count horses that either died or were euthanized AND had a reduced or absent
# peripheral pulse (both values indicate poor perfusion), broken down by
# capillary refill time, since a longer refill time suggests poorer
# circulation.

custom_color_palette = ['#d36135','#fac05e', '#33658a']

# Build the two row filters separately, then keep rows that satisfy both
non_survivor_mask = df['outcome'].isin(['died', 'euthanized'])
weak_pulse_mask = df['peripheral_pulse'].isin(['reduced', 'absent'])
reduced_absent_pulse_df = df[non_survivor_mask & weak_pulse_mask]

sns.set_style("ticks")

# One count panel per outcome, with capillary refill time on the x-axis
capillary_refill_plot = sns.catplot(
    data=reduced_absent_pulse_df,
    kind='count',
    x='capillary_refill_time',
    col='outcome',
    palette=custom_color_palette,
    height=4,
    aspect=1.2,
)

# Panel titles and axis labels
capillary_refill_plot.set_titles('Outcome = {col_name}')
capillary_refill_plot.set_xlabels('Capillary Refill Time')
capillary_refill_plot.set_ylabels('Count')
sns.despine()
capillary_refill_plot.set_xticklabels(rotation=45)

# Figure-level title, a light horizontal grid on the first panel only, and
# head-room for the title
capillary_refill_plot.fig.suptitle('Distribution of Capillary Refill Time by Outcome', y=1.05, fontsize=16)
first_panel = capillary_refill_plot.axes.flat[0]
first_panel.grid(axis='y', linestyle=':', alpha=0.5)
capillary_refill_plot.fig.subplots_adjust(top=0.8)
plt.show()