In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/animal-disease-dataset/animal_disease_dataset.csv


### Importing Important LIbraries


In [3]:
df = pd.read_csv("/kaggle/input/animal-disease-dataset/animal_disease_dataset.csv")
print("Data Shape: ", df.shape)
display(df.head())

Data Shape:  (43778, 7)


Unnamed: 0,Animal,Age,Temperature,Symptom 1,Symptom 2,Symptom 3,Disease
0,cow,3,103.1,depression,painless lumps,loss of appetite,pneumonia
1,buffalo,13,104.5,painless lumps,loss of appetite,depression,lumpy virus
2,sheep,1,100.5,depression,painless lumps,loss of appetite,lumpy virus
3,cow,14,100.3,loss of appetite,swelling in limb,crackling sound,blackleg
4,sheep,2,103.6,painless lumps,loss of appetite,depression,pneumonia


### Normalize and list uique syplrtom phrases

In [5]:
import re

def normalize_symptom(s):
    if pd.isna(s): return None
    s = str(s).strip().lower()
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

symptom_set = set()
for col in ["Symptom 1", "Symptom 2", "Symptom 3"]:
    symptom_set.update(df[col].dropna().astype(str).map(normalize_symptom).unique())
symptom_set.discard(None)
symptoms = sorted(symptom_set)
print("Found", len(symptoms), "unique symptom phrases")
print(symptoms)

Found 24 unique symptom phrases
['blisters on gums', 'blisters on hooves', 'blisters on mouth', 'blisters on tongue', 'chest discomfort', 'chills', 'crackling sound', 'depression', 'difficulty walking', 'fatigue', 'lameness', 'loss of appetite', 'painless lumps', 'shortness of breath', 'sores on gums', 'sores on hooves', 'sores on mouth', 'sores on tongue', 'sweats', 'swelling in abdomen', 'swelling in extremities', 'swelling in limb', 'swelling in muscle', 'swelling in neck']


## Create binary sypmtom values

In [6]:
def symptom_col_name(s):
    return s.replace(' ', '_').replace('-', '_')

for s in symptoms:
    col = symptom_col_name(s)
    df[col] = df.apply(lambda r: int(
        normalize_symptom(r.get("Symptom 1")) == s or
        normalize_symptom(r.get("Symptom 2")) == s or
        normalize_symptom(r.get("Symptom 3")) == s
    ), axis=1)

#drop originals 
df = df.drop(columns=["Symptom 1", "Symptom 2", "Symptom 3"])
print("New shape after expanding symptoms:", df.shape)
display(df.head())

New shape after expanding symptoms: (43778, 28)


Unnamed: 0,Animal,Age,Temperature,Disease,blisters_on_gums,blisters_on_hooves,blisters_on_mouth,blisters_on_tongue,chest_discomfort,chills,...,sores_on_gums,sores_on_hooves,sores_on_mouth,sores_on_tongue,sweats,swelling_in_abdomen,swelling_in_extremities,swelling_in_limb,swelling_in_muscle,swelling_in_neck
0,cow,3,103.1,pneumonia,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,buffalo,13,104.5,lumpy virus,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,sheep,1,100.5,lumpy virus,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,cow,14,100.3,blackleg,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,sheep,2,103.6,pneumonia,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Clean and basic type conversions

In [11]:
# unify column names
df = df.rename(columns={"Animal": "animal_type", "Age": "age", "Temperature": "body_temperature", "Disease": "disease" })
#convert numeric types
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['body_temperature'] = pd.to_numeric(df['body_temperature'], errors='coerce')

#drop rows with missing target
df = df.dropna(subset=['disease'])
print("After cleaning shape:", df.shape)
display(df[['animal_type', 'age', 'body_temperature','disease']].head())

After cleaning shape: (43778, 28)


Unnamed: 0,animal_type,age,body_temperature,disease
0,cow,3,103.1,pneumonia
1,buffalo,13,104.5,lumpy virus
2,sheep,1,100.5,lumpy virus
3,cow,14,100.3,blackleg
4,sheep,2,103.6,pneumonia


## Encoding categorical features and target

In [16]:
# Cell 5: encode categorical features (one-hot Animal) and encode target label
from sklearn.preprocessing import LabelEncoder

# one-hot for animal_type
df = pd.get_dummies(df, columns=['animal_type'], prefix='animal')

# label encode target
le = LabelEncoder()
df['disease_label'] = le.fit_transform(df['disease'])

# save mapping for later use
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping (disease -> int):", label_mapping)

# move target to final column
feature_cols = [c for c in df.columns if c not in ['disease','disease_label']]
print("Number of features:", len(feature_cols))


Label mapping (disease -> int): {'anthrax': 0, 'blackleg': 1, 'foot and mouth': 2, 'lumpy virus': 3, 'pneumonia': 4}
Number of features: 30


### Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

X = df  [feature_cols]
y=df['disease_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (35022, 30) Test shape: (8756, 30)


## Train Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

clf = DecisionTreeClassifier(max_depth=6, min_samples_leaf=5, random_state=42)
clf.fit(X_train, y_train)

# evaluate
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)


#save model and label encoder for feature list
model_artifact = {
    'model': clf,
    'label_encoder_classes': list(le.classes_),
    'features': list(X.columns)
}
joblib.dump(model_artifact, "decision_tree_model.pkl")
print("Saved model artifact to decision_tree_model.pkl")

Accuracy:  0.8321151210598446

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1969
           1       1.00      1.00      1.00      1943
           2       1.00      1.00      1.00      1940
           3       0.48      0.32      0.38      1438
           4       0.50      0.67      0.57      1466

    accuracy                           0.83      8756
   macro avg       0.80      0.80      0.79      8756
weighted avg       0.83      0.83      0.83      8756

Saved model artifact to decision_tree_model.pkl


## Testing the Saved Nodel

In [23]:
import joblib
import numpy as np

art = joblib.load("decision_tree_model.pkl")
model = art['model']
features = art['features']
label_classes = art['label_encoder_classes']

#pick a sample row from X_test
sample = X_test.iloc[0:3]
pred = model.predict(sample)
prob = None
if hasattr(model, "predict_proba"):
    prob = model.predict_proba(sample)

print("Sample rows (features):")
display(sample)
print("Predicted labels (int):", pred)
print("Predicted diseases:", [label_classes[p] for p in pred])
if prob is not None:
    print ("Top confidences:", prob.max(axis=1))
    

Sample rows (features):


Unnamed: 0,age,body_temperature,blisters_on_gums,blisters_on_hooves,blisters_on_mouth,blisters_on_tongue,chest_discomfort,chills,crackling_sound,depression,...,sweats,swelling_in_abdomen,swelling_in_extremities,swelling_in_limb,swelling_in_muscle,swelling_in_neck,animal_buffalo,animal_cow,animal_goat,animal_sheep
15525,2,101.3,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,False,False,False,True
13104,3,104.6,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,False,False,False,True
17030,8,101.2,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,False,True,False,False


Predicted labels (int): [3 4 4]
Predicted diseases: ['lumpy virus', 'pneumonia', 'pneumonia']
Top confidences: [0.50769231 0.51791531 0.56557377]


### Save cleaned CSV

In [25]:
df.to_csv("/kaggle/working/animal_disease_clean.csv")
print("Saved cleaned CSV to animal_disease_clean.csv")

Saved cleaned CSV to animal_disease_clean.csv
