In [2]:
import pandas as pd

# Load the dataset
try:
    df = pd.read_csv('vet_data.csv')
    print(" Dataset Loaded Successfully")
    
    # Display the column names
    print("\n--- COLUMN NAMES ---")
    print(df.columns.tolist())
    
    # Show the first 5 rows to see what the data looks like
    display(df.head())
    
except FileNotFoundError:
    print(" Error: 'vet_data.csv' not found. Check the file name and folder.")

 Dataset Loaded Successfully

--- COLUMN NAMES ---
['Animal_Type', 'Breed', 'Age', 'Gender', 'Weight', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Duration', 'Appetite_Loss', 'Vomiting', 'Diarrhea', 'Coughing', 'Labored_Breathing', 'Lameness', 'Skin_Lesions', 'Nasal_Discharge', 'Eye_Discharge', 'Body_Temperature', 'Heart_Rate', 'Disease_Prediction']


Unnamed: 0,Animal_Type,Breed,Age,Gender,Weight,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Duration,...,Diarrhea,Coughing,Labored_Breathing,Lameness,Skin_Lesions,Nasal_Discharge,Eye_Discharge,Body_Temperature,Heart_Rate,Disease_Prediction
0,Dog,Labrador,4,Male,25.0,Fever,Lethargy,Appetite Loss,Vomiting,3 days,...,No,No,No,No,No,No,No,39.5°C,120,Parvovirus
1,Cat,Siamese,2,Female,4.5,Coughing,Sneezing,Eye Discharge,Nasal Discharge,1 week,...,No,Yes,No,No,No,Yes,Yes,38.9°C,150,Upper Respiratory Infection
2,Cow,Holstein,3,Female,600.0,Fever,Nasal Discharge,Labored Breathing,Coughing,5 days,...,No,Yes,Yes,No,No,Yes,No,40.1°C,90,Foot and Mouth Disease
3,Dog,Beagle,1,Male,10.0,Diarrhea,Vomiting,Lethargy,Appetite Loss,2 days,...,Yes,No,No,No,No,No,No,39.2°C,130,Gastroenteritis
4,Cat,Persian,5,Male,3.8,Lethargy,Appetite Loss,Skin Lesions,No,2 weeks,...,No,No,No,No,Yes,No,No,38.7°C,160,Fungal Infection


In [4]:
import pandas as pd

# 1. Reload the data (just to be safe)
df = pd.read_csv('vet_data.csv')

# 2. Filter for Pets Only (Dogs and Cats)
# We exclude 'Cow', 'Sheep', etc.
df = df[df['Animal_Type'].isin(['Dog', 'Cat'])].copy()

# 3. Clean the "Body_Temperature" column
# Remove the '°C' string and convert to a number
df['Body_Temperature'] = df['Body_Temperature'].astype(str).str.replace('°C', '').astype(float)

# 4. Convert Binary Columns (Yes/No) to 0/1
# We find all columns that contain "Yes" or "No" and map them.
binary_cols = ['Diarrhea', 'Coughing', 'Labored_Breathing', 'Lameness', 
               'Skin_Lesions', 'Nasal_Discharge', 'Eye_Discharge']

for col in binary_cols:
    if col in df.columns:
        df[col] = df[col].map({'Yes': 1, 'No': 0})

# 5. Clean the Target Column (Disease)
# We need to make sure there are no empty values
df = df.dropna(subset=['Disease_Prediction'])

print(f" Data Cleaned! We have {len(df)} pet records ready for training.")
print("\n--- SAMPLE CLEAN DATA ---")
display(df[['Animal_Type', 'Body_Temperature', 'Diarrhea', 'Disease_Prediction']].head())

 Data Cleaned! We have 147 pet records ready for training.

--- SAMPLE CLEAN DATA ---


Unnamed: 0,Animal_Type,Body_Temperature,Diarrhea,Disease_Prediction
0,Dog,39.5,0,Parvovirus
1,Cat,38.9,0,Upper Respiratory Infection
3,Dog,39.2,1,Gastroenteritis
4,Cat,38.7,0,Fungal Infection
6,Dog,39.3,0,Lyme Disease


In [5]:
# 1. Select the Text Symptom Columns
text_symptom_cols = ['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4']

# 2. Convert them to "Dummy Variables" (One-Hot Encoding)
# This turns "Symptom_1: Fever" into a column "Symptom_Fever: 1"
dummies = pd.get_dummies(df[text_symptom_cols], prefix='Sym')

# 3. Combine with our main table
# We drop the old text columns and add the new number columns
df_final = pd.concat([df, dummies], axis=1)
df_final = df_final.drop(columns=text_symptom_cols)

# 4. Drop non-training columns (like ID or Duration if they are text)
# We keep only numeric columns + the Target (Disease_Prediction)
# We also drop 'Animal_Type', 'Breed', 'Gender' for now to focus purely on medical symptoms
X = df_final.drop(columns=['Disease_Prediction', 'Animal_Type', 'Breed', 'Gender', 'Duration'])

# 5. The Target (What we want to predict)
y = df_final['Disease_Prediction']

print(f"✅ Feature Engineering Complete!")
print(f"We are now training on {X.shape[1]} different features (symptoms).")
print("\n--- NEW COLUMN LIST (Partial) ---")
print(X.columns.tolist()[:15]) # Show first 15 columns to verify

✅ Feature Engineering Complete!
We are now training on 66 different features (symptoms).

--- NEW COLUMN LIST (Partial) ---
['Age', 'Weight', 'Appetite_Loss', 'Vomiting', 'Diarrhea', 'Coughing', 'Labored_Breathing', 'Lameness', 'Skin_Lesions', 'Nasal_Discharge', 'Eye_Discharge', 'Body_Temperature', 'Heart_Rate', 'Sym_Appetite Loss', 'Sym_Coughing']


In [6]:
# Get the full list of input features
feature_list = X.columns.tolist()

print(f"--- THE {len(feature_list)} INPUT FEATURES ---")

# Print them in a clean list
for i, feature in enumerate(feature_list):
    print(f"{i+1}. {feature}")

--- THE 66 INPUT FEATURES ---
1. Age
2. Weight
3. Appetite_Loss
4. Vomiting
5. Diarrhea
6. Coughing
7. Labored_Breathing
8. Lameness
9. Skin_Lesions
10. Nasal_Discharge
11. Eye_Discharge
12. Body_Temperature
13. Heart_Rate
14. Sym_Appetite Loss
15. Sym_Coughing
16. Sym_Diarrhea
17. Sym_Eye Discharge
18. Sym_Fever
19. Sym_Labored Breathing
20. Sym_Lameness
21. Sym_Lethargy
22. Sym_Nasal Discharge
23. Sym_Skin Lesions
24. Sym_Sneezing
25. Sym_Vomiting
26. Sym_Appetite Loss
27. Sym_Coughing
28. Sym_Diarrhea
29. Sym_Eye Discharge
30. Sym_Fever
31. Sym_Labored Breathing
32. Sym_Lethargy
33. Sym_Loss of Appetite
34. Sym_Nasal Discharge
35. Sym_Sneezing
36. Sym_Swelling
37. Sym_Vomiting
38. Sym_Weight Loss
39. Sym_Appetite Loss
40. Sym_Coughing
41. Sym_Dehydration
42. Sym_Diarrhea
43. Sym_Eye Discharge
44. Sym_Fever
45. Sym_Labored Breathing
46. Sym_Lethargy
47. Sym_Loss of Appetite
48. Sym_Nasal Discharge
49. Sym_Skin Lesions
50. Sym_Sneezing
51. Sym_Vomiting
52. Sym_Weight Loss
53. Sym_Appe

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# 1. FORCE CLEAN: Convert ALL "Yes"/"No" to 1/0 across the entire table
# This covers any column we might have missed earlier
X = X.replace({'Yes': 1, 'No': 0, 'yes': 1, 'no': 0})

# 2. SAFETY CHECK: Convert everything to numbers
# If there is any other text (like "Male/Female") this will turn it into 0 to prevent crashes
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

print(" Data Force-Cleaned. All values are now numbers.")

# 3. SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. TRAIN
model = RandomForestClassifier(n_estimators=100, random_state=42)
print(" Training the AI Brain...")
model.fit(X_train, y_train)

# 5. TEST
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n TRAINING SUCCESS!")
print(f" Accuracy: {accuracy * 100:.2f}%")

# 6. SAVE
joblib.dump(model, 'vet_triage_model.pkl')
joblib.dump(X.columns.tolist(), 'model_columns.pkl')
print(" Model saved.")

  X = X.replace({'Yes': 1, 'No': 0, 'yes': 1, 'no': 0})
  type_true = type_of_target(y_true, input_name="y_true")


 Data Force-Cleaned. All values are now numbers.
 Training the AI Brain...

 TRAINING SUCCESS!
 Accuracy: 23.33%
 Model saved.


In [9]:
import pandas as pd

df = pd.read_csv('dataset.csv')
print(df.columns.tolist())
print(f"\nTotal columns: {len(df.columns)}")
print(f"\nFirst few rows:\n{df.head()}")

['_id', 'ecg_path', 'duration', 'pet_id', 'breeds', 'weight', 'age', 'segments_br', 'segments_hr', 'ecg_pulses', 'bad_ecg']

Total columns: 11

First few rows:
                        _id                               ecg_path  duration  \
0  62271dc2b9baee839b0c1e99  ecg_data/62271dc2b9baee839b0c1e99.wav    300.12   
1  62271ea0be903091edb2bf09  ecg_data/62271ea0be903091edb2bf09.wav    300.08   
2  622723bebe903091edb2bf0b  ecg_data/622723bebe903091edb2bf0b.wav    300.08   
3  622776fabe903091edb2bf0d  ecg_data/622776fabe903091edb2bf0d.wav    300.14   
4  622799e5ea79f8f9cc02b284  ecg_data/622799e5ea79f8f9cc02b284.wav    300.12   

   pet_id breeds  weight  age  \
0      14  boxer    34.0  4.5   
1      14  boxer    34.0  4.5   
2      14  boxer    34.0  4.5   
3      14  boxer    34.0  4.5   
4      14  boxer    34.0  4.5   

                                         segments_br  \
0     [{'deb': 200.0, 'fin': 240.0, 'value': 13.24}]   
1  [{'deb': 240.0, 'fin': 280.0, 'value': 12.34}