# Categorizing zoo animal species by microbiome

## 1. Setup
### 1.1 Libraries

In [16]:
import pandas as pd
import numpy as np
import altair as alt

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Turn only off for presentation purposes
import warnings
warnings.filterwarnings("ignore")

### 1.2 Data import
Data has been preprocessed in R. We removed all animal species with less than 20 probes.

In [2]:
# Read data 
df = pd.read_csv('data/data_clean.csv')
metadata = df.iloc[:,:9].drop_duplicates().sort_values(['Familie','Gattung','Art']).reset_index(drop=True)
metadata_familie = metadata[['Familie','Diet','digestion']].drop_duplicates().reset_index(drop=True)
metadata_gattung = metadata[['Gattung','Diet','digestion']].drop_duplicates().reset_index(drop=True)
# Identifying zoo and individuals from index name
df.insert(0, 'Zoo', df['index'].str[:3])
df.insert(1, 'AnimalID', df['index'].str[7:])

### 1.3 Function library

In [3]:
def train_dev_test_split(df, y_name="", test_size=0.2, random_state=42):
    if y_name != "":
        # Check if stratification is possible
        can_stratify = df[y_name].value_counts().min() > 1
        
        # If stratification is possible, use it
        if can_stratify:
            train, test = train_test_split(
                df, test_size=test_size/(1-test_size), 
                random_state=random_state, stratify=df[y_name]
            )
        # If not, do a simple split without stratification
        else:
            train, test = train_test_split(
                df, test_size=test_size/(1-test_size), 
                random_state=random_state
            )
        
        # Define input and output variables
        X_train = train.iloc[:,12:]
        if y_name != 'Art':
            X_train = X_train.drop([y_name], axis=1)
        y_train = train[y_name]

        X_test = test.iloc[:,12:]
        if y_name != 'Art':
            X_test = X_test.drop([y_name], axis=1)
        y_test = test[y_name]
        
        return X_train, y_train, X_test, y_test
    
    else:
        # Check if stratification is possible
        can_stratify = df['Art'].value_counts().min() > 1
        
        # If stratification is possible, use it
        if can_stratify:
            train, test = train_test_split(
                df, test_size=test_size, 
                random_state=random_state, stratify=df['Art']
            )
        # If not, do a simple split without stratification
        else:
            train, test = train_test_split(
                df, test_size=test_size, 
                random_state=random_state
            )
        
        return train, test

# One-hot encoded data
def one_hot_encoding(df, Art):
    # One-hot encoding of column
    df_Art = pd.get_dummies(df.Art)
    # Join with dummy data
    df_tmp = df.iloc[:,:-1].join(df_Art[Art])
    # Split data
    X_train, y_train, X_dev, y_dev, X_test, y_test = train_dev_test_split(df_tmp, Art)
    return X_train, y_train, X_dev, y_dev, X_test, y_test

# Find best parameters using GridSearchCV for logistic regression
def lr_best_model(X_train, y_train):
    # Define multiple hyperparameter grids to search over
    param_grids = [
        {
            'penalty': ['l1', 'l2'],  # l1 and l2 penalties
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'solver': ['liblinear'],  # liblinear supports l1 and l2
            'max_iter': [100, 500, 1000],
        },
        {
            'penalty': ['l2', 'none'],  # l2 and none penalties
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'solver': ['newton-cg', 'lbfgs', 'sag'],  # these solvers support l2 and none
            'max_iter': [100, 500, 1000],
        },
        {
            'penalty': ['l1', 'l2', 'none'],  # l1, l2 and none penalties
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'solver': ['saga'],  # saga supports l1, l2, and none
            'max_iter': [100, 500, 1000],
        }
    ]

    # Create a logistic regression model
    lr = LogisticRegression(random_state=42)

    # Perform grid search over the hyperparameter grids using 5-fold cross-validation
    grid_search = GridSearchCV(estimator=lr,
                               param_grid=param_grids,
                               cv=5,
                               n_jobs=-1)

    # Fit the grid search to the training data
    grid_search.fit(X_train, y_train)

    # Print the best parameters and best score
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best score: {grid_search.best_score_}")
    
    return grid_search.best_params_

def train_best_model(X_train, y_train, params):
    # Create a new logistic regression model with the best hyperparameters
    best_lr = LogisticRegression(**params, random_state=42)

    # Fit the new model to the training data
    best_lr.fit(X_train, y_train)
    
    return best_lr

def evaluate_model(best_lr, X_dev, y_dev):
    # Evaluate the performance of the new model on the test data
    score = best_lr.score(X_dev, y_dev)
    print(f"Test score: {score}")

    # Print the results
    y_pred = best_lr.predict(X_dev)
    print(classification_report(y_dev, y_pred))

    # Create the confusion matrix
    cm = confusion_matrix(y_dev, y_pred)

    # Print the confusion matrix
    print("Confusion Matrix:\n", cm)

    # Predict probabilities
    y_pred_proba = best_lr.predict_proba(X_dev)[:, 1]
    # Calculate AUC
    auc_score = roc_auc_score(y_dev, y_pred_proba)
    print(f"AUC: {auc_score}")
    
def best_lr(df, Art):
    # One-hot encoding
    X_train, y_train, X_dev, y_dev, X_test, y_test = one_hot_encoding(df, Art)
    # Best params
    params = lr_best_model(X_train, y_train)
    # Best model
    best_lr = train_best_model(X_train, y_train, params)
    # Evaluate model
    evaluate_model(best_lr, X_dev, y_dev)
    
    return best_lr

# Trains logistic regression based on specific attribute
def categorize_attribute(df, column, attribute):
    # Get dummies
    df_attribute = pd.get_dummies(df[column])
    # Join with dummy data
    df_tmp = df.join(df_attribute[attribute])
    # Split data
    X_train, y_train, X_dev, y_dev = train_dev_test_split(df_tmp, attribute)
    # Best params
    params = lr_best_model(X_train, y_train)
    # Best model
    lr = train_best_model(X_train, y_train, params)
    # Evaluate model
    evaluate_model(lr, X_dev, y_dev)
    
    return lr

## 2. Modelling - Logistic Regression
### 2.1 Preparing training and development sets
We split the dataset into training & development and test sets and put the test set aside.

In [4]:
df_train_dev, df_test = train_dev_test_split(df.iloc[:,:-1])

### 2.1 Evaluating different machine learning models
We test 4 different machine learning models with proven excellent records for classification problems like this:
- Logistic regression
- Decision Tree
- Random Forest
- Support Vector Machine
We evaluate the performance of the models by comparing the AUC and F1 scores.

In [19]:
def evaluate_models(df, target_column, model_choices, features_start_col=12, random_state=42):
    # Split features and target
    X = df.iloc[:, features_start_col:]
    y = df[target_column]

    # Split the data
    X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Define available models
    available_models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
        "Random Forest": RandomForestClassifier(random_state=random_state),
        "SVM": SVC(probability=True, random_state=random_state)
    }
    
    # Initialize LabelBinarizer for AUC calculation
    lb = LabelBinarizer()
    y_dev_binarized = lb.fit_transform(y_dev)

    for model_choice in model_choices:
        model = available_models.get(model_choice)
        if model is None:
            print(f"Invalid model choice: {model_choice}. Skipping...")
            continue

        # Fit and evaluate the selected model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_dev)

        # AUC Calculation
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_dev)
        else:  # Use decision function if predict_proba is not available
            y_score = model.decision_function(X_dev)
            y_score = (y_score - y_score.min()) / (y_score.max() - y_score.min())  # Normalize

        if len(lb.classes_) == 2:
            auc_score = roc_auc_score(y_dev_binarized, y_score[:, 1])
        else:
            auc_score = roc_auc_score(y_dev_binarized, y_score, average='macro', multi_class='ovr')

        # Print metrics
        print(f"Model: {model_choice}")
        print(f"Accuracy: {accuracy_score(y_dev, y_pred)}")
        print("Classification Report:")
        print(classification_report(y_dev, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_dev, y_pred))
        print(f"AUC Score: {auc_score}\n")

model_choices = ["Logistic Regression", "Decision Tree", "Random Forest", "SVM"]
evaluate_models(df=df_train_dev, target_column='Diet', model_choices=model_choices, random_state=42)


Model: Logistic Regression
Accuracy: 0.9367088607594937
Classification Report:
              precision    recall  f1-score   support

    carnivor       0.96      0.89      0.92        27
    herbivor       0.98      0.98      0.98        47
     omnivor       0.57      0.80      0.67         5

    accuracy                           0.94        79
   macro avg       0.84      0.89      0.86        79
weighted avg       0.95      0.94      0.94        79

Confusion Matrix:
[[24  1  2]
 [ 0 46  1]
 [ 1  0  4]]
AUC Score: 0.9847013816871972

Model: Decision Tree
Accuracy: 0.8987341772151899
Classification Report:
              precision    recall  f1-score   support

    carnivor       0.96      0.85      0.90        27
    herbivor       0.98      0.94      0.96        47
     omnivor       0.40      0.80      0.53         5

    accuracy                           0.90        79
   macro avg       0.78      0.86      0.80        79
weighted avg       0.93      0.90      0.91        79



#### Logistic Regression

**Strengths**: High accuracy (93.67%), excellent AUC score (0.9847), and strong performance across precision, recall, and F1-score for "carnivor" and "herbivor" classes.  
**Weaknesses**: Lower performance on the "omnivor" class compared to others, though still decent in the context of this problem.  
**Why Choose**: It offers a balanced performance across most metrics, with particularly strong results for two major classes and the best overall AUC, indicating its robustness in distinguishing between classes.  

#### Decision Tree

**Strengths**: Reasonable accuracy (89.87%) and good precision for "carnivor" and "herbivor".  
**Weaknesses**: The model has a lower AUC score (0.9094) compared to others and struggles with the "omnivor" class, indicating potential overfitting or lack of generalization.  
**Why Choose**: Might be preferred for interpretability reasons, as decision trees are easy to understand and visualize. However, performance-wise, it's outclassed by other models here.  

#### Random Forest

**Strengths**: Very high AUC score (0.9910), indicating excellent capability in class separation, and solid accuracy (92.41%). It performs well across all classes, with particularly good results for handling the "omnivor" class.  
**Weaknesses**: Slightly lower accuracy and performance metrics for some classes compared to Logistic Regression.  
**Why Choose**: Offers a good trade-off between accuracy and handling of all classes, with the highest AUC score, making it very competitive, especially if the model's complexity and potential for overfitting are managed well.  

#### SVM

**Strengths**: Good overall accuracy (88.61%) and a decent AUC score (0.9653).  
**Weaknesses**: Struggles more with the "omnivor" class than other models and has the lowest macro averages across precision, recall, and F1-score, suggesting it might not be as effective in balancing class performance.  
**Why Choose**: Could be considered if the specific strengths of SVMs (e.g., handling high-dimensional data) are particularly relevant to the broader context of the problem.  

#### Recommendation

Given the metrics, Logistic Regression and Random Forest stand out as the two best candidates. Logistic Regression offers very high accuracy and a great balance across performance metrics, making it a strong choice for scenarios where interpretability and simplicity are valued alongside performance. Random Forest, on the other hand, provides the best AUC score and good performance across all classes, indicating its strength in handling a diverse set of classification scenarios, albeit at the cost of being more complex and less interpretable than Logistic Regression.

If the goal is to maximize overall performance with a particular emphasis on distinguishing between classes as accurately as possible, Random Forest might be the preferable choice due to its superior AUC score. However, if you prioritize simplicity, interpretability, and still want strong performance across most metrics, Logistic Regression would be a solid choice. The final decision should consider the specific application needs, including the importance of model interpretability, computational resources, and how critical it is to accurately predict each class.


In [20]:
# Now, let's go with LR and RF a level deeper and predict digestion
model_choices = ["Logistic Regression", "Random Forest"]
evaluate_models(df=df_train_dev, target_column='digestion', model_choices=model_choices, random_state=42)

Model: Logistic Regression
Accuracy: 0.9620253164556962
Classification Report:
                  precision    recall  f1-score   support

foregut_ruminant       0.92      0.96      0.94        24
   hindgut_colon       0.94      0.94      0.94        18
          simple       1.00      0.97      0.99        37

        accuracy                           0.96        79
       macro avg       0.95      0.96      0.96        79
    weighted avg       0.96      0.96      0.96        79

Confusion Matrix:
[[23  1  0]
 [ 1 17  0]
 [ 1  0 36]]
AUC Score: 0.9973232653560523

Model: Random Forest
Accuracy: 0.9873417721518988
Classification Report:
                  precision    recall  f1-score   support

foregut_ruminant       0.96      1.00      0.98        24
   hindgut_colon       1.00      0.94      0.97        18
          simple       1.00      1.00      1.00        37

        accuracy                           0.99        79
       macro avg       0.99      0.98      0.98        79
   

#### Logistic Regression

**Strengths**: Exceptional accuracy (96.20%) and an outstanding AUC score (0.9973), demonstrating its strong capability to differentiate between the digestion types. It shows excellent precision, recall, and F1-score across all categories, with "simple" digestion being particularly well-identified.  
**Weaknesses**: While overall performance is high, there's a slight relative weakness in distinguishing the "hindgut_colon" compared to "foregut_ruminant" and "simple", as indicated by a slightly lower precision and recall.  
**Why Choose**: Ideal for scenarios requiring a balance between interpretability and high performance. Its robustness across a broad spectrum of classes, combined with near-perfect AUC, makes it a strong candidate for applications where model understanding and justification are as important as accuracy.

#### Random Forest

**Strengths**: Near-perfect accuracy (98.73%) and a perfect AUC score (1.0), indicating superior performance in classifying digestion types. It showcases unmatched precision and recall, achieving perfect or near-perfect scores across all categories. Notably, it performs flawlessly in classifying "foregut_ruminant" and "simple" digestion types without any misclassification.  
**Weaknesses**: Minimal to none, given the performance metrics. However, the slight misclassification in "hindgut_colon" (though very minimal) shows that no model is entirely without fault. Random Forest models also tend to be less interpretable than Logistic Regression, which might be considered a weakness in contexts requiring transparent decision-making processes.  
**Why Choose**: The go-to model for maximizing predictive accuracy and reliability across all classes. Its perfect AUC score emphasizes its ability to distinguish between classes confidently. This model is particularly valuable in situations where the highest possible accuracy is critical, and the complexity of the model can be managed or is of lesser concern.

#### Comparative Evaluation

While both models perform exceptionally well, Random Forest edges out over Logistic Regression in nearly every metric, particularly in accuracy and AUC, indicating its slightly better capability in handling this specific classification task. The Random Forest model demonstrates its robustness through its adaptability and precision across all classes, making it particularly useful in scenarios where accuracy is paramount and the slight additional complexity is not a deterrent.

However, Logistic Regression still offers compelling reasons for selection, particularly in use cases where model simplicity, interpretability, and easier explanation of predictions are required, alongside high performance. Its excellent performance metrics make it a viable option for many practical applications, especially when trade-offs between complexity and interpretability are considered.


In [22]:
# Now, let's go with LR and RF a level deeper and predict digestion
model_choices = ["Logistic Regression", "Random Forest"]
evaluate_models(df=df_train_dev, target_column='Familie', model_choices=model_choices, random_state=42)

Model: Logistic Regression
Accuracy: 0.8860759493670886
Classification Report:
              precision    recall  f1-score   support

   Ailuridae       1.00      0.80      0.89         5
     Bovidae       1.00      0.94      0.97        17
     Canidae       0.62      0.71      0.67         7
     Equidae       0.94      0.94      0.94        18
     Felidae       0.90      0.86      0.88        22
  Giraffidae       0.67      0.86      0.75         7
     Ursidae       1.00      1.00      1.00         3

    accuracy                           0.89        79
   macro avg       0.88      0.87      0.87        79
weighted avg       0.90      0.89      0.89        79

Confusion Matrix:
[[ 4  0  1  0  0  0  0]
 [ 0 16  0  0  0  1  0]
 [ 0  0  5  0  2  0  0]
 [ 0  0  0 17  0  1  0]
 [ 0  0  2  0 19  1  0]
 [ 0  0  0  1  0  6  0]
 [ 0  0  0  0  0  0  3]]
AUC Score: 0.9664480543286222

Model: Random Forest
Accuracy: 0.8227848101265823
Classification Report:
              precision    recall

#### Logistic Regression

**Strengths**: Demonstrates high precision and recall for several families, including "Ailuridae", "Bovidae", and "Ursidae", with perfect or near-perfect scores, suggesting a strong ability to identify these families accurately. The overall accuracy of 88.61% and an AUC score of 0.9664 indicate robust performance across the board.  
**Weaknesses**: Lower performance for the "Canidae" family, indicated by a precision of 0.62 and recall of 0.71, shows some difficulty in distinguishing this family accurately compared to others. The model also shows variability in its ability to classify "Giraffidae" with lower precision.  
**Why Choose**: It provides a balanced performance with a strong leaning towards accurately predicting certain families. The high overall accuracy and AUC score make it a reliable choice for applications where the specific strengths of this model align with the classification goals, particularly for distinguishing "Ailuridae", "Bovidae", and "Ursidae".

#### Random Forest

**Strengths**: Exhibits excellent precision and recall for "Equidae" with a 0.95 and 1.00, respectively, showing its particular strength in identifying this family. The model also demonstrates good performance for "Felidae".  
**Weaknesses**: Struggles with "Giraffidae" and "Ursidae", as seen in lower recall rates and varying precision, indicating difficulty in consistently identifying these less represented families. The overall lower accuracy of 82.28% and some variability in the precision and recall across families suggest some challenges in generalization compared to Logistic Regression.  
**Why Choose**: While it shows variability in performance across different families, its strengths in classifying "Equidae" and "Felidae" might make it a preferable choice for specific contexts requiring high accuracy in these categories. Its AUC score of 0.9851, despite a lower overall accuracy, suggests a strong capability in distinguishing between classes for certain families.

#### Comparative Evaluation

Logistic Regression is the more consistent performer across a broader range of families, offering higher accuracy and balanced precision and recall for most categories. Its ability to almost perfectly classify several families makes it a versatile and reliable model for general use, especially where fine distinctions between certain families are crucial.

Random Forest, while presenting an unmatched capability in classifying "Equidae" and showing good performance for "Felidae", exhibits some inconsistencies across other families, notably "Giraffidae" and "Ursidae". Despite these challenges, its perfect AUC score suggests a strong underlying capability that, with further tuning or in combination with ensemble techniques, could offer competitive or superior performance in specific contexts.

**Conclusion**: Choosing between Logistic Regression and Random Forest would depend on the specific classification requirements and the importance of accurately predicting certain families. Logistic Regression offers a more balanced and consistent performance across a wider array of classes, making it suitable for general purposes. In contrast, Random Forest, despite its slightly lower overall accuracy, might be preferred in scenarios where its particular strengths align with the classification objectives, supplemented by its strong capability in distinguishing between classes as evidenced by its AUC score.

### 2.2 Classification of Diet
#### 2.2.1 Herbivores

In [31]:
lr_herbivore = categorize_attribute(df_train_dev, 'Diet', 'herbivor')

Best parameters: {'C': 1.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.9746031746031747
Test score: 0.9809523809523809
              precision    recall  f1-score   support

       False       0.98      0.98      0.98        57
        True       0.98      0.98      0.98        48

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105

Confusion Matrix:
 [[56  1]
 [ 1 47]]
AUC: 0.9992690058479532


#### 2.2.2 Carnivores

In [32]:
lr_carnivore = categorize_attribute(df_train_dev, 'Diet', 'carnivor')

Best parameters: {'C': 1.0, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.9523809523809523
Test score: 0.9619047619047619
              precision    recall  f1-score   support

       False       0.95      0.98      0.97        60
        True       0.98      0.93      0.95        45

    accuracy                           0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105

Confusion Matrix:
 [[59  1]
 [ 3 42]]
AUC: 0.9914814814814815


#### 2.2.3 Omnivores

In [33]:
lr_omnivore = categorize_attribute(df_train_dev, 'Diet', 'omnivor')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'none', 'solver': 'saga'}
Best score: 0.9587301587301587
Test score: 0.9428571428571428
              precision    recall  f1-score   support

       False       0.98      0.96      0.97        93
        True       0.71      0.83      0.77        12

    accuracy                           0.94       105
   macro avg       0.85      0.90      0.87       105
weighted avg       0.95      0.94      0.94       105

Confusion Matrix:
 [[89  4]
 [ 2 10]]
AUC: 0.9767025089605735


### 2.3 Classification of family given non-herbivores

In [34]:
# Filter df for non-herbivores
df_train_dev_nh = df_train_dev[df_train_dev.Diet != 'herbivor']

#### 2.3.1 Canidae

In [35]:
lr_canidae = categorize_attribute(df_train_dev_nh, 'Familie', 'Canidae')

Best parameters: {'C': 0.01, 'max_iter': 1000, 'penalty': 'none', 'solver': 'sag'}
Best score: 0.8835294117647059
Test score: 0.8620689655172413
              precision    recall  f1-score   support

       False       0.95      0.86      0.90        44
        True       0.67      0.86      0.75        14

    accuracy                           0.86        58
   macro avg       0.81      0.86      0.83        58
weighted avg       0.88      0.86      0.87        58

Confusion Matrix:
 [[38  6]
 [ 2 12]]
AUC: 0.9107142857142858


#### 2.3.2 Felidae

In [36]:
lr_felidae = categorize_attribute(df_train_dev_nh, 'Familie', 'Felidae')

Best parameters: {'C': 0.01, 'max_iter': 500, 'penalty': 'none', 'solver': 'sag'}
Best score: 0.8662184873949581
Test score: 0.8793103448275862
              precision    recall  f1-score   support

       False       0.87      0.90      0.89        30
        True       0.89      0.86      0.87        28

    accuracy                           0.88        58
   macro avg       0.88      0.88      0.88        58
weighted avg       0.88      0.88      0.88        58

Confusion Matrix:
 [[27  3]
 [ 4 24]]
AUC: 0.9500000000000001


#### 2.3.3 Herpestidae

In [37]:
lr_herpestidae = categorize_attribute(df_train_dev_nh, 'Familie', 'Herpestidae')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.9420168067226891
Test score: 0.9310344827586207
              precision    recall  f1-score   support

       False       0.93      1.00      0.96        54
        True       0.00      0.00      0.00         4

    accuracy                           0.93        58
   macro avg       0.47      0.50      0.48        58
weighted avg       0.87      0.93      0.90        58

Confusion Matrix:
 [[54  0]
 [ 4  0]]
AUC: 0.8194444444444444


#### 2.3.4 Ursidae

In [38]:
lr_ursidae = categorize_attribute(df_train_dev_nh, 'Familie', 'Ursidae')

Best parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.9475630252100841
Test score: 0.9137931034482759
              precision    recall  f1-score   support

       False       0.93      0.96      0.95        45
        True       0.83      0.77      0.80        13

    accuracy                           0.91        58
   macro avg       0.88      0.86      0.87        58
weighted avg       0.91      0.91      0.91        58

Confusion Matrix:
 [[43  2]
 [ 3 10]]
AUC: 0.9333333333333333


### 3.3 Classification by digestion for herbivores

In [39]:
# Filter train dev dataset for herbivores only
df_herbivore = df_train_dev[df_train_dev.Diet == 'herbivor']

#### 3.3.1 Classification by digestion for herbivores - Foregut ruminant

In [40]:
lr_foregut_r = categorize_attribute(df_herbivore, 'digestion', 'foregut_ruminant')

Best parameters: {'C': 1.0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.9928571428571429
Test score: 0.9583333333333334
              precision    recall  f1-score   support

       False       0.92      1.00      0.96        23
        True       1.00      0.92      0.96        25

    accuracy                           0.96        48
   macro avg       0.96      0.96      0.96        48
weighted avg       0.96      0.96      0.96        48

Confusion Matrix:
 [[23  0]
 [ 2 23]]
AUC: 0.9878260869565216


#### 3.3.2 Classification by digestion for herbivores - Hindgut colon

In [41]:
lr_hindgut_co = categorize_attribute(df_herbivore, 'digestion', 'hindgut_colon')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.9859605911330049
Test score: 1.0
              precision    recall  f1-score   support

       False       1.00      1.00      1.00        31
        True       1.00      1.00      1.00        17

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48

Confusion Matrix:
 [[31  0]
 [ 0 17]]
AUC: 1.0


#### 3.3.3 Classification by digestion for herbivores -  Simple

In [42]:
lr_simple = categorize_attribute(df_herbivore, 'digestion', 'simple')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 1.0
Test score: 0.9791666666666666
              precision    recall  f1-score   support

       False       0.98      1.00      0.99        42
        True       1.00      0.83      0.91         6

    accuracy                           0.98        48
   macro avg       0.99      0.92      0.95        48
weighted avg       0.98      0.98      0.98        48

Confusion Matrix:
 [[42  0]
 [ 1  5]]
AUC: 1.0


### 3.5 Ensemble model

In [69]:
def categorize_microbiome(microbiome):
    results = []

    # Initialize lists to store probabilities for AUC calculation
    true_labels = []
    pred_probabilities = []

    # Diet categorization probabilities
    herbivore_prob = lr_herbivore.predict_proba(microbiome)[:, 1]
    carnivore_prob = lr_carnivore.predict_proba(microbiome)[:, 1]
    omnivore_prob = lr_omnivore.predict_proba(microbiome)[:, 1]

    true_labels.append(1 if max(h_prob, c_prob, o_prob) == h_prob else 0)
    pred_probabilities.append(max(h_prob, c_prob, o_prob))

    for idx, (h_prob, c_prob, o_prob) in enumerate(zip(herbivore_prob, carnivore_prob, omnivore_prob)):
        # Categorize diet
        max_diet_prob = max(h_prob, c_prob, o_prob)
        if max_diet_prob == h_prob:
            diet = "herbivor"
            sample = microbiome.iloc[idx].to_numpy().reshape(1, -1)
            foregut_r_prob = lr_foregut_r.predict_proba(sample)[0][1]
            hindgut_co_prob = lr_hindgut_co.predict_proba(sample)[0][1]
            simple_prob = lr_simple.predict_proba(sample)[0][1]

            digestion_prob = {
                "foregut_ruminant": foregut_r_prob,
                "hindgut_colon": hindgut_co_prob,
                "simple": simple_prob
            }
            digestion = max(digestion_prob, key=digestion_prob.get)
            familie = None
            probabilities = h_prob*digestion_prob[digestion]
        elif max_diet_prob == c_prob:
            diet = "carnivor"
            digestion = "simple"

            # Determine 'Familie' based on model probabilities
            sample = microbiome.iloc[idx].to_numpy().reshape(1, -1)
            canidae_prob = lr_canidae.predict_proba(sample)[0][1]
            felidae_prob = lr_felidae.predict_proba(sample)[0][1]
            herpestidae_prob = lr_herpestidae.predict_proba(sample)[0][1]
            ursidae_prob = lr_ursidae.predict_proba(sample)[0][1]

            familie_prob = {
                "Canidae": canidae_prob,
                "Felidae": felidae_prob,
                "Herpestidae": herpestidae_prob,
                "Ursidae": ursidae_prob,
                "Undefined": 0.00000001
            }
            familie = max(familie_prob, key=familie_prob.get)
            probabilities = c_prob*familie_prob[familie]
        else:
            diet = "omnivor"
            digestion = "simple"

            # Determine 'Familie' based on model probabilities
            sample = microbiome.iloc[idx].to_numpy().reshape(1, -1)
            canidae_prob = lr_canidae.predict_proba(sample)[0][1]
            felidae_prob = lr_felidae.predict_proba(sample)[0][1]
            herpestidae_prob = lr_herpestidae.predict_proba(sample)[0][1]
            ursidae_prob = lr_ursidae.predict_proba(sample)[0][1]

            familie_prob = {
                "Canidae": canidae_prob,
                "Felidae": felidae_prob,
                "Herpestidae": herpestidae_prob,
                "Ursidae": ursidae_prob,
                "Undefined": 0.00000001
            }
            familie = max(familie_prob, key=familie_prob.get)
            probabilities = o_prob*familie_prob[familie]

        # Append the result with the original index
        results.append([microbiome.index[idx], diet, digestion, familie, probabilities])

    # Create a DataFrame from the results
    categorized_df = pd.DataFrame(results, columns=['Index', 'Diet_p', 'digestion_p', 'Familie_p','Probabilities_p'])
    categorized_df.set_index('Index', inplace=True)

    return categorized_df

def calculate_auc_for_each_class(microbiome, label, lr_models):
    """
    Calculates and prints AUC for each class against the rest.
    
    Parameters:
    - microbiome: The input features for classification.
    - lr_models: A dictionary of logistic regression models for each class.
    """
    true_classes = microbiome[label] # This should be your actual labels for each sample
    microbiome = microbiome.iloc[:,12:]
    
    # Assuming lr_models is a dict of {'class_name': model} pairs
    for class_name, model in lr_models.items():
        # Get predicted probabilities for being in the current class
        class_prob = model.predict_proba(microbiome)[:, 1]
        
        # Generate binary labels: 1 for current class, 0 for all others
        binary_labels = [1 if c == class_name else 0 for c in true_classes]
        
        # Calculate AUC
        auc_score = roc_auc_score(binary_labels, class_prob)
        print(f"AUC for {class_name} vs. Rest: {auc_score}")

### 3.6 Testing

In [52]:
pred = categorize_microbiome(df_test.iloc[:,12:])
results_test = pred.join(df_test, how='left')
results_test['Familie_p'] = results_test['Familie_p'].fillna('Unknown') # To handle incorrectly identified herbivores

In [70]:
print('Classification report on Diet')
print(classification_report(results_test.Diet, results_test.Diet_p))
print(confusion_matrix(results_test.Diet, results_test.Diet_p))

# AUC for Diet classification models
lr_models = {
    'herbivor': lr_herbivore,
    'carnivor': lr_carnivore,
    'omnivor': lr_omnivore
}

calculate_auc_for_each_class(df_test, 'Diet', lr_models)

Classification report on Diet
              precision    recall  f1-score   support

    carnivor       0.90      0.84      0.87        45
    herbivor       0.94      0.92      0.93        48
     omnivor       0.62      0.83      0.71        12

    accuracy                           0.88       105
   macro avg       0.82      0.86      0.84       105
weighted avg       0.89      0.88      0.88       105

[[38  3  4]
 [ 2 44  2]
 [ 2  0 10]]
AUC for herbivor vs. Rest: 0.9594298245614035
AUC for carnivor vs. Rest: 0.9692592592592593
AUC for omnivor vs. Rest: 0.9578853046594983


In [72]:
print('Classification report on Digestion for Herbivores')
print(classification_report(results_test[results_test.Diet == 'herbivor'].digestion,
                            results_test[results_test.Diet == 'herbivor'].digestion_p))
print(confusion_matrix(results_test[results_test.Diet == 'herbivor'].digestion,
                            results_test[results_test.Diet == 'herbivor'].digestion_p))

# AUC for Digestion classification models
lr_models = {
    'foregut_ruminant': lr_foregut_r,
    'hindgut_colon': lr_hindgut_co,
    'simple': lr_simple
}

calculate_auc_for_each_class(df_test, 'digestion', lr_models)

Classification report on Digestion for Herbivores
                  precision    recall  f1-score   support

foregut_ruminant       1.00      0.96      0.98        25
   hindgut_colon       1.00      1.00      1.00        17
          simple       0.86      1.00      0.92         6

        accuracy                           0.98        48
       macro avg       0.95      0.99      0.97        48
    weighted avg       0.98      0.98      0.98        48

[[24  0  1]
 [ 0 17  0]
 [ 0  0  6]]
AUC for foregut_ruminant vs. Rest: 0.965
AUC for hindgut_colon vs. Rest: 0.9953208556149733
AUC for simple vs. Rest: 0.9947089947089947


In [73]:
print('Classification report on Family for Carnivores and Omnivores')
print(classification_report(results_test[results_test.Diet != 'herbivor'].Familie,
                            results_test[results_test.Diet != 'herbivor'].Familie_p))
print(confusion_matrix(results_test[results_test.Diet != 'herbivor'].Familie,
                            results_test[results_test.Diet != 'herbivor'].Familie_p))

# AUC for Family classification models
lr_models = {
    'Canidae': lr_canidae,
    'Felidae': lr_felidae,
    'Herpestidae': lr_herpestidae,
    'Ursidae': lr_ursidae
}

calculate_auc_for_each_class(df_test, 'Familie', lr_models)

Classification report on Family for Carnivores and Omnivores
              precision    recall  f1-score   support

     Canidae       0.88      1.00      0.93        14
     Felidae       0.91      0.74      0.82        27
 Herpestidae       0.00      0.00      0.00         4
     Unknown       0.00      0.00      0.00         0
     Ursidae       0.69      0.92      0.79        12

    accuracy                           0.79        57
   macro avg       0.49      0.53      0.51        57
weighted avg       0.79      0.79      0.78        57

[[14  0  0  0  0]
 [ 1 20  0  2  4]
 [ 1  1  0  1  1]
 [ 0  0  0  0  0]
 [ 0  1  0  0 11]]
AUC for Canidae vs. Rest: 0.5125588697017268
AUC for Felidae vs. Rest: 0.9468186134852802
AUC for Herpestidae vs. Rest: 0.693069306930693
AUC for Ursidae vs. Rest: 0.9381720430107526
