# Categorizing zoo animal species by microbiome

## 1. Setup
### 1.1 Libraries

In [1]:
import pandas as pd
import numpy as np
import altair as alt

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Suppress warnings (for presentation purposes only)
import warnings
warnings.filterwarnings("ignore")

### 1.2 Data import
Data has been preprocessed in R. We removed all animal species with fewer than 20 samples.

In [2]:
# Load the preprocessed table
df = pd.read_csv('data/data_clean.csv')

# Distinct rows of the first nine columns, ordered by family, genus and species
metadata = (
    df.iloc[:, :9]
    .drop_duplicates()
    .sort_values(['Familie', 'Gattung', 'Art'])
    .reset_index(drop=True)
)

# Diet / digestion lookup tables, one per family and one per genus
metadata_familie = (
    metadata[['Familie', 'Diet', 'digestion']]
    .drop_duplicates()
    .reset_index(drop=True)
)
metadata_gattung = (
    metadata[['Gattung', 'Diet', 'digestion']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# The 'index' string carries the zoo (first three characters) and the
# individual (everything from the eighth character on)
df.insert(0, 'Zoo', df['index'].str[:3])
df.insert(1, 'AnimalID', df['index'].str[7:])

### 1.3 Function library

In [3]:
def train_dev_test_split(df, y_name="", test_size=0.2, random_state=42):
    """Split ``df`` into two partitions, stratifying whenever possible.

    The function has two modes, selected by ``y_name``:

    * ``y_name == ""`` -- hold out ``test_size`` of the rows, stratified on
      the 'Art' column, and return the two frames ``(train, test)``.
    * any other ``y_name`` -- hold out ``test_size / (1 - test_size)`` of the
      rows, stratified on ``y_name``, and return
      ``(X_train, y_train, X_test, y_test)``. The feature frames hold the
      columns from position 12 onwards, without the target column (the
      target is only dropped when it is not 'Art').

    Stratification is skipped when the rarest class of the stratification
    column does not have more than one row.
    """
    with_target = y_name != ""
    strat_column = y_name if with_target else 'Art'
    holdout = test_size / (1 - test_size) if with_target else test_size

    # Stratify only if every class has more than one member
    stratifiable = df[strat_column].value_counts().min() > 1
    strat_values = df[strat_column] if stratifiable else None

    train, test = train_test_split(
        df,
        test_size=holdout,
        random_state=random_state,
        stratify=strat_values,
    )

    if not with_target:
        return train, test

    def features_and_target(part):
        # Input variables start at column position 12
        features = part.iloc[:, 12:]
        if y_name != 'Art':
            features = features.drop([y_name], axis=1)
        return features, part[y_name]

    X_train, y_train = features_and_target(train)
    X_test, y_test = features_and_target(test)
    return X_train, y_train, X_test, y_test

# One-hot encoded data
def one_hot_encoding(df, Art):
    """Build train / dev / test sets for a one-vs-rest species classifier.

    The 'Art' column of ``df`` is one-hot encoded and the indicator column of
    the requested species is appended as the binary target, replacing the
    last column of ``df``.

    ``train_dev_test_split`` only ever yields two partitions per call (and
    four return values when given a target), so the three-way split is done
    in two steps: first the test set is held out, then the remainder is
    split into training and development sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with an 'Art' column; feature columns start at position 12.
    Art : str
        Species (a value of ``df.Art``) to use as the positive class.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_dev, y_dev, X_test, y_test)``.
    """
    # One-hot encoding of the species column
    species_dummies = pd.get_dummies(df.Art)
    # Drop the last column of df and append the binary target for `Art`
    df_tmp = df.iloc[:, :-1].join(species_dummies[Art])

    # Step 1: hold out the test set (no target given -> two frames)
    df_train_dev, df_test = train_dev_test_split(df_tmp)
    # Step 2: split the remainder into training and development sets
    X_train, y_train, X_dev, y_dev = train_dev_test_split(df_train_dev, Art)

    # Same feature layout train_dev_test_split uses: columns from position 12
    # onwards, without the target column
    X_test = df_test.iloc[:, 12:].drop(columns=[Art])
    y_test = df_test[Art]

    return X_train, y_train, X_dev, y_dev, X_test, y_test

# Find best parameters using GridSearchCV for logistic regression
def lr_best_model(X_train, y_train):
    """Grid-search logistic-regression hyperparameters with 5-fold CV.

    Three grids are searched, one per group of solvers, each paired with the
    penalties listed for that group. Every grid uses the same values for
    ``C`` and ``max_iter``. The best parameters and the best score are
    printed, and the best parameters are returned as a dict.
    """
    c_values = [0.01, 0.1, 1.0, 10.0, 100.0]
    iteration_caps = [100, 500, 1000]

    # (solvers, penalties searched with them)
    solver_groups = [
        (['liblinear'], ['l1', 'l2']),
        (['newton-cg', 'lbfgs', 'sag'], ['l2', 'none']),
        (['saga'], ['l1', 'l2', 'none']),
    ]
    param_grids = [
        {
            'penalty': penalties,
            'C': list(c_values),
            'solver': solvers,
            'max_iter': list(iteration_caps),
        }
        for solvers, penalties in solver_groups
    ]

    # 5-fold cross-validated search over all grids, using every core
    search = GridSearchCV(
        estimator=LogisticRegression(random_state=42),
        param_grid=param_grids,
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Report the winning configuration
    print(f"Best parameters: {search.best_params_}")
    print(f"Best score: {search.best_score_}")

    return search.best_params_

def train_best_model(X_train, y_train, params):
    """Fit a logistic regression configured with ``params``.

    ``params`` is expanded into keyword arguments of ``LogisticRegression``;
    ``random_state`` is always fixed to 42. The fitted model is returned.
    """
    model = LogisticRegression(random_state=42, **params)
    model.fit(X_train, y_train)
    return model

def evaluate_model(best_lr, X_dev, y_dev):
    """Print evaluation metrics of ``best_lr`` on ``(X_dev, y_dev)``.

    Prints, in order: the model's ``score`` on the data, the classification
    report and the confusion matrix of its predictions. Returns nothing.
    """
    # Score reported by the model itself on the supplied data
    print(f"Test score: {best_lr.score(X_dev, y_dev)}")

    # Per-class metrics and confusion matrix from the predictions
    predictions = best_lr.predict(X_dev)
    print(classification_report(y_dev, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_dev, predictions))
    
def best_lr(df, Art):
    """Tune, fit and evaluate a one-vs-rest logistic regression for ``Art``.

    Takes the six data sets returned by ``one_hot_encoding``, searches the
    hyperparameters on the training set, fits a model with the best ones,
    prints its evaluation on the development set and returns the model.
    The test partition is not used here.
    """
    # One-hot encoding and data split
    X_train, y_train, X_dev, y_dev, _X_test, _y_test = one_hot_encoding(df, Art)

    # Hyperparameter search, then fit with the winning parameters
    best_params = lr_best_model(X_train, y_train)
    fitted_model = train_best_model(X_train, y_train, best_params)

    # Report performance on the development set
    evaluate_model(fitted_model, X_dev, y_dev)

    return fitted_model

# Trains logistic regression based on specific attribute
def categorize_attribute(df, column, attribute):
    """Fit a binary logistic regression for one value of a categorical column.

    ``df[column]`` is one-hot encoded and the indicator column named
    ``attribute`` is joined to ``df`` as the target. The data is then split
    into training and development sets, hyperparameters are grid-searched on
    the training set, a model is fitted with the best parameters, its
    evaluation on the development set is printed, and the model is returned.
    """
    # Indicator column for the requested attribute value
    indicator = pd.get_dummies(df[column])[attribute]
    labelled = df.join(indicator)

    # Training / development split with the indicator as target
    X_train, y_train, X_dev, y_dev = train_dev_test_split(labelled, attribute)

    # Search hyperparameters, then fit with the best ones
    best_params = lr_best_model(X_train, y_train)
    model = train_best_model(X_train, y_train, best_params)

    # Report performance on the development set
    evaluate_model(model, X_dev, y_dev)

    return model

## 2. Modelling - Logistic Regression
### 2.1 Preparing training and development sets
We split the dataset into training & development and test sets and put the test set aside.

In [4]:
# Hold out the test set; every column of df except the last one is passed on
df_train_dev, df_test = train_dev_test_split(df=df.iloc[:, :-1])

### 2.2 Classification of Diet
#### 2.2.1 Herbivores

In [5]:
# Binary model: is the sample's Diet 'herbivor'?
lr_herbivore = categorize_attribute(df=df_train_dev, column='Diet', attribute='herbivor')

Best parameters: {'C': 1.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.9746031746031747
Test score: 0.9809523809523809
              precision    recall  f1-score   support

       False       0.98      0.98      0.98        57
        True       0.98      0.98      0.98        48

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105

Confusion Matrix:
 [[56  1]
 [ 1 47]]


#### 2.2.2 Carnivores

In [6]:
# Binary model: is the sample's Diet 'carnivor'?
lr_carnivore = categorize_attribute(df=df_train_dev, column='Diet', attribute='carnivor')

Best parameters: {'C': 1.0, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.9523809523809523
Test score: 0.9619047619047619
              precision    recall  f1-score   support

       False       0.95      0.98      0.97        60
        True       0.98      0.93      0.95        45

    accuracy                           0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105

Confusion Matrix:
 [[59  1]
 [ 3 42]]


#### 2.2.3 Omnivores

In [7]:
# Binary model: is the sample's Diet 'omnivor'?
lr_omnivore = categorize_attribute(df=df_train_dev, column='Diet', attribute='omnivor')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'none', 'solver': 'saga'}
Best score: 0.9587301587301587
Test score: 0.9428571428571428
              precision    recall  f1-score   support

       False       0.98      0.96      0.97        93
        True       0.71      0.83      0.77        12

    accuracy                           0.94       105
   macro avg       0.85      0.90      0.87       105
weighted avg       0.95      0.94      0.94       105

Confusion Matrix:
 [[89  4]
 [ 2 10]]


### 2.3 Classification of family given non-herbivores

In [8]:
# Keep only the train/dev rows whose Diet is not 'herbivor'
df_train_dev_nh = df_train_dev.loc[df_train_dev['Diet'] != 'herbivor']

#### 2.3.1 Canidae

In [9]:
# Binary model on non-herbivores: is the sample's Familie 'Canidae'?
lr_canidae = categorize_attribute(df=df_train_dev_nh, column='Familie', attribute='Canidae')

Best parameters: {'C': 0.01, 'max_iter': 1000, 'penalty': 'none', 'solver': 'sag'}
Best score: 0.8835294117647059
Test score: 0.8620689655172413
              precision    recall  f1-score   support

       False       0.95      0.86      0.90        44
        True       0.67      0.86      0.75        14

    accuracy                           0.86        58
   macro avg       0.81      0.86      0.83        58
weighted avg       0.88      0.86      0.87        58

Confusion Matrix:
 [[38  6]
 [ 2 12]]


#### 2.3.2 Felidae

In [10]:
# Binary model on non-herbivores: is the sample's Familie 'Felidae'?
lr_felidae = categorize_attribute(df=df_train_dev_nh, column='Familie', attribute='Felidae')

Best parameters: {'C': 0.01, 'max_iter': 500, 'penalty': 'none', 'solver': 'sag'}
Best score: 0.8662184873949581
Test score: 0.8793103448275862
              precision    recall  f1-score   support

       False       0.87      0.90      0.89        30
        True       0.89      0.86      0.87        28

    accuracy                           0.88        58
   macro avg       0.88      0.88      0.88        58
weighted avg       0.88      0.88      0.88        58

Confusion Matrix:
 [[27  3]
 [ 4 24]]


#### 2.3.3 Herpestidae

In [11]:
# Binary model on non-herbivores: is the sample's Familie 'Herpestidae'?
lr_herpestidae = categorize_attribute(df=df_train_dev_nh, column='Familie', attribute='Herpestidae')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score: 0.9420168067226891
Test score: 0.9310344827586207
              precision    recall  f1-score   support

       False       0.93      1.00      0.96        54
        True       0.00      0.00      0.00         4

    accuracy                           0.93        58
   macro avg       0.47      0.50      0.48        58
weighted avg       0.87      0.93      0.90        58

Confusion Matrix:
 [[54  0]
 [ 4  0]]


#### 2.3.4 Ursidae

In [12]:
# Binary model on non-herbivores: is the sample's Familie 'Ursidae'?
lr_ursidae = categorize_attribute(df=df_train_dev_nh, column='Familie', attribute='Ursidae')

Best parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score: 0.9475630252100841
Test score: 0.9137931034482759
              precision    recall  f1-score   support

       False       0.93      0.96      0.95        45
        True       0.83      0.77      0.80        13

    accuracy                           0.91        58
   macro avg       0.88      0.86      0.87        58
weighted avg       0.91      0.91      0.91        58

Confusion Matrix:
 [[43  2]
 [ 3 10]]


### 3.3 Classification by digestion for herbivores

In [13]:
# Keep only the train/dev rows whose Diet is 'herbivor'
df_herbivore = df_train_dev.loc[df_train_dev['Diet'] == 'herbivor']

#### 3.3.1 Classification by digestion for herbivores - Foregut ruminant

In [14]:
# Binary model on herbivores: is the digestion type 'foregut_ruminant'?
lr_foregut_r = categorize_attribute(df=df_herbivore, column='digestion', attribute='foregut_ruminant')

Best parameters: {'C': 1.0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.9928571428571429
Test score: 0.9583333333333334
              precision    recall  f1-score   support

       False       0.92      1.00      0.96        23
        True       1.00      0.92      0.96        25

    accuracy                           0.96        48
   macro avg       0.96      0.96      0.96        48
weighted avg       0.96      0.96      0.96        48

Confusion Matrix:
 [[23  0]
 [ 2 23]]


#### 3.3.2 Classification by digestion for herbivores - Hindgut colon

In [15]:
# Binary model on herbivores: is the digestion type 'hindgut_colon'?
lr_hindgut_co = categorize_attribute(df=df_herbivore, column='digestion', attribute='hindgut_colon')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.9859605911330049
Test score: 1.0
              precision    recall  f1-score   support

       False       1.00      1.00      1.00        31
        True       1.00      1.00      1.00        17

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48

Confusion Matrix:
 [[31  0]
 [ 0 17]]


#### 3.3.3 Classification by digestion for herbivores -  Simple

In [16]:
# Binary model on herbivores: is the digestion type 'simple'?
lr_simple = categorize_attribute(df=df_herbivore, column='digestion', attribute='simple')

Best parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 1.0
Test score: 0.9791666666666666
              precision    recall  f1-score   support

       False       0.98      1.00      0.99        42
        True       1.00      0.83      0.91         6

    accuracy                           0.98        48
   macro avg       0.99      0.92      0.95        48
weighted avg       0.98      0.98      0.98        48

Confusion Matrix:
 [[42  0]
 [ 1  5]]


### 3.5 Ensemble model

In [17]:
def categorize_microbiome(microbiome):
    """Predict diet, digestion type and family for every microbiome sample.

    Combines the binary logistic-regression models fitted earlier in the
    notebook (module-level names ``lr_herbivore``, ``lr_carnivore``,
    ``lr_omnivore``, ``lr_foregut_r``, ``lr_hindgut_co``, ``lr_simple``,
    ``lr_canidae``, ``lr_felidae``, ``lr_herpestidae``, ``lr_ursidae``):

    * Diet is the label whose model gives the highest positive-class
      probability (ties resolve in the order herbivor, carnivor, omnivor).
    * Samples predicted as herbivores get the digestion type with the
      highest probability and no family (``None``).
    * All other samples get digestion ``"simple"`` and the family with the
      highest probability; ``"Undefined"`` acts as a floor of 1e-8.

    Each model is queried once for the whole frame instead of once per row,
    and always receives the DataFrame itself, the same input form already
    used for the diet models.

    Parameters
    ----------
    microbiome : pandas.DataFrame
        One row per sample, holding the feature columns the models expect.

    Returns
    -------
    pandas.DataFrame
        Columns 'Diet_p', 'digestion_p' and 'Familie_p', indexed by the
        input's index values (index name 'Index').
    """
    result_columns = ['Diet_p', 'digestion_p', 'Familie_p']

    # Nothing to predict: return an empty frame with the expected layout
    if len(microbiome) == 0:
        return pd.DataFrame(columns=result_columns, index=pd.Index([], name='Index'))

    def positive_proba(models):
        # One batched call per model; the second predict_proba column is
        # taken as the positive-class probability. Result: samples x models.
        return np.column_stack([m.predict_proba(microbiome)[:, 1] for m in models])

    diet_names = np.array(["herbivor", "carnivor", "omnivor"], dtype=object)
    digestion_names = np.array(["foregut_ruminant", "hindgut_colon", "simple"], dtype=object)
    family_names = np.array(
        ["Canidae", "Felidae", "Herpestidae", "Ursidae", "Undefined"], dtype=object
    )

    diet_prob = positive_proba([lr_herbivore, lr_carnivore, lr_omnivore])
    digestion_prob = positive_proba([lr_foregut_r, lr_hindgut_co, lr_simple])
    family_prob = positive_proba([lr_canidae, lr_felidae, lr_herpestidae, lr_ursidae])
    # Constant floor so that "Undefined" wins only if no family model
    # assigns a probability above it
    undefined_floor = np.full(len(microbiome), 0.00000001)
    family_prob = np.column_stack([family_prob, undefined_floor])

    # argmax returns the first maximum, so ties resolve in listing order
    diet = diet_names[diet_prob.argmax(axis=1)]
    is_herbivore = diet == "herbivor"

    # Digestion is only modelled for herbivores; everything else is "simple"
    digestion = np.where(is_herbivore, digestion_names[digestion_prob.argmax(axis=1)], "simple")
    # Family is only modelled for non-herbivores
    familie = np.where(is_herbivore, None, family_names[family_prob.argmax(axis=1)])

    categorized_df = pd.DataFrame({
        'Index': list(microbiome.index),
        'Diet_p': diet.tolist(),
        'digestion_p': digestion.tolist(),
        'Familie_p': familie.tolist(),
    })
    return categorized_df.set_index('Index')


### 3.6 Testing

In [18]:
# Predict diet, digestion and family for the held-out test samples
# (columns from position 12 onwards are passed as features)
pred = categorize_microbiome(df_test.iloc[:, 12:])

# Attach the test-set columns to the predictions. Samples predicted as
# herbivores carry no family prediction, so they are labelled 'Unknown'.
results_test = (
    pred
    .join(df_test, how='left')
    .assign(Familie_p=lambda joined: joined['Familie_p'].fillna('Unknown'))
)

In [19]:
# True vs. predicted diet over the whole test set
print('Classification report on Diet')
diet_true, diet_pred = results_test['Diet'], results_test['Diet_p']
print(classification_report(diet_true, diet_pred))
print(confusion_matrix(diet_true, diet_pred))

Classification report on Diet
              precision    recall  f1-score   support

    carnivor       0.90      0.84      0.87        45
    herbivor       0.94      0.92      0.93        48
     omnivor       0.62      0.83      0.71        12

    accuracy                           0.88       105
   macro avg       0.82      0.86      0.84       105
weighted avg       0.89      0.88      0.88       105

[[38  3  4]
 [ 2 44  2]
 [ 2  0 10]]


In [20]:
# True vs. predicted digestion type, restricted to test samples whose true
# diet is 'herbivor'
print('Classification report on Digestion for Herbivores')
herbivores_test = results_test.loc[results_test['Diet'] == 'herbivor']
print(classification_report(herbivores_test['digestion'], herbivores_test['digestion_p']))
print(confusion_matrix(herbivores_test['digestion'], herbivores_test['digestion_p']))

Classification report on Digestion for Herbivores
                  precision    recall  f1-score   support

foregut_ruminant       1.00      0.96      0.98        25
   hindgut_colon       1.00      1.00      1.00        17
          simple       0.86      1.00      0.92         6

        accuracy                           0.98        48
       macro avg       0.95      0.99      0.97        48
    weighted avg       0.98      0.98      0.98        48

[[24  0  1]
 [ 0 17  0]
 [ 0  0  6]]


In [21]:
# True vs. predicted family, restricted to test samples whose true diet is
# not 'herbivor'
print('Classification report on Family for Carnivores and Omnivores')
non_herbivores_test = results_test.loc[results_test['Diet'] != 'herbivor']
print(classification_report(non_herbivores_test['Familie'], non_herbivores_test['Familie_p']))
print(confusion_matrix(non_herbivores_test['Familie'], non_herbivores_test['Familie_p']))

Classification report on Family for Carnivores and Omnivores
              precision    recall  f1-score   support

     Canidae       0.88      1.00      0.93        14
     Felidae       0.91      0.74      0.82        27
 Herpestidae       0.00      0.00      0.00         4
     Unknown       0.00      0.00      0.00         0
     Ursidae       0.69      0.92      0.79        12

    accuracy                           0.79        57
   macro avg       0.49      0.53      0.51        57
weighted avg       0.79      0.79      0.78        57

[[14  0  0  0  0]
 [ 1 20  0  2  4]
 [ 1  1  0  1  1]
 [ 0  0  0  0  0]
 [ 0  1  0  0 11]]
