# Machine-learning classifiers for predicting sample characteristics

In [1]:
import os
import pandas as pd
import numpy as np
import qiime2 as q2
from qiime2 import Visualization
from scipy.stats import shapiro, kruskal, f_oneway
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
data_dir = 'project_data'


if not os.path.isdir(data_dir):
    os.makedirs(data_dir) 

# 1. Data import

In [3]:
#filtered sequence
! wget -nv -O $data_dir/rep-seqs.qza 'https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=Sequences_rep_set.qza'

# Feature Table
! wget -nv -O $data_dir/table.qza 'https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=Feature_table.qza'

# Taxonomy file generate from silva
! wget -nv -O $data_dir/taxonomy_1.qza 'https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=taxonomy_1.qza'

# Phylogeny file 
! wget -nv -O $data_dir/fasttree-tree-rooted.qza 'https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=fasttree-tree-rooted.qza'

# Metadata
! wget -nv -O $data_dir/sample_metadata.tsv 'https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=metadata-nutrition.tsv'

2022-12-16 08:50:46 URL:https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=Sequences_rep_set.qza [390624/390624] -> "project_data/rep-seqs.qza" [1]
2022-12-16 08:50:47 URL:https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=Feature_table.qza [504534/504534] -> "project_data/table.qza" [1]
2022-12-16 08:50:47 URL:https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=taxonomy_1.qza [303165/303165] -> "project_data/taxonomy_1.qza" [1]
2022-12-16 08:50:48 URL:https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=fasttree-tree-rooted.qza [249852/249852] -> "project_data/fasttree-tree-rooted.qza" [1]
2022-12-16 08:50:48 URL:https://polybox.ethz.ch/index.php/s/MBLSUQXzglnn66u/download?path=%2F&files=metadata-nutrition.tsv [300302/300302] -> "project_data/sample_metadata.tsv" [1]


### 1.1 Metadata

In [4]:
metadata_df = pd.read_csv(f'project_data/sample_metadata.tsv', sep='\t', float_precision='round_trip')
metadata_df.head()

Unnamed: 0,sampleid,GEN_age_cat,GEN_age_corrected,GEN_bmi_cat,GEN_bmi_corrected,GEN_cat,GEN_collection_timestamp,GEN_country,GEN_dog,GEN_elevation,...,NUT_probiotic_frequency,NUT_red_meat_frequency,NUT_salted_snacks_frequency,NUT_seafood_frequency,NUT_sugary_sweets_frequency,NUT_vegetable_frequency,NUT_vitamin_b_supplement_frequency,NUT_vitamin_d_supplement_frequency,NUT_whole_eggs,NUT_whole_grain_frequency
0,10317.000046,20s,20.0,Normal,23.75,False,2016-08-25 18:30:00,USA,True,1919.3,...,Rarely,Regularly,Occasionally,Rarely,Occasionally,Occasionally,Never,Never,Daily,Daily
1,10317.00005,Not provided,,Overweight,25.61,False,2016-07-06 09:00:00,United Kingdom,False,65.5,...,Rarely,Rarely,Regularly,Occasionally,Regularly,Regularly,Never,Never,Rarely,Occasionally
2,10317.000038,30s,39.0,Overweight,27.67,False,2016-06-29 09:30:00,United Kingdom,False,44.5,...,Never,Occasionally,Daily,Occasionally,Rarely,Occasionally,Never,Never,Regularly,Occasionally
3,10317.000047,50s,56.0,Normal,19.71,False,2016-07-12 17:30:00,Germany,False,8.7,...,Daily,Occasionally,Rarely,Not provided,Rarely,Regularly,Daily,Daily,Rarely,Regularly
4,10317.000046,40s,45.0,Normal,23.15,False,2016-05-24 19:00:00,United Kingdom,True,68.8,...,Regularly,Never,Never,Occasionally,Never,Daily,Rarely,Occasionally,Regularly,Daily


In [7]:
metadata_df.columns.to_list()

['sampleid',
 'GEN_age_cat',
 'GEN_age_corrected',
 'GEN_bmi_cat',
 'GEN_bmi_corrected',
 'GEN_cat',
 'GEN_collection_timestamp',
 'GEN_country',
 'GEN_dog',
 'GEN_elevation',
 'GEN_geo_loc_name',
 'GEN_height_cm',
 'GEN_host_common_name',
 'GEN_last_move',
 'GEN_last_travel',
 'GEN_latitude',
 'GEN_level_of_education',
 'GEN_longitude',
 'GEN_race',
 'GEN_sample_type',
 'GEN_sex',
 'GEN_weight_kg',
 'NUT_alcohol_consumption',
 'NUT_alcohol_frequency',
 'NUT_alcohol_types_beercider',
 'NUT_alcohol_types_red_wine',
 'NUT_alcohol_types_sour_beers',
 'NUT_alcohol_types_spiritshard_alcohol',
 'NUT_alcohol_types_unspecified',
 'NUT_alcohol_types_white_wine',
 'NUT_artificial_sweeteners',
 'NUT_diet_type',
 'NUT_drinks_per_session',
 'NUT_fed_as_infant',
 'NUT_fermented_plant_frequency',
 'NUT_frozen_dessert_frequency',
 'NUT_fruit_frequency',
 'NUT_gluten',
 'NUT_high_fat_red_meat_frequency',
 'NUT_homecooked_meals_frequency',
 'NUT_lowgrain_diet_type',
 'NUT_meat_eggs_frequency',
 'NUT_mil

# creating feature daable to train feature and metadata

In [75]:
! qiime sample-classifier metatable --help

Usage: [94mqiime sample-classifier metatable[0m [OPTIONS]

  Convert numeric sample metadata from TSV file into a feature table.
  Optionally merge with an existing feature table. Only numeric metadata
  will be converted; categorical columns will be silently dropped. By
  default, if a table is used as input only samples found in both the table
  and metadata (intersection) are merged, and others are silently dropped.
  Set missing_samples="error" to raise an error if samples found in the
  table are missing from the metadata file. The metadata file can always
  contain a superset of samples. Note that columns will be dropped if they
  are non-numeric, contain no unique values (zero variance), contain only
  empty cells, or contain negative values. This method currently only
  converts postive numeric metadata into feature data. Tip: convert
  categorical columns to dummy variables to include them in the output
  feature table.

[1mInputs[0m:
  [94m--i-table[0m ARTIFACT [32mFea

In [76]:
! qiime sample-classifier metatable \
  --i-table $data_dir/table.qza \
  --m-metadata-file $data_dir/sample_metadata.tsv \
  --p-missing-values 'drop_samples' \
  --o-converted-table $data_dir/table-w-metadata.qza

[32mSaved FeatureTable[Frequency] to: project_data/table-w-metadata.qza[0m
[0m

In [79]:
! qiime feature-table summarize \
    --i-table $data_dir/table-w-metadata.qza \
    --m-sample-metadata-file $data_dir/sample_metadata.tsv \
    --o-visualization $data_dir/table-w-metadata.qzv

[32mSaved Visualization to: project_data/table-w-metadata.qzv[0m
[0m

In [80]:
Visualization.load(f'{data_dir}/table-w-metadata.qzv')

# 2. Random Forest classifier

In [16]:
! qiime sample-classifier classify-samples

Usage: [94mqiime sample-classifier classify-samples[0m [OPTIONS]

  Predicts a categorical sample metadata column using a supervised learning
  classifier. Splits input data into training and test sets. The training
  set is used to train and test the estimator using a stratified k-fold
  cross-validation scheme. This includes optional steps for automated
  feature extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

[1mInputs[0m:
  [94m[4m--i-table[0m ARTIFACT [32mFeatureTable[Frequency][0m
                       Feature table containing all features that should be
                       used for target prediction.                  [35m[required][0m
[1mParameters[0m:
  [94m[4m--m-metadata-file[0m METADATA
  [94m[4m--m-metadata-column[0m COLUMN  [3

In [81]:
! qiime sample-classifier classify-samples \
    --i-table $data_dir/table-w-metadata.qza \
    --m-metadata-file $data_dir/sample_metadata.tsv \
    --m-metadata-column 'NUT_alcohol_frequency' \
    --p-test-size 0.2 \
    --p-estimator RandomForestClassifier \
    --p-random-state 14 \
    --p-n-estimators 200 \
    --p-optimize-feature-selection True \
    --p-parameter-tuning True \
    --p-missing-samples 'ignore' \
    --p-cv 10 \
    --output-dir $data_dir/RF-ALC-freq-4

[32mSaved SampleEstimator[Classifier] to: project_data/RF-ALC-freq-4/sample_estimator.qza[0m
[32mSaved FeatureData[Importance] to: project_data/RF-ALC-freq-4/feature_importance.qza[0m
[32mSaved SampleData[ClassifierPredictions] to: project_data/RF-ALC-freq-4/predictions.qza[0m
[32mSaved Visualization to: project_data/RF-ALC-freq-4/model_summary.qzv[0m
[32mSaved Visualization to: project_data/RF-ALC-freq-4/accuracy_results.qzv[0m
[32mSaved SampleData[Probabilities] to: project_data/RF-ALC-freq-4/probabilities.qza[0m
[32mSaved Visualization to: project_data/RF-ALC-freq-4/heatmap.qzv[0m
[32mSaved SampleData[TrueTargets] to: project_data/RF-ALC-freq-4/training_targets.qza[0m
[32mSaved SampleData[TrueTargets] to: project_data/RF-ALC-freq-4/test_targets.qza[0m
[0m

In [82]:
Visualization.load(f'{data_dir}/RF-ALC-freq-4/accuracy_results.qzv')

In [83]:
! qiime sample-classifier classify-samples \
    --i-table $data_dir/table-w-metadata.qza \
    --m-metadata-file $data_dir/sample_metadata.tsv \
    --m-metadata-column 'NUT_alcohol_consumption' \
    --p-test-size 0.2 \
    --p-estimator RandomForestClassifier \
    --p-random-state 14 \
    --p-n-estimators 200 \
    --p-optimize-feature-selection True \
    --p-parameter-tuning True \
    --p-missing-samples 'ignore' \
    --p-cv 5 \
    --output-dir $data_dir/RF-ALC-cons-2

[32mSaved SampleEstimator[Classifier] to: project_data/RF-ALC-cons-2/sample_estimator.qza[0m
[32mSaved FeatureData[Importance] to: project_data/RF-ALC-cons-2/feature_importance.qza[0m
[32mSaved SampleData[ClassifierPredictions] to: project_data/RF-ALC-cons-2/predictions.qza[0m
[32mSaved Visualization to: project_data/RF-ALC-cons-2/model_summary.qzv[0m
[32mSaved Visualization to: project_data/RF-ALC-cons-2/accuracy_results.qzv[0m
[32mSaved SampleData[Probabilities] to: project_data/RF-ALC-cons-2/probabilities.qza[0m
[32mSaved Visualization to: project_data/RF-ALC-cons-2/heatmap.qzv[0m
[32mSaved SampleData[TrueTargets] to: project_data/RF-ALC-cons-2/training_targets.qza[0m
[32mSaved SampleData[TrueTargets] to: project_data/RF-ALC-cons-2/test_targets.qza[0m
[0m

In [84]:
Visualization.load(f'{data_dir}/RF-ALC-cons-2/accuracy_results.qzv')

In [13]:
! qiime sample-classifier classify-samples \
  --i-table $data_dir/table.qza \
  --m-metadata-file $data_dir/sample_metadata.tsv \
  --m-metadata-column 'NUT_alcohol_consumption' \
  --p-random-state 666 \
  --p-n-jobs 1 \
  --output-dir $data_dir/Alcohol-classifier-results/

[32mSaved SampleEstimator[Classifier] to: project_data/Alcohol-classifier-results/sample_estimator.qza[0m
[32mSaved FeatureData[Importance] to: project_data/Alcohol-classifier-results/feature_importance.qza[0m
[32mSaved SampleData[ClassifierPredictions] to: project_data/Alcohol-classifier-results/predictions.qza[0m
[32mSaved Visualization to: project_data/Alcohol-classifier-results/model_summary.qzv[0m
[32mSaved Visualization to: project_data/Alcohol-classifier-results/accuracy_results.qzv[0m
[32mSaved SampleData[Probabilities] to: project_data/Alcohol-classifier-results/probabilities.qza[0m
[32mSaved Visualization to: project_data/Alcohol-classifier-results/heatmap.qzv[0m
[32mSaved SampleData[TrueTargets] to: project_data/Alcohol-classifier-results/training_targets.qza[0m
[32mSaved SampleData[TrueTargets] to: project_data/Alcohol-classifier-results/test_targets.qza[0m
[0m

In [14]:
Visualization.load(f'{data_dir}/Alcohol-classifier-results/heatmap.qzv')

In [15]:
Visualization.load(f'{data_dir}/Alcohol-classifier-results/accuracy_results.qzv')

# 3. Linear SVC

In [92]:
! qiime sample-classifier classify-samples \
    --i-table $data_dir/table-w-metadata.qza \
    --m-metadata-file $data_dir/sample_metadata.tsv \
    --m-metadata-column 'NUT_alcohol_frequency' \
    --p-test-size 0.1 \
    --p-estimator LinearSVC \
    --p-random-state 14 \
    --p-n-estimators 100 \
    --p-optimize-feature-selection False \
    --p-parameter-tuning False \
    --p-cv 5 \
    --output-dir $data_dir/LinearSVC-Alc-freq

[32mSaved SampleEstimator[Classifier] to: project_data/LinearSVC-Alc-freq/sample_estimator.qza[0m
[32mSaved FeatureData[Importance] to: project_data/LinearSVC-Alc-freq/feature_importance.qza[0m
[32mSaved SampleData[ClassifierPredictions] to: project_data/LinearSVC-Alc-freq/predictions.qza[0m
[32mSaved Visualization to: project_data/LinearSVC-Alc-freq/model_summary.qzv[0m
[32mSaved Visualization to: project_data/LinearSVC-Alc-freq/accuracy_results.qzv[0m
[32mSaved SampleData[Probabilities] to: project_data/LinearSVC-Alc-freq/probabilities.qza[0m
[32mSaved Visualization to: project_data/LinearSVC-Alc-freq/heatmap.qzv[0m
[32mSaved SampleData[TrueTargets] to: project_data/LinearSVC-Alc-freq/training_targets.qza[0m
[32mSaved SampleData[TrueTargets] to: project_data/LinearSVC-Alc-freq/test_targets.qza[0m
[0m

In [93]:
Visualization.load(f'{data_dir}/LinearSVC-Alc-freq/accuracy_results.qzv')

# 3. KNeighborsClassifier Classifier

In [50]:
! qiime sample-classifier classify-samples \
  --i-table $data_dir/table.qza \
  --m-metadata-file $data_dir/sample_metadata.tsv \
  --m-metadata-column 'NUT_alcohol_consumption' \
  --p-test-size 0.1 \
  --p-estimator KNeighborsClassifier \
  --p-random-state 14 \
  --output-dir $data_dir/KNeighborsClassifier-Alc-cons

[32mSaved SampleEstimator[Classifier] to: project_data/KNeighborsClassifier-Alc-cons/sample_estimator.qza[0m
[32mSaved FeatureData[Importance] to: project_data/KNeighborsClassifier-Alc-cons/feature_importance.qza[0m
[32mSaved SampleData[ClassifierPredictions] to: project_data/KNeighborsClassifier-Alc-cons/predictions.qza[0m
[32mSaved Visualization to: project_data/KNeighborsClassifier-Alc-cons/model_summary.qzv[0m
[32mSaved Visualization to: project_data/KNeighborsClassifier-Alc-cons/accuracy_results.qzv[0m
[32mSaved SampleData[Probabilities] to: project_data/KNeighborsClassifier-Alc-cons/probabilities.qza[0m
[32mSaved Visualization to: project_data/KNeighborsClassifier-Alc-cons/heatmap.qzv[0m
[32mSaved SampleData[TrueTargets] to: project_data/KNeighborsClassifier-Alc-cons/training_targets.qza[0m
[32mSaved SampleData[TrueTargets] to: project_data/KNeighborsClassifier-Alc-cons/test_targets.qza[0m
[0m

In [51]:
Visualization.load(f'{data_dir}/KNeighborsClassifier-Alc-cons/accuracy_results.qzv')

# 4. Regressor

## 4.1 

In [87]:
! qiime sample-classifier regress-samples --help

Usage: [94mqiime sample-classifier regress-samples[0m [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

[1mInputs[0m:
  [94m[4m--i-table[0m ARTIFACT [32mFeatureTable[Frequency][0m
                       Feature table containing all features that should be
                       used for target prediction.                  [35m[required][0m
[1mParameters[0m:
  [94m[4m--m-metadata-file[0m METADATA
  [94m[4m--m-metadata-column[0m COLUMN  [32m

In [88]:
! qiime sample-classifier regress-samples \
  --i-table $data_dir/table-w-metadata.qza \
  --m-metadata-file $data_dir/sample_metadata.tsv \
  --m-metadata-column GEN_age_corrected \
  --p-test-size 0.2 \
  --p-estimator Lasso \
  --p-random-state 22 \
  --output-dir $data_dir/Lasso-regressor

[32mSaved SampleEstimator[Regressor] to: project_data/Lasso-regressor/sample_estimator.qza[0m
[32mSaved FeatureData[Importance] to: project_data/Lasso-regressor/feature_importance.qza[0m
[32mSaved SampleData[RegressorPredictions] to: project_data/Lasso-regressor/predictions.qza[0m
[32mSaved Visualization to: project_data/Lasso-regressor/model_summary.qzv[0m
[32mSaved Visualization to: project_data/Lasso-regressor/accuracy_results.qzv[0m
[0m

In [89]:
Visualization.load(f'{data_dir}/Lasso-regressor/accuracy_results.qzv')