# Prepare trait and 16S rRNA gene sequence data

## Environment settings
```sh
# Working directory: every relative path in this document is resolved from here.
# NOTE(review): "Bac2fFeature" may be a typo for the repository directory name — confirm.
cd Bac2fFeature/scripts/01_data_preparation
# Create the Conda environments from the exported spec files
conda create --name bac2feature_experiment --file ../../environment/env_bac2feature_experiment.txt
conda create --name qiime2-2023.5 --file ../../environment/env_qiime2-2023.5-py38-linux-conda.txt
conda create --name r_bac2feature --file ../../environment/env_r_bac2feature.txt
# Activate the environment the pipeline is installed into
conda activate bac2feature_experiment
# Install the Bac2Feature pipeline from a throw-away clone
# (the clone is only needed for `pip install` and is removed afterwards)
git clone https://github.com/fuyo780/Bac2Feature.git
pip install Bac2Feature/bac2feature
rm -rf Bac2Feature
# Output directories
directories=(
    "../../data/"
    "../../data/silva"
    "../../data/trait"
    "../../data/intermediate_dir"
    "../../data/rnacental"
    "../../data/emp"
    "../../data/t1d_samples"
)
# `mkdir -p` is a no-op for directories that already exist, so no existence
# check is needed before creating them.
mkdir -p "${directories[@]}"
```

## Download

### Trait database (Madin et al., 2020)
Downloaded on 2024.06.06
```sh
# Fetch the condensed species-level trait table (NCBI taxonomy).
# --no-clobber leaves an already-downloaded copy untouched.
wget --no-clobber --directory-prefix=../../data/trait \
    https://raw.githubusercontent.com/bacteria-archaea-traits/bacteria-archaea-traits/master/output/condensed_species_NCBI.csv
```

### SILVA 16S rRNA seqs (version 138.1)
Downloaded on 2024.06.06
```sh
# Destination directory and release root for the SILVA 138.1 downloads
silva_dir="../../data/silva"
silva_release="https://www.arb-silva.de/fileadmin/silva_databases/release_138.1"
# Files to fetch, given relative to the release root
silva_files=(
    # Full 16S rRNA sequences
    "Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz"
    # Multiple sequence alignment
    "Exports/SILVA_138.1_SSURef_NR99_tax_silva_full_align_trunc.fasta.gz"
    # Phylogenetic tree derived from 16S rRNA seqs
    "user/guide-trees/SILVA_138.1_SSURef_NR99.accessions.ntree.gz"
    # Metadata
    "Exports/full_metadata/SILVA_138.1_SSURef_Nr99.full_metadata.gz"
)
# -nc skips files that are already present in ${silva_dir}
for silva_file in "${silva_files[@]}"; do
    wget -nc -P "${silva_dir}" "${silva_release}/${silva_file}"
done
# Unpack every archive; -k keeps the .gz next to the decompressed file
find "${silva_dir}" -type f -name '*.gz' -exec unpigz -k {} \;
```

### RNAcentral
Downloaded on 2022.11.07
```sh
# Destination directory (spelling matches the directory created in "Environment settings")
rnacentral_dir="../../data/rnacental"
# NOTE(review): "current_release" is a moving target, so a later download may
# differ from the files originally used — confirm whether a fixed release should be pinned.
rnacentral_url="https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release"
# ID mapping table first, then the active sequences; -nc skips existing files
for rnacentral_file in \
    "id_mapping/id_mapping.tsv.gz" \
    "sequences/rnacentral_active.fasta.gz"; do
    wget -nc -P "${rnacentral_dir}" "${rnacentral_url}/${rnacentral_file}"
done
# Unpack every archive; -k keeps the .gz next to the decompressed file
find "${rnacentral_dir}" -type f -name '*.gz' -exec unpigz -k {} \;
```

### Earth Microbiome Project (release 1)
```sh
emp_dir="../../data/emp"
# Mirror the release1 tree; wget's own messages go to ${emp_dir}/wget_logfile.
# The mirrored files are read back from ${emp_dir}/ftp.microbio.me/emp/release1 below.
wget --output-file="${emp_dir}/wget_logfile" -P "${emp_dir}" --mirror --no-parent ftp://ftp.microbio.me/emp/release1
emp_release="${emp_dir}/ftp.microbio.me/emp/release1"
# Convert the two Deblur OTU tables used in the following analysis from BIOM to TSV
for emp_table in \
    "emp_deblur_150bp.subset_2k.rare_5000" \
    "emp_deblur_150bp.qc_filtered.rare_5000"; do
    biom convert -i "${emp_release}/otu_tables/deblur/${emp_table}.biom" -o "${emp_dir}/${emp_table}.tsv" --to-tsv
done
# Copy the sample mapping file for the 2k subset next to the converted tables
cp "${emp_release}/mapping_files/emp_qiime_mapping_subset_2k.tsv" "${emp_dir}/"
```

### Diabimmune samples
```sh
t1d_dir="../../data/t1d_samples"
# Extract the file URLs from the data index page.
# The lookbehind keeps only the text following href='; the dots are escaped so
# that only links ending in a literal ".fna.gz" are captured.
wget -q -O - https://diabimmune.broadinstitute.org/diabimmune/data/9/ \
    | grep -oP "(?<=href=')[^']*\.fna\.gz" > "${t1d_dir}/t1d_file_list.txt"
# Download the listed FASTA (.fna.gz) files, waiting 10 s between requests;
# wget's own messages go to ${t1d_dir}/wget_log.
# NOTE(review): --no-check-certificate disables TLS certificate verification —
# confirm it is still required for this host and drop it if not.
wget --no-check-certificate --directory-prefix="${t1d_dir}" --input-file="${t1d_dir}/t1d_file_list.txt" --output-file="${t1d_dir}/wget_log" --wait=10
```