## Convert Qiime2 output
Script to take the Qiime2 output csv data and read it so that it ends up in the form of:
- Rows: Sequences, identified by an ID (called Feature ID or index depending on input)
- Columns: One column for each taxonomic level ('Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'), and possibly one with the assignment confidence (depending on input)

The script contains different parts depending on the input file, but might have to be adapted to fit the specific data at hand

In [1]:
# TO BE SPECIFIED
# Input and output
# qiime_taxonomy: path of the Qiime2 taxonomy table that the cells below load with pd.read_csv
# output_taxonomy: path the cleaned-up table is written to (as csv) by the last cell
qiime_taxonomy = '/Users/claranordquist/Documents/Universitetet/HT24/Tillämpad_bioinformatik/Applied-bioinformatics/05_Plot_the_results/01_Data/Short_classifier/Fasta_short_reads.tsv'
output_taxonomy = '/Users/claranordquist/Documents/Universitetet/HT24/Tillämpad_bioinformatik/Applied-bioinformatics/05_Plot_the_results/01_Data/Short_classifier/Short_classifier_fasta_short_names_fixed.csv'

In [2]:
# Import packages
import pandas as pd
import numpy as np
import regex as re

## If the taxonomy has been made for multiple sequences at a time
For example Illumina runs, where each classification has been done on all levels. All samples thus have a classification at all levels, and there is no Confidence value

In [24]:
# Read the input data
# Might need skiprows=1, or to define the header
qiime_read = pd.read_csv(qiime_taxonomy, sep=',')
taxonomic_levels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

# Set the index column, split the taxonomy column into the different levels
# (the split must give exactly one piece per entry in taxonomic_levels)
qiime_read.set_index('index', inplace=True)
qiime_read[taxonomic_levels] = qiime_read['Taxonomy'].str.split(';', expand=True)

# If there has been a classification at species level, uncomment this part
# qiime_read[['Genus1', 'Species']] = qiime_read['Species1'].str.split(' ', expand=True)
# qiime_read.drop(['Genus1', 'Species1'], axis=1, inplace=True)

# Drop the large column with the whole taxonomy
qiime_read.drop(['Taxonomy'], axis=1, inplace=True)

# Replace empty cells at the 'Kingdom' level with 'Unassigned'
# Assign the result back instead of calling an inplace method on the selected
# column: the chained inplace form does not update the dataframe when pandas
# Copy-on-Write is active, and the np.NaN alias no longer exists in NumPy 2
qiime_read['Kingdom'] = qiime_read['Kingdom'].fillna('Unassigned')

## If the taxonomy has been made for one sequence at a time
For example FASTA runs, where each sequence has been classified separately and hence doesn't necessarily contain all levels. Often has the classification confidence included as well.

In [20]:
# Load the raw table, naming the columns ourselves and skipping the first line
# (depending on the file, skiprows=1 and header=None might be needed)
columns = [
    'Feature ID',
    'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus',
    'Genus1', 'Species',
    'Confidence',
]
qiime_read = pd.read_csv(
    qiime_taxonomy,
    names=columns,
    skiprows=1,
    sep=' ',
)
# qiime_read = pd.read_csv(qiime_taxonomy, sep=' ', header=0)

In [21]:
# If the taxonomy classification has columns separated by ' ', run this part
# Define the columns
# qiime_read.columns = ['Feature ID', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Genus1', 'Species', 'Confidence']
qiime_read.set_index('Feature ID', inplace=True)


def _is_misplaced_confidence(value):
    '''Tell whether a cell of a taxonomy column actually holds the confidence

    A taxonomy cell is either missing, a string with a rank prefix (d__, c__, s__ etc)
    or a plain species epithet. Anything numeric is therefore a confidence value that
    has ended up in the wrong column.
    Returns True for such values and False otherwise'''
    if isinstance(value, str):
        # Rank-prefixed names and ordinary words are real taxonomy
        if '__' in value or not value[:1].isdigit():
            return False
        # Accept any number (0.98 as well as 1.0), not only strings starting with 0
        try:
            float(value)
        except ValueError:
            return False
        return True
    # A column that only contains numbers is parsed as numeric by pandas,
    # so the confidence can also show up as a real number
    if isinstance(value, bool):
        return False
    return isinstance(value, (int, float, np.number)) and not pd.isna(value)


# Go through the taxonomy columns to see if the confidence level has ended up in the wrong column
# If so, move it to the correct column
# (columns[1:-1] = everything between 'Kingdom' and 'Confidence')
for column in qiime_read.columns[1:-1]:
    misplaced = qiime_read[column].map(_is_misplaced_confidence).astype(bool)
    if misplaced.any():
        # Boolean masks and to_numpy() keep this positional, so it also works
        # when the same Feature ID occurs more than once
        qiime_read.loc[misplaced, 'Confidence'] = qiime_read.loc[misplaced, column].astype(float).to_numpy()
        qiime_read.loc[misplaced, column] = np.nan

# Remove any ';' and extra spaces in the strings (missing values are left as they are)
for column in qiime_read.columns[:-1]:
    qiime_read[column] = qiime_read[column].map(
        lambda value: value.replace(';', '').strip() if isinstance(value, str) else value
    )

# Convert the confidence value to a float
qiime_read['Confidence'] = qiime_read['Confidence'].apply(float)

In [19]:
# If the taxonomy classification has the taxonomy separated by ';', run this part
# Define the columns (might need to be adapted, depending on if there is species classification or not because that might
# give an extra genus column that has to be dropped later on)
qiime_read.columns = ['Feature ID', 'Taxonomy', 'Confidence']
taxonomic_levels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
qiime_read.set_index('Feature ID', inplace=True)

# Split the large taxonomy to each level and drop it
# The split only gives as many columns as the deepest classification in the file,
# so pad with empty columns up to 'Species' before assigning
split_levels = qiime_read['Taxonomy'].str.split(';', expand=True)
if split_levels.shape[1] > len(taxonomic_levels):
    raise ValueError(
        f'The taxonomy contains {split_levels.shape[1]} levels, '
        f'but only {len(taxonomic_levels)} are defined in taxonomic_levels'
    )
split_levels = split_levels.reindex(columns=range(len(taxonomic_levels)))
qiime_read[taxonomic_levels] = split_levels
qiime_read.drop(['Taxonomy'], axis=1, inplace=True)

# Put the confidence last, so the layout is the same as for the ' ' separated input
# (taxonomic levels first, 'Confidence' as the final column)
qiime_read = qiime_read[taxonomic_levels + ['Confidence']]

# Convert the confidence value to a float
# qiime_read['Confidence'] = qiime_read['Confidence'].apply(lambda x: float(x))

## Run this in either case

In [22]:
# Delete the taxonomical levels from the names (k__Bacteria --> Bacteria)
def delete_prefix(name):
    '''Deletes the taxonomical prefix in classifications
    k__Bacteria --> Bacteria

    Only a prefix at the start of the name (one character followed by __) is removed,
    so a __ further into the name is kept. Leading whitespace is left as it is.
    Returns the name without prefix, or None if name is not a string (ie a missing value)'''
    if not isinstance(name, str):
        return None
    # Anchored at the start: the leading whitespace (group 1) is put back,
    # only the rank letter and the two underscores are dropped
    return re.sub(r'^(\s*).__', r'\1', name)

# Take away things following _ (Akkermansia muciniphila_D_776786 --> Akkermansia muciniphila)
def extract_name(name):
    '''Deletes the first _ and all characters following it
    Akkermansia muciniphila_D_776786 --> Akkermansia muciniphila

    Returns the shortened name, or None if name is not a string (ie a missing value)'''
    if not isinstance(name, str):
        return None
    return re.sub(r'_.*', '', name)

In [23]:
# Make a copy not to mess with the original datasets
a = qiime_read.copy(deep=True)

# Clean the names in every column except the confidence
# The columns are picked by name and not by position, because 'Confidence' is not
# always the last column (and is missing altogether for some inputs); skipping
# "the last column" would then leave 'Species' uncleaned
name_columns = [column for column in a.columns if column != 'Confidence']

# Change the names one column at a time
# Only strings are changed, missing values and numbers are left untouched
for column in name_columns:
    a[column] = a[column].map(
        lambda value: extract_name(delete_prefix(value)) if isinstance(value, str) else value
    )

# Convert to an output csv file
a.to_csv(output_taxonomy)