# Microbiome Relative Abundance Data Cleanup
### April 4, 2024

$\color{red}{\text{ONLY RUN THIS CODE ONCE! It permanently edits the data files.}}$

All data were sourced from the [GMRepo](https://gmrepo.humangut.info/home) database.

Let's start by importing relevant packages.

In [1]:
# Relevant string operations
import string

# Data handling
import glob
import os
import pandas as pd

We will use the `glob` module, which conveniently collects every file in a directory that shares a given extension.

In [3]:
# Specify directories holding data
data_dir = ['../Data/ASD', '../Data/AD', '../Data/PD', '../Data/MS', '../Data/Epilepsy']

# Initialize DataFrame for concatenation steps
df = pd.DataFrame()

# Iterate through each of the directories
for direc in data_dir:
    
    # Glob string for all .txt files, which have relative abundance data
    file_glob = os.path.join(direc, '*.txt')
    
    # Get list of files in direc
    file_list = glob.glob(file_glob)

    # Iterate through all files in list
    for file in file_list:

        # Initialize skip_row_id to be sufficiently large before the loop starts
        skip_row_id = len(file)

        # List to store modified lines
        modified_lines = []
        
        # Iterate through all lines in file
        with open(file, 'r') as f:
            # Find where relative abundance data starts
            for row_number, line in enumerate(f, start=1):
                if "ncbi_taxon_id" in line:
                    skip_row_id = row_number - 1
        
                # Ensure column entries are one "word"
                if row_number > skip_row_id:  # Ensure lines before are not edited
                    modified_line = ''
                    prev_char = None
                    for char in line:
                        if char == ' ' and prev_char in string.ascii_lowercase + '.':
                            modified_line += '_'
                        else:
                            modified_line += char
                        prev_char = char
                    modified_lines.append(modified_line)
        
        # Write modified lines back to file
        with open(file, 'w') as f:
            f.writelines(modified_lines)

        # Create DataFrame from file, removing filler info
        data = pd.read_csv(file, sep='\s+')

        # Reshape data
        data = data[["relative_abundance", "scientific_name"]].pivot(columns="scientific_name")
        data.columns = data.columns.get_level_values(1)

        # Add label
        data["Diagnosis"] = [direc[direc.rfind('/') + 1:]] * len(data)

        # Update DataFrame
        df = pd.concat([df, data], ignore_index=True).fillna(0)

# Check out df
df.head()

scientific_name,Akkermansia,Bacteroides,Bifidobacterium,Butyricicoccus,Clostridium,Collinsella,Coprococcus,Enterococcus,Faecalibacterium,Gemmiger,...,Promicromonospora,Waddlia,Cellulosimicrobium,Pleomorphomonas,Nitriliruptor,Robiginitalea,Dactylosporangium,Thermomonospora,Salimicrobium,Inquilinus
0,0.0,0.0,4.52788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,2.09829,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,18.3876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193263,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Finally, let's export the data as a CSV!

In [4]:
# Save the combined table as CSV, leaving out the row index
cleaned_csv_path = "../Data/20440_cleaned_data.csv"
df.to_csv(cleaned_csv_path, index=False)