# CS4220 Project 2 - Pathogen Detection


In [2]:
# import packages
import numpy as np
import pandas as pd
import timeit
import time
from sklearn import preprocessing
import csv
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl


In [None]:
# Load dictionary that maps k-mer to their corresponding index.
# A k-mer and its reverse complement are mapped to the same index.
# We use k=8 here (the 8-mer dictionary is loaded below).

import json

# Load the k-mer -> column-index lookup table from the 8-mer JSON file.
# sequence_to_kmer_profile() reads this module-level dictionary.
with open("./training_data/8-mers.json", 'r') as dict_file:
    canonical_kmer_dict = json.load(dict_file)

In [None]:
# We define a utility function here that turns sequences to their 8-mer profiles.

def sequence_to_kmer_profile(sequence : str, k : int = 8, kmer_dict=None):
    """
    Return the normalised k-mer profile of the input sequence (string).

    Args:
        - sequence (`str`): the read to profile.
        - k (`int`): length of the k-mers to count. It should match the length of the
                     keys in the lookup table, otherwise every window is counted as unknown.
        - kmer_dict (`dict` or `None`): mapping from k-mer to column index. When `None`,
                                        the module-level `canonical_kmer_dict` is used.

    Returns:
        A 1-D float `np.ndarray` with one entry per distinct index in the lookup table.
        Windows that are not in the table are counted in the last entry. The vector sums
        to 1, except when the sequence has no window of length k, in which case it is all
        zeros (instead of NaN from a 0/0 division).
    """
    if kmer_dict is None:
        kmer_dict = canonical_kmer_dict

    res = np.zeros(len(set(kmer_dict.values())))

    for i in range(len(sequence) - k + 1):
        index = kmer_dict.get(sequence[i:i + k])
        if index is None:
            # Unknown k-mer (e.g. contains an ambiguous base): count it in the last column.
            res[-1] += 1
        else:
            res[index] += 1

    total = res.sum()
    if total > 0:
        # Only normalise when at least one window was counted; avoids 0/0 -> NaN.
        res /= total
    return res

In [None]:
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

class CS4220Dataset(Dataset):
    def __init__(self, data_file, label_df=None, k=6, samples_index=None, kmer_profile_on_the_fly=False, dtype=np.float32):
        """
        Dataset class to load large CS4220 sequence database.

        Args:
            - data_file (`str`): Can either be a *.fasta / *.fa / *.fna file if the input is raw reads,
                                 or a *.npy file if the input is k-mer profiles. The type is decided
                                 from the file name suffix.
            - label_df (`pd.DataFrame` or `None`): A dataframe with "labels" column indicating the label
                                                   of the data (must match with data_file), or `None` if there is
                                                   no label (in the case of test sets).
            - k (`int`): The length of k-mer. It is forwarded to `sequence_to_kmer_profile`, so it must
                         match the k-mer dictionary that is loaded (this notebook passes k=8).
            - samples_index (`List` or `None`): list of indices of data we sample from the data file. You
                                                can use this if the dataset is very large and can't fit in memory.
                                                set this to `None` if you want to use all the data.
            - kmer_profile_on_the_fly (`bool`): If input data_file is raw reads and this set to `True`,
                                                we will build k-mer profile on the fly. This is helpful if you want to
                                                alter the input sequences during training, or the k-mer profile can't fit in memory.
                                                Otherwise, we build k-mer profile in advance, which will speed up the
                                                training process. Ignored for *.npy input, which already holds profiles.
            - dtype: NumPy type to store the k-mer profile built from raw reads, for example `np.float32`
                     for better precision or `np.float16` for smaller memory usage. Data loaded from a
                     ".npy" file keeps whatever dtype is stored in the file.

        Raises:
            TypeError: if data_file has none of the supported suffixes.
        """
        self.data_file = data_file

        # Decide the input type from the suffix only, so that a directory name such as
        # "run.fa_dir/" cannot make a .npy file look like raw reads.
        file_name = str(data_file).lower()
        if file_name.endswith((".fasta", ".fa", ".fna")):
            self.is_raw_reads = True
        elif file_name.endswith(".npy"):
            self.is_raw_reads = False
        else:
            raise TypeError("The input file must be either a fasta file containing raw reads (.fasta, .fa, .fna) or a numpy file containing k-mer profiles (.npy).")

        self.label_df = label_df
        # Profiling on the fly only makes sense when we actually hold raw reads.
        self.kmer_profile_otf = bool(kmer_profile_on_the_fly) and self.is_raw_reads

        # k-mer length, forwarded to sequence_to_kmer_profile.
        self.k = k

        # the samples we take from the read dataset
        self.samples_index = samples_index

        self.dtype = dtype

        # Load the data and store in self.X and self.Y
        self.X = []
        self.Y = []
        self._read_labels()
        self._read_data()


    def _read_labels(self):
        """
        Read the labels and record them in self.Y (`None` when there is no label dataframe).
        """
        if self.label_df is None:
            self.Y = None
        elif self.samples_index is None:
            # Load the whole dataset
            self.Y = list(self.label_df["labels"])
        else:
            # Load only the data corresponding to the sampled index
            self.Y = list(self.label_df.iloc[self.samples_index]["labels"])


    def _read_data(self):
        """
        Fill self.X with raw reads (on-the-fly mode), with k-mer profiles computed from the
        fasta file, or with the matrix stored in the .npy file.
        """
        if self.is_raw_reads:
            # Read the fasta file
            with open(self.data_file, 'r') as fasta_file:
                lines = fasta_file.readlines()

            # The parser takes line 2*j + 1 as the sequence of record j, i.e. it expects
            # each record to be one header line followed by one sequence line.
            read_range = self.samples_index if self.samples_index is not None else range(len(lines) // 2)
            if not self.kmer_profile_otf:
                self.X = np.zeros(
                    (len(read_range), len(set(canonical_kmer_dict.values()))),
                    dtype=self.dtype
                )

            for i, j in enumerate(tqdm(read_range, desc=f"Parsing fasta file {self.data_file}")):
                read = lines[j * 2 + 1].strip()
                if self.kmer_profile_otf:
                    # Profiling is deferred to __getitem__; simply store the read.
                    self.X.append(read)
                else:
                    # Build the k-mer profile now, so that __getitem__ is a plain row lookup.
                    self.X[i, :] = sequence_to_kmer_profile(read, self.k)
        else:
            # Read the .npy file, and load the numpy matrix
            # Each row corresponds to a read, and each column corresponds to a k-mer.
            self.X = np.load(self.data_file)
            if self.samples_index is not None:
                self.X = self.X[self.samples_index, :]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """
        If you are using pytorch, this function helps taking data points during each epoch
        of your training.

        Returns a `(read_tensor, label)` pair; `label` is `None` when no labels were given.
        """
        x = self.X[idx]
        if self.kmer_profile_otf:
            # self.dtype is a NumPy dtype, which torch.tensor(dtype=...) rejects, so cast
            # in NumPy first and wrap the resulting array.
            profile = sequence_to_kmer_profile(x, self.k)
            read_tensor = torch.from_numpy(np.asarray(profile, dtype=self.dtype))
        else:
            read_tensor = torch.tensor(x)

        label = self.Y[idx] if self.Y is not None else None
        return read_tensor, label


#INSERT PATIENT INPUT PATH HERE
input_file_path = './training_data/patient[i].npy'

# Rows of the input file to load. `None` loads every read; set it to a list of row
# indices to subsample. (It was previously used without being defined.)
samples_index = None

# NOTE(review): these are the *training* labels; they are presumably not aligned with the
# rows of a patient file. Pass `None` instead if no per-read labels exist - TODO confirm.
label_df = pd.read_csv('./training_data/train_labels.csv')
dataset = CS4220Dataset(input_file_path, label_df, k=8, samples_index=samples_index)

## Loading and Assigning Groups
First, we create labels using the species groups determined by the Jaccard index.

In [3]:
# Read the species groups: each CSV row lists the species belonging to one group.
# newline="" lets the csv module do its own newline handling, as its documentation requires.
with open("groups"+str(8)+".csv", "r", newline="") as file:
    reader = csv.reader(file)
    groups_loaded = list(reader)

groups_loaded

[['corynebacterium_diphtheriae',
  'corynebacterium_striatum',
  'corynebacterium_ulcerans'],
 ['acinetobacter_baumannii',
  'enterococcus_faecium',
  'legionella_pneumophila',
  'listeria_monocytogenes',
  'staphylococcus_aureus',
  'staphylococcus_epidermidis',
  'staphylococcus_haemolyticus',
  'staphylococcus_pseudintermedius',
  'staphylococcus_pyogenes',
  'streptococcus_agalactiae',
  'streptococcus_anginosus',
  'streptococcus_mitis',
  'streptococcus_pneumoniae',
  'streptococcus_salivarius',
  'streptococcus_suis'],
 ['mycobacterium_tuberculosis',
  'mycobacterium_ulcerans',
  'pseudomonas_aeruginosa',
  'stenotrophomonas_maltophilia'],
 ['escherichia_coli',
  'klebsiella_michiganensis',
  'klebsiella_pneumoniae',
  'salmonella_enterica_typhimurium',
  'serratia_liquefaciens',
  'vibrio_parahaemolyticus',
  'yersinia_enterocolitica'],
 ['neisseria_gonorrhoeae', 'neisseria_lactamica']]

In [4]:
#make labels
# Load the training labels and show how many reads each species has.
label_df = pd.read_csv('./training_data/train_labels.csv')
label_df['species_name'].value_counts()

species_name
homo_sapiens                       400027
burkholderia_pseudomallei            3540
pseudomonas_aeruginosa               3111
mycobacterium_ulcerans               3024
klebsiella_michiganensis             3010
bacillus_cereus                      2889
klebsiella_pneumoniae                2799
escherichia_coli                     2760
serratia_liquefaciens                2685
vibrio_parahaemolyticus              2572
stenotrophomonas_maltophilia         2524
salmonella_enterica_typhimurium      2491
yersinia_enterocolitica              2271
mycobacterium_tuberculosis           2173
clostridioides_difficile             2020
acinetobacter_baumannii              2005
legionella_pneumophila               1674
listeria_monocytogenes               1487
enterococcus_faecium                 1467
corynebacterium_striatum             1437
staphylococcus_aureus                1392
staphylococcus_haemolyticus          1298
staphylococcus_pseudintermedius      1254
corynebacterium_dipht

In [5]:
def assign_group(species, species_list, group_id):
    """Return `group_id` when `species` is a member of `species_list`, else `species` unchanged."""
    return group_id if species in species_list else species

#TO APPEND TO
label_groups = {}
master_labels = label_df.copy()
grouped_labels = label_df.copy()

# Group number -> species in that group. Groups 1-5 are the bacterial groups,
# group 6 contains only the human reads.
species_to_keep = {
    #FOR GROUP 1
    1: [
        'corynebacterium_diphtheriae',
        'corynebacterium_striatum',
        'corynebacterium_ulcerans',
    ],
    #FOR GROUP 2
    2: [
        'acinetobacter_baumannii',
        'enterococcus_faecium',
        'legionella_pneumophila',
        'listeria_monocytogenes',
        'staphylococcus_aureus',
        'staphylococcus_epidermidis',
        'staphylococcus_haemolyticus',
        'staphylococcus_pseudintermedius',
        'staphylococcus_pyogenes',
        'streptococcus_agalactiae',
        'streptococcus_anginosus',
        'streptococcus_mitis',
        'streptococcus_pneumoniae',
        'streptococcus_salivarius',
        'streptococcus_suis',
    ],
    #FOR GROUP 3
    3: [
        'mycobacterium_tuberculosis',
        'mycobacterium_ulcerans',
        'pseudomonas_aeruginosa',
        'stenotrophomonas_maltophilia',
    ],
    #FOR GROUP 4
    4: [
        'escherichia_coli',
        'klebsiella_michiganensis',
        'klebsiella_pneumoniae',
        'salmonella_enterica_typhimurium',
        'serratia_liquefaciens',
        'vibrio_parahaemolyticus',
        'yersinia_enterocolitica',
    ],
    #FOR GROUP 5
    5: ['neisseria_gonorrhoeae', 'neisseria_lactamica'],
    6: ['homo_sapiens'],
}

#REPLACING OUTGROUP SPECIES W 'Other'
for i in range(1, 7):
    members = species_to_keep[i]

    # Per-group frame: members keep their species name, everything else becomes 'Other'.
    per_group = label_df.copy()
    is_member = per_group['species_name'].isin(members)
    per_group['species_name'] = per_group['species_name'].where(is_member, 'Other')
    label_groups[i] = per_group

    # First-layer frame: members of group i are relabelled with the group number (as a string).
    in_this_group = grouped_labels['species_name'].isin(members)
    grouped_labels['species_name'] = grouped_labels['species_name'].mask(in_this_group, str(i))


In [6]:
def _encode_species(frame):
    """Fit a LabelEncoder on frame['species_name'], store the codes in frame['labels'], return the encoder."""
    encoder = preprocessing.LabelEncoder()
    encoder.fit(frame['species_name'].unique())
    frame['labels'] = encoder.transform(frame['species_name'].values)
    return encoder


#HUMAN
lehumans = _encode_species(label_groups[6])

#GROUPED, FIRST LAYER
le = _encode_species(grouped_labels)

#MASTER GROUPS
masle = _encode_species(master_labels)

#GROUPS 1-5, fitted in order
le1, le2, le3, le4, le5 = (_encode_species(label_groups[group]) for group in range(1, 6))

# Keep the module-level name the step-by-step version left behind (codes of group 5).
y_index = label_groups[5]['labels'].values

# MODELS FOR GROUP CLASSIFICATION
The models have already been trained, so here we only load them.

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [9]:
# Names of all genomes flagged as pathogens in the species table.
species_df = pd.read_csv('./all_species.csv')
is_pathogen = species_df['type'] == 'pathogen'
pathogen_list = species_df.loc[is_pathogen, 'genome_name'].tolist()

In [10]:
#loading models
from joblib import dump, load

# Pre-trained classifiers. joblib.load unpickles the file, so only load model files you trust.
rf = load('models/rf_humans.joblib')
rf_nonhumans_groups = load('models/rf_nonhumans_into_groups.joblib')

# One species-level classifier per group, keyed by group number 1-5.
rf_only = {group: load(f'models/rf_group{group}.joblib') for group in range(1, 6)}

In [12]:
#PREDICTIONS, FIRST LAYER - human vs nonhuman
def human_predict():
    """
    First layer: classify every read with the human-vs-non-human model.

    Runs the module-level `rf` classifier on feature columns 0..32896 of the module-level
    `df_test_df` and stores the predicted class in its 'human_predictions' column.
    The features are taken from `df_test_df` itself (the same columns `grouping()` uses)
    rather than from a separate global, and are selected explicitly so that prediction
    columns added to the frame are never fed back into the model.
    """
    feature_cols = list(range(0, 32897))
    df_test_df['human_predictions'] = rf.predict(df_test_df[feature_cols])

In [11]:
#PREDICTIONS, SECOND LAYER - genera and groupings
def grouping(threshold = 0.7):
    """
    Second layer: assign every read predicted as non-human to a group or species.

    Reads the module-level `df_test_df`, `rf_nonhumans_groups` and `le`. For each row whose
    'human_predictions' is 0, the most probable class is written to the
    'grouped_predictions' column; rows whose best probability is below `threshold` are
    written as 'homo_sapiens' instead.

    Args:
        threshold (float): minimum class probability required to accept a prediction.
    """
    subset_mask = df_test_df['human_predictions'] == 0
    if not subset_mask.any():
        # Nothing to classify (the model cannot predict on zero rows); still make sure
        # the output column exists for the code that reads it afterwards.
        if 'grouped_predictions' not in df_test_df.columns:
            df_test_df['grouped_predictions'] = np.nan
        return

    feature_cols = list(range(0, 32897))
    subset = df_test_df.loc[subset_mask, feature_cols]

    # predict
    y_groups_predprob = rf_nonhumans_groups.predict_proba(subset)

    # The columns of predict_proba follow rf_nonhumans_groups.classes_, so translate the
    # winning column into the encoded label before decoding it with the label encoder.
    best_column = np.argmax(y_groups_predprob, axis=1)
    best_name = le.inverse_transform(rf_nonhumans_groups.classes_[best_column])
    confident = np.max(y_groups_predprob, axis=1) >= threshold

    grouped_predictions = [
        name if is_confident else 'homo_sapiens'
        for name, is_confident in zip(best_name, confident)
    ]

    # replace the original 'grouped_predictions' values in the subset
    df_test_df.loc[subset_mask, 'grouped_predictions'] = grouped_predictions

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
def jaccard_index_per_patient(patient_id, preds):
    """
    Score the predicted pathogens of one patient against the ground truth.

    The truth is read from the 'labels' column of 'test_data/patient<patient_id>_labels.csv'.
    The score is (distinct predictions found in the truth) divided by
    (number of truth rows + distinct predictions not found in the truth), so missing a
    true label and predicting a wrong one are both penalised.

    An empty prediction list and an empty truth file are each treated as the single
    label 'NONE', so predicting nothing for a decoy-only patient scores 1.
    """
    truth_path = 'test_data/patient{}_labels.csv'.format(patient_id)
    true_labels = list(pd.read_csv(truth_path)['labels'].values)

    n_true = len(true_labels)
    if n_true == 0:
        true_labels, n_true = ['NONE'], 1
    if len(preds) == 0:
        preds = ['NONE']

    distinct_preds = np.unique(preds)
    hits = sum(1 for item in distinct_preds if item in true_labels)
    misses = len(distinct_preds) - hits
    return hits / (n_true + misses)

In [30]:
#initialize
from collections import Counter
from pathlib import Path

# Index 0 is the group-level model/encoder; indices 1-5 are the per-group ones.
rf_list = [rf_nonhumans_groups, rf_only[1], rf_only[2], rf_only[3], rf_only[4], rf_only[5]]
le_list = [le, le1, le2, le3, le4, le5]
all_jaccard_index = []
results = []

# A read keeps its species call only if the model is at least this confident ...
SPECIES_PROB_THRESHOLD = 0.4
# ... and a species is reported only if at least this many reads of the group were assigned to it.
MIN_READS_PER_SPECIES = 6
feature_cols = list(range(0, 32897))

# Build the frame from the k-mer profile matrix held by the dataset (one row per read,
# one integer-named column per feature); a Dataset object itself is not tabular data.
# The raw matrix is also kept under the module-level name `df_test`.
df_test = dataset.X
df_test_df = pd.DataFrame(df_test)

#predict
human_predict()
grouping()

df_test_for_loop = df_test_df.copy()

#loops over to get predictions for groups 1-5 and replaces them in the df
for i in range(1, 6):
    rf_now = rf_list[i]
    le_now = le_list[i]

    # subset for each group i
    subset_mask = df_test_df['grouped_predictions'] == str(i)
    if not subset_mask.any():
        continue
    subset = df_test_df.loc[subset_mask, feature_cols]

    # predict
    y_predprob = rf_now.predict_proba(subset)

    # predict_proba columns follow rf_now.classes_, so map the winning column to its
    # encoded label before decoding it with the group's label encoder.
    best_column = np.argmax(y_predprob, axis=1)
    best_species = le_now.inverse_transform(rf_now.classes_[best_column])
    confident = np.max(y_predprob, axis=1) >= SPECIES_PROB_THRESHOLD
    y_preds_mapped = [
        species if is_confident else 'homo_sapiens'
        for species, is_confident in zip(best_species, confident)
    ]

    #count occurrences of each label
    counts = Counter(y_preds_mapped)

    #replace labels supported by too few reads with 'homo_sapiens'
    y_preds_mapped = [
        label if counts[label] >= MIN_READS_PER_SPECIES else 'homo_sapiens'
        for label in y_preds_mapped
    ]
    # replace the original 'grouped_predictions' values in the subset
    df_test_for_loop.loc[subset_mask, 'grouped_predictions'] = y_preds_mapped


#get unique only and keep the ones that are known pathogens
known_pathogens = set(pathogen_list)
unique_predicts = [
    item for item in df_test_for_loop["grouped_predictions"].unique()
    if item in known_pathogens
]

# Write one predicted pathogen per row, next to the notebook, named after the input file
# (e.g. patientX.npy -> patientX.csv). A plain list has no .to_csv, so wrap it in a Series.
# NOTE(review): the column name 'predictions' and index=False are assumptions - confirm
# the expected submission format.
output_file_path = Path(input_file_path).with_suffix('.csv').name
pd.Series(unique_predicts, name='predictions', dtype=object).to_csv(output_file_path, index=False)

In [33]:
# Display the collected `results` records as a table (notebook cell output).
pd.DataFrame(results)

Unnamed: 0,patient_id,ground_truth,predictions,jaccard_index
0,0,[neisseria_gonorrhoeae],[neisseria_gonorrhoeae],1.0
1,1,[corynebacterium_ulcerans],[],0.0
2,2,[staphylococcus_aureus],[],0.0
3,3,[streptococcus_pneumoniae],[streptococcus_pneumoniae],1.0
4,4,"[salmonella_enterica_typhimurium, clostridioid...","[klebsiella_pneumoniae, campylobacter_jejuni]",0.0
5,5,"[mycobacterium_tuberculosis, pseudomonas_aerug...","[mycobacterium_ulcerans, neisseria_gonorrhoeae...",0.2
6,6,[klebsiella_pneumoniae],"[klebsiella_michiganensis, streptococcus_agala...",0.0
7,7,"[staphylococcus_pyogenes, corynebacterium_diph...",[clostridioides_difficile],0.0
8,8,"[burkholderia_pseudomallei, listeria_monocytog...",[streptococcus_pneumoniae],0.0
9,9,[],[],1.0


In [34]:
# Convert the collected `results` records to a DataFrame and save them without the index column.
results_df = pd.DataFrame(results)

results_df.to_csv('JI_results_forRF.csv', index=False)  


In [35]:
# Mean of the 'jaccard_index' column over all rows of results_df.
results_df["jaccard_index"].mean()

0.32