# CS4220 Project 2 - Pathogen Detection

In this notebook, we give an example to show how to read and use the DNA read data. We will train one base line model, using `sklearn.LogisticRegression`, and use it to predict the pathogens in each patient's dataset.

## Related python packages

To get started (if you are using python), you may want to create a virtual python environment and install some packages. Here are some of the commands you might need:

```bash
conda create --name cs4220 python=3.8

# Activate the created virtual environment
conda activate cs4220

# Install jupyter notebook if you are using it
conda install -c anaconda ipykernel
python -m ipykernel install --user --name=cs4220
conda install -c anaconda jupyter

# Some common packages
conda install pandas                      # for reading csv
conda install scikit-learn                # for the logistic regression model
pip install torch                         # if you are using neural networks
conda install -c conda-forge matplotlib   # for plotting
conda install seaborn                     # also for plotting
pip install umap-learn[plot]              # plotting UMAP plots
conda install numpy                       # for many math/vectorized operations
```

In [2]:
# import packages
import numpy as np
import pandas as pd
import timeit
import time
from sklearn import preprocessing
import csv
from joblib import dump, load

## Load Groups and Assigning Groups
First, we create labels with the groups decided by Jaccard index.

In [3]:
with open("groups"+str(8)+".csv", "r") as file:
    reader = csv.reader(file)
    groups_loaded = [row for row in reader]

groups_loaded

[['corynebacterium_diphtheriae',
  'corynebacterium_striatum',
  'corynebacterium_ulcerans'],
 ['acinetobacter_baumannii',
  'enterococcus_faecium',
  'legionella_pneumophila',
  'listeria_monocytogenes',
  'staphylococcus_aureus',
  'staphylococcus_epidermidis',
  'staphylococcus_haemolyticus',
  'staphylococcus_pseudintermedius',
  'staphylococcus_pyogenes',
  'streptococcus_agalactiae',
  'streptococcus_anginosus',
  'streptococcus_mitis',
  'streptococcus_pneumoniae',
  'streptococcus_salivarius',
  'streptococcus_suis'],
 ['mycobacterium_tuberculosis',
  'mycobacterium_ulcerans',
  'pseudomonas_aeruginosa',
  'stenotrophomonas_maltophilia'],
 ['escherichia_coli',
  'klebsiella_michiganensis',
  'klebsiella_pneumoniae',
  'salmonella_enterica_typhimurium',
  'serratia_liquefaciens',
  'vibrio_parahaemolyticus',
  'yersinia_enterocolitica'],
 ['neisseria_gonorrhoeae', 'neisseria_lactamica']]

In [4]:
#make labels
label_df = pd.read_csv('./training_data/train_labels.csv')
label_df['species_name'].value_counts()

species_name
homo_sapiens                       400027
burkholderia_pseudomallei            3540
pseudomonas_aeruginosa               3111
mycobacterium_ulcerans               3024
klebsiella_michiganensis             3010
bacillus_cereus                      2889
klebsiella_pneumoniae                2799
escherichia_coli                     2760
serratia_liquefaciens                2685
vibrio_parahaemolyticus              2572
stenotrophomonas_maltophilia         2524
salmonella_enterica_typhimurium      2491
yersinia_enterocolitica              2271
mycobacterium_tuberculosis           2173
clostridioides_difficile             2020
acinetobacter_baumannii              2005
legionella_pneumophila               1674
listeria_monocytogenes               1487
enterococcus_faecium                 1467
corynebacterium_striatum             1437
staphylococcus_aureus                1392
staphylococcus_haemolyticus          1298
staphylococcus_pseudintermedius      1254
corynebacterium_dipht

In [5]:
def assign_group(species, species_list, group_id):
    if species in species_list:
        return group_id
    return species

#TO APPEND TO
label_groups = {}
species_to_keep = {}
master_labels = label_df.copy()
grouped_labels = label_df.copy()

#FOR GROUP 1
species_to_keep[1] = ['corynebacterium_diphtheriae',
'corynebacterium_striatum',
'corynebacterium_ulcerans']

#FOR GROUP 2
species_to_keep[2] = ['acinetobacter_baumannii',
'enterococcus_faecium',
'legionella_pneumophila',
'listeria_monocytogenes',
'staphylococcus_aureus',
'staphylococcus_epidermidis',
'staphylococcus_haemolyticus',
'staphylococcus_pseudintermedius',
'staphylococcus_pyogenes',
'streptococcus_agalactiae',
'streptococcus_anginosus',
'streptococcus_mitis',
'streptococcus_pneumoniae',
'streptococcus_salivarius',
'streptococcus_suis']

#FOR GROUP 3
species_to_keep[3] = ['mycobacterium_tuberculosis',
  'mycobacterium_ulcerans',
  'pseudomonas_aeruginosa',
  'stenotrophomonas_maltophilia']

#FOR GROUP 4
species_to_keep[4] = ['escherichia_coli',
  'klebsiella_michiganensis',
  'klebsiella_pneumoniae',
  'salmonella_enterica_typhimurium',
  'serratia_liquefaciens',
  'vibrio_parahaemolyticus',
  'yersinia_enterocolitica']

#FOR GROUP 5
species_to_keep[5] = ['neisseria_gonorrhoeae', 'neisseria_lactamica']

species_to_keep[6] = ['homo_sapiens']

#REPLACING OUTGROUP SPECIES W 'Other'
for i in range(1,7):
    label_groups[i] = label_df.copy()

    #keep label if in species to keep
    label_groups[i]['species_name'] = label_groups[i]['species_name'].apply(lambda x: x if x in species_to_keep[i] else 'Other')

    #replace w group # if in species to keep 
    grouped_labels['species_name'] = grouped_labels['species_name'].apply(lambda x: assign_group(x, species_to_keep[i], str(i)))


In [6]:
#HUMAN
lehumans = preprocessing.LabelEncoder()
lehumans.fit(label_groups[6]['species_name'].unique())
y_index = lehumans.transform(label_groups[6]['species_name'].values)
label_groups[6]['labels'] = y_index

#GROUPED, FIRST LAYER
le = preprocessing.LabelEncoder()
le.fit(grouped_labels['species_name'].unique())
y_index = le.transform(grouped_labels['species_name'].values)
grouped_labels['labels'] = y_index

#MASTER GROUPS
masle = preprocessing.LabelEncoder()
masle.fit(master_labels['species_name'].unique())
y_index = masle.transform(master_labels['species_name'].values)
master_labels['labels'] = y_index

#GROUP 1
le1 = preprocessing.LabelEncoder()
le1.fit(label_groups[1]['species_name'].unique())
y_index = le1.transform(label_groups[1]['species_name'].values)
label_groups[1]['labels'] = y_index

#GROUP 2
le2 = preprocessing.LabelEncoder()
le2.fit(label_groups[2]['species_name'].unique())
y_index = le2.transform(label_groups[2]['species_name'].values)
label_groups[2]['labels'] = y_index

#GROUP 3
le3 = preprocessing.LabelEncoder()
le3.fit(label_groups[3]['species_name'].unique())
y_index = le3.transform(label_groups[3]['species_name'].values)
label_groups[3]['labels'] = y_index

#GROUP 4
le4 = preprocessing.LabelEncoder()
le4.fit(label_groups[4]['species_name'].unique())
y_index = le4.transform(label_groups[4]['species_name'].values)
label_groups[4]['labels'] = y_index

#GROUP 5
le5 = preprocessing.LabelEncoder()
le5.fit(label_groups[5]['species_name'].unique())
y_index = le5.transform(label_groups[5]['species_name'].values)
label_groups[5]['labels'] = y_index

In [7]:
#sampling for the first layer
samples_index = master_labels.groupby('labels').sample(800).index
samples_index

Index([   679,   1648,   1364,    264,    268,   1972,    953,    568,     16,
         1241,
       ...
       463455, 463727, 463679, 463368, 463276, 464341, 464776, 463826, 464696,
       463809],
      dtype='int64', length=29600)

# TRAINING MODEL FOR GROUP CLASSIFICATION 
Loading models that have already been trained

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [9]:
species_df = pd.read_csv('./all_species.csv')
pathogen_list = list(species_df[species_df['type'] == 'pathogen']['genome_name'])

In [10]:
#loading models
from joblib import dump, load

rf_only = {}
rf = load('models/rf_humans.joblib')
rf_nonhumans_groups = load('models/rf_nonhumans_into_groups.joblib')
rf_only[1] = load('models/rf_group1.joblib')
rf_only[2] = load('models/rf_group2.joblib')
rf_only[3] = load('models/rf_group3.joblib')
rf_only[4] = load('models/rf_group4.joblib')
rf_only[5] = load('models/rf_group5.joblib')

In [12]:
#PREDICTIONS, FIRST LAYER - human vs nonhuman
def human_predict():
    y_humans_pred = rf.predict(df_test)
    y_humans_predprob = rf.predict_proba(df_test)
    df_test_df['human_predictions'] = y_humans_pred

In [11]:
#PREDICTIONS, SECOND LAYER - genera and groupings
def grouping(threshold = 0.7):    
    subset_mask = df_test_df['human_predictions'] == 0
    feature_cols = list(range(0, 32897))
    subset = df_test_df.loc[subset_mask, feature_cols]

    # predict
    '''if len(subset) == 0:
        continue'''
    y_groups_pred = rf_nonhumans_groups.predict(subset)
    y_groups_predprob = rf_nonhumans_groups.predict_proba(subset)

    grouped_predictions = [
                le.inverse_transform([np.argmax(item)])[0] if np.max(item) >= threshold else 'homo_sapiens'
                for item in y_groups_predprob
            ]

    # replace the original 'grouped_predictions' values in the subset
    df_test_df.loc[subset_mask, 'grouped_predictions'] = grouped_predictions

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
def jaccard_index_per_patient(patient_id, preds):
    df_true = pd.read_csv('test_data/patient{}_labels.csv'.format(patient_id))
    tp, fp, tp_fn = 0, 0, df_true['labels'].shape[0]
    # print('my predition(s) for patient {}:'.format(patient_id))
    # print(preds)
    # print('true pathogen')
    # print(df_true['labels'].values)
    # if don't predict any pathogen, it means there is only decoy in the test dataset (your prediction)
    if len(preds) == 0:
        preds = ['NONE']
    if len(df_true['labels']) == 0:
        df_true['labels'] = ['NONE']
        tp_fn = 1

    for item in np.unique(preds):
        if item in df_true['labels'].values:
            tp += 1
        else:
            fp += 1
    #you have to predict all labels correctly, but you are penalized for any false positive
    return tp / (tp_fn + fp)

In [20]:
#initialize
rf_list = [rf_nonhumans_groups, rf_only[1], rf_only[2], rf_only[3], rf_only[4], rf_only[5]]
le_list = [le, le1, le2, le3, le4, le5]
all_jaccard_index = []
results = []
threshold = 0.7
thresh_list = [0.4, 0.6, 0.8]
occurence_list = [2, 4, 6]
from collections import Counter

#iterate through patients
for patient_id in range(10):
    with open('test_data/patient{}_8mers.npy'.format(patient_id), 'rb') as read_file:
        df_test = np.load(read_file)

        df_test_df = pd.DataFrame(df_test)

        #layers 1 and 2
        human_predict()
        grouping()

        df_test_for_loop = df_test_df.copy()

        #loops over to get predictions for groups 1-5 and replaces them in the df 
        for i in range(1, 6):
            rf_now = rf_list[i]
            le_now = le_list[i]
            
            # subset for each group i 
            subset_mask = df_test_df['grouped_predictions'] == str(i)
            feature_cols = list(range(0, 32897))
            subset = df_test_df.loc[subset_mask, feature_cols]

            # predict
            if len(subset) == 0:
                continue
            y_preds = rf_now.predict(subset)
            y_predprob = rf_now.predict_proba(subset)

            #test threshold 1
            for thresh in thresh_list:
                y_preds_mapped = [
                    le_now.inverse_transform([np.argmax(item)])[0] if np.max(item) >= thresh else 'homo_sapiens'
                    for item in y_predprob #take max if threshold is passed
                ]

                # count occurrences of each label
                counts = Counter(y_preds_mapped)

                for occurence in occurence_list:
                    # replace failed labels with 'homo_sapiens'
                    y_preds_mapped = [
                        label if counts[label] >=  occurence else 'homo_sapiens'
                        for label in y_preds_mapped
                    ]

                    #take unique predictions
                    unique_predicts = np.unique(y_preds_mapped)
                    unique_predicts = [item for item in unique_predicts if item in pathogen_list]
        
                    #append
                    ji = jaccard_index_per_patient(patient_id, unique_predicts)
                    all_jaccard_index.append(ji)

                    results.append({
                        'patient_id': patient_id,
                        'ground_truth': list(pd.read_csv('test_data/patient{}_labels.csv'.format(patient_id))['labels']),
                        'predictions': unique_predicts,
                        'jaccard_index': ji,
                        'threshold prob': thresh,
                        'threshold occurence': occurence
                    })

In [21]:
results

[{'patient_id': 0,
  'ground_truth': ['neisseria_gonorrhoeae'],
  'predictions': [],
  'jaccard_index': 0.0,
  'threshold prob': 0.4,
  'threshold occurence': 2},
 {'patient_id': 0,
  'ground_truth': ['neisseria_gonorrhoeae'],
  'predictions': [],
  'jaccard_index': 0.0,
  'threshold prob': 0.4,
  'threshold occurence': 4},
 {'patient_id': 0,
  'ground_truth': ['neisseria_gonorrhoeae'],
  'predictions': [],
  'jaccard_index': 0.0,
  'threshold prob': 0.4,
  'threshold occurence': 6},
 {'patient_id': 0,
  'ground_truth': ['neisseria_gonorrhoeae'],
  'predictions': [],
  'jaccard_index': 0.0,
  'threshold prob': 0.6,
  'threshold occurence': 2},
 {'patient_id': 0,
  'ground_truth': ['neisseria_gonorrhoeae'],
  'predictions': [],
  'jaccard_index': 0.0,
  'threshold prob': 0.6,
  'threshold occurence': 4},
 {'patient_id': 0,
  'ground_truth': ['neisseria_gonorrhoeae'],
  'predictions': [],
  'jaccard_index': 0.0,
  'threshold prob': 0.6,
  'threshold occurence': 6},
 {'patient_id': 0,
  '

In [22]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,patient_id,ground_truth,predictions,jaccard_index,threshold prob,threshold occurence
0,0,[neisseria_gonorrhoeae],[],0.0,0.4,2
1,0,[neisseria_gonorrhoeae],[],0.0,0.4,4
2,0,[neisseria_gonorrhoeae],[],0.0,0.4,6
3,0,[neisseria_gonorrhoeae],[],0.0,0.6,2
4,0,[neisseria_gonorrhoeae],[],0.0,0.6,4
...,...,...,...,...,...,...
157,9,[],[],1.0,0.6,4
158,9,[],[],1.0,0.6,6
159,9,[],[],1.0,0.8,2
160,9,[],[],1.0,0.8,4


In [23]:
results_df["jaccard_index"].mean()

0.16049382716049382

In [25]:
group_means = results_df.groupby(['threshold prob', 'threshold occurence'])['jaccard_index'].mean()

# best params w the maximum mean jaccard index
best_params = group_means.idxmax()
best_mean = group_means.max()


Grouped mean Jaccard indices:
threshold prob  threshold occurence
0.4             2                      0.157407
                4                      0.212963
                6                      0.240741
0.6             2                      0.166667
                4                      0.166667
                6                      0.166667
0.8             2                      0.111111
                4                      0.111111
                6                      0.111111
Name: jaccard_index, dtype: float64

Best parameter combination:
Threshold Prob: 0.4, Threshold Occurrence: 6
Mean Jaccard Index: 0.24074074074074073


In [30]:
#intialize
rf_list = [rf_nonhumans_groups, rf_only[1], rf_only[2], rf_only[3], rf_only[4], rf_only[5]]
le_list = [le, le1, le2, le3, le4, le5]
all_jaccard_index = []
results = []
from collections import Counter

#iterate thru patients
for patient_id in range(10):
    with open('test_data/patient{}_8mers.npy'.format(patient_id), 'rb') as read_file:
        df_test = np.load(read_file)

        df_test_df = pd.DataFrame(df_test)

        #predict
        human_predict()
        grouping()

        df_test_for_loop = df_test_df.copy()

        #loops over to get predictions for groups 1-5 and replaces them in the df 
        for i in range(1, 6):
            rf_now = rf_list[i]
            le_now = le_list[i]
            
            # subset for each group i 
            subset_mask = df_test_df['grouped_predictions'] == str(i)
            feature_cols = list(range(0, 32897))
            subset = df_test_df.loc[subset_mask, feature_cols]

            # predict
            if len(subset) == 0:
                continue

            y_preds = rf_now.predict(subset)
            y_predprob = rf_now.predict_proba(subset)
            
            y_preds_mapped = [
                le_now.inverse_transform([np.argmax(item)])[0] if np.max(item) >=  best_params[0] else 'homo_sapiens'
                for item in y_predprob]

            #count occurrences of each label
            counts = Counter(y_preds_mapped)

            #replace failed labels with 'homo_sapiens'
            y_preds_mapped = [
                label if counts[label] >=  best_params[1] else 'homo_sapiens'
                for label in y_preds_mapped
            ]
            # replace the original 'grouped_predictions' values in the subset
            df_test_for_loop.loc[subset_mask, 'grouped_predictions'] = y_preds_mapped


        #get unique only and append
        unique_predicts = df_test_for_loop["grouped_predictions"].unique()
        unique_predicts = [item for item in unique_predicts if item in pathogen_list]
    
        ji = jaccard_index_per_patient(patient_id, unique_predicts)
        all_jaccard_index.append(ji)

        results.append({
            'patient_id': patient_id,
            'ground_truth': list(pd.read_csv('test_data/patient{}_labels.csv'.format(patient_id))['labels']),
            'predictions': unique_predicts,
            'jaccard_index': ji,
        })


In [33]:
pd.DataFrame(results)

Unnamed: 0,patient_id,ground_truth,predictions,jaccard_index
0,0,[neisseria_gonorrhoeae],[neisseria_gonorrhoeae],1.0
1,1,[corynebacterium_ulcerans],[],0.0
2,2,[staphylococcus_aureus],[],0.0
3,3,[streptococcus_pneumoniae],[streptococcus_pneumoniae],1.0
4,4,"[salmonella_enterica_typhimurium, clostridioid...","[klebsiella_pneumoniae, campylobacter_jejuni]",0.0
5,5,"[mycobacterium_tuberculosis, pseudomonas_aerug...","[mycobacterium_ulcerans, neisseria_gonorrhoeae...",0.2
6,6,[klebsiella_pneumoniae],"[klebsiella_michiganensis, streptococcus_agala...",0.0
7,7,"[staphylococcus_pyogenes, corynebacterium_diph...",[clostridioides_difficile],0.0
8,8,"[burkholderia_pseudomallei, listeria_monocytog...",[streptococcus_pneumoniae],0.0
9,9,[],[],1.0


In [34]:
results_df = pd.DataFrame(results)

results_df.to_csv('JI_results_forRF.csv', index=False)  


In [35]:
results_df["jaccard_index"].mean()

0.32