In [1]:
import pandas as pd
import numpy as np
from utils import get_species, get_labels, get_labels_all

In [2]:
# Load the feature table and the two label tables via the project helpers in `utils`.
X = get_species()
y = get_labels()
y_all = get_labels_all()

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score

In [4]:
# Show the distinct phenotype labels present in y_all.
# NOTE(review): the recorded output contains near-duplicate spellings
# (e.g. 'Rheumatoid Arthritis' / 'Rheumatoid arthritis', "Crohn''s disease" /
# 'Crohns disease', 'Obese' / 'Obesity') -- these are treated as separate
# phenotypes by everything below; confirm whether they should be merged.
np.unique(y_all)

array(['ACVD', 'Adenoma', 'Adenoma (MP)', 'Advanced Dementia',
       'Behcet’s disease', 'Breast Cancer', 'CD', 'CRC', 'Cancer',
       "Crohn''s disease", 'Crohns disease',
       'End-stage renal disease (ESRD)', 'Graves’ disease', 'Healthy',
       'Hypertension (HTN)', 'IGT', 'Large adenoma', 'Liver Cirrhosis',
       'NAFLD', 'Non–small cell lung cancer (NSCLC)', 'Obese', 'Obesity',
       'Overweight', 'Pancreatic cancer', 'Renal cell carcinoma (RCC)',
       'Rheumatoid Arthritis', 'Rheumatoid arthritis', 'Small adenoma',
       'T2D', 'Ulcerative colitis', 'Underweight', 'adenoma',
       'advanced adenoma', 'ankylosing spondylitis', 'carcinoma',
       'schizophrenia', 'ulcerative colitis'], dtype=object)

In [5]:
# Drop every feature column whose name mentions "virus" or "unclassified".
kept_columns = [
    name
    for name in X.columns
    if not ("virus" in name or "unclassified" in name)
]
X = X[kept_columns]

In [31]:
# Boolean mask of samples labelled with one of the weight-related phenotypes.
bad_nonhealthies = y_all.isin(["Obesity", "Overweight", "Underweight"]).values

# Row selector for everything that is NOT one of those phenotypes; the same
# selector is applied to features and both label tables so they stay aligned.
keep_rows = ~bad_nonhealthies
X_red = X.iloc[keep_rows, :]
y_red = y.iloc[keep_rows, :]
y_all_red = y_all.iloc[keep_rows, :]

In [6]:
# Baseline: L1-regularised logistic regression on all samples, using
# presence/absence features (a feature counts as present when its value > c).
c = 0.00001

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(random_state=42, penalty="l1", solver="liblinear", C=1, class_weight="balanced")
# y is a single-column table; pass a flat 1-D array so scikit-learn does not
# emit its column-vector DataConversionWarning (the greedy-search cell further
# down already flattens its labels the same way).
clf.fit(X_train > c, y_train.values.ravel())
y_hat = clf.predict(X_test > c)

# Displayed as (balanced accuracy, plain accuracy) on the held-out 20 %.
balanced_accuracy_score(y_test, y_hat), accuracy_score(y_test, y_hat)

  y = column_or_1d(y, warn=True)


(0.6624740054365044, 0.6641404068607898)

In [9]:
# Working list of phenotype labels (sorted, de-duplicated by np.unique) ...
phenotype_list = list(np.unique(y_all))
# ... plus an independent copy of the starting state, since the working list
# is reassigned by later cells.
old_phenotype_list = phenotype_list.copy()
phenotype_list

['ACVD',
 'Adenoma',
 'Adenoma (MP)',
 'Advanced Dementia',
 'Behcet’s disease',
 'Breast Cancer',
 'CD',
 'CRC',
 'Cancer',
 "Crohn''s disease",
 'Crohns disease',
 'End-stage renal disease (ESRD)',
 'Graves’ disease',
 'Healthy',
 'Hypertension (HTN)',
 'IGT',
 'Large adenoma',
 'Liver Cirrhosis',
 'NAFLD',
 'Non–small cell lung cancer (NSCLC)',
 'Obese',
 'Obesity',
 'Overweight',
 'Pancreatic cancer',
 'Renal cell carcinoma (RCC)',
 'Rheumatoid Arthritis',
 'Rheumatoid arthritis',
 'Small adenoma',
 'T2D',
 'Ulcerative colitis',
 'Underweight',
 'adenoma',
 'advanced adenoma',
 'ankylosing spondylitis',
 'carcinoma',
 'schizophrenia',
 'ulcerative colitis']

In [10]:
from tqdm import tqdm

# Greedy backward elimination over phenotypes.
#
# Each round tries dropping every remaining phenotype except "Healthy" in turn,
# refits the same L1 logistic regression on the samples that are left, and
# permanently removes the phenotype whose removal gives the highest balanced
# accuracy on the 20 % hold-out. The removal is applied even when the best score
# is lower than the previous round's, so `removed_list` is a full ordering
# rather than a stopping point.
removed_list = []

# Presence/absence cutoff (features are binarised as `X > c`); loop-invariant.
c = 0.00001

while phenotype_list:
    best_phenotype_list = []
    best_score = -1
    best_removed = None
    for i, removed in enumerate(tqdm(phenotype_list)):
        # "Healthy" is never a removal candidate.
        if removed == "Healthy":
            continue
        curr_phenotype_list = phenotype_list[:i] + phenotype_list[i + 1:]
        index = y_all.isin(curr_phenotype_list)
        X_curr = X.iloc[index.values, :]
        y_curr = y.iloc[index.values, :]

        X_train, X_test, y_train, y_test = train_test_split(X_curr, y_curr, test_size=0.2, random_state=42)

        # Dropping the last non-healthy phenotype can leave a single class in
        # the training labels (presumably y separates healthy from diseased
        # samples -- TODO confirm); LogisticRegression.fit raises ValueError in
        # that case, so such candidates are skipped.
        y_train_flat = y_train.values.flatten()
        if np.unique(y_train_flat).size < 2:
            continue

        clf = LogisticRegression(random_state=42, penalty="l1", solver="liblinear", C=1, class_weight="balanced")
        clf.fit(X_train > c, y_train_flat)
        y_hat = clf.predict(X_test > c)
        score = balanced_accuracy_score(y_test, y_hat)

        if score > best_score:
            best_score = score
            best_phenotype_list = curr_phenotype_list
            best_removed = removed

    # No candidate could be evaluated this round (only "Healthy" is left, or
    # every removal collapses the labels to one class): stop instead of
    # recording a `None` removal with the sentinel score of -1.
    if best_removed is None:
        break

    removed_list.append(best_removed)
    phenotype_list = best_phenotype_list
    print("best score:", best_score)
    print("Removed phenotypes:", removed_list)

100%|███████████████████████████████████████████████████████████████| 37/37 [00:35<00:00,  1.04it/s]


best score: 0.7001078021829942
Removed phenotypes: ['Overweight']


100%|███████████████████████████████████████████████████████████████| 36/36 [00:28<00:00,  1.24it/s]


best score: 0.7456869136587693
Removed phenotypes: ['Overweight', 'Obesity']


100%|███████████████████████████████████████████████████████████████| 35/35 [00:44<00:00,  1.26s/it]


best score: 0.7542838249669369
Removed phenotypes: ['Overweight', 'Obesity', 'Underweight']


100%|███████████████████████████████████████████████████████████████| 34/34 [00:40<00:00,  1.18s/it]


best score: 0.7726714901207119
Removed phenotypes: ['Overweight', 'Obesity', 'Underweight', 'T2D']


100%|███████████████████████████████████████████████████████████████| 33/33 [00:39<00:00,  1.21s/it]


best score: 0.7653957853866266
Removed phenotypes: ['Overweight', 'Obesity', 'Underweight', 'T2D', "Crohn''s disease"]


100%|███████████████████████████████████████████████████████████████| 32/32 [00:34<00:00,  1.09s/it]


best score: 0.7734295845997974
Removed phenotypes: ['Overweight', 'Obesity', 'Underweight', 'T2D', "Crohn''s disease", 'ACVD']


100%|███████████████████████████████████████████████████████████████| 31/31 [00:29<00:00,  1.07it/s]


best score: 0.772427828130245
Removed phenotypes: ['Overweight', 'Obesity', 'Underweight', 'T2D', "Crohn''s disease", 'ACVD', 'Ulcerative colitis']


100%|███████████████████████████████████████████████████████████████| 30/30 [00:18<00:00,  1.59it/s]


best score: 0.7948778314799171
Removed phenotypes: ['Overweight', 'Obesity', 'Underweight', 'T2D', "Crohn''s disease", 'ACVD', 'Ulcerative colitis', 'IGT']


 55%|██████████████████████████████████▊                            | 16/29 [00:11<00:09,  1.34it/s]


KeyboardInterrupt: 