In [1]:
import os
import pandas as pd
from plotnine import *

In [2]:
# Sets the working directory.
os.chdir("C:\\Users\\fshen\OneDrive\\Documents\\AI Precision Nutrition Bootcamp\\T2D")

### Subjects

In [3]:
# Create data frame from "Subjects.txt" using [tab] as separator, top row as header
subjects = pd.read_csv("Subjects.txt", sep = "\t")
subjects.head()

Unnamed: 0,SubjectID,Study,Race,Sex,Age,BMI,SSPG,IR_IS_classification
0,ZIS22OE,HMP,C,F,,,,Unknown
1,ZJBOZ2X,Exercise,C,F,65.3,19.82,,Unknown
2,ZJOSZHK,HMP,C,M,41.43,19.42,,Unknown
3,ZJTKAE3,HMP,C,F,58.65,31.24,162.0,IR
4,ZJXC41N,"HMP, Exercise",B,F,49.69,28.24,75.0,IS


In [4]:
# Select only subjects with known insulin sensitivity and known BMI and known race
subjects_IR_IS_known = subjects[(subjects["IR_IS_classification"] != "Unknown") 
                                 & (subjects["BMI"].notna())
                                 & (subjects["Race"] != "unknown")]
subjects_IR_IS_known.describe()

Unnamed: 0,Age,BMI,SSPG
count,64.0,64.0,64.0
mean,56.19125,28.846719,149.830625
std,7.402171,3.788701,65.370466
min,38.8,21.47,40.0
25%,50.96,25.97,91.875
50%,56.43,28.745,156.935
75%,61.9425,31.3125,213.5475
max,69.0,38.9,276.0


In [5]:
# Factorize sex, insulin sensitivity, race, then reorganize to only show important data

subjects_IR_IS_known["Sex"] = subjects_IR_IS_known["Sex"].astype("category")

IR_IS_cats = pd.CategoricalDtype(categories = ["IS", "IR"], ordered = False)
subjects_IR_IS_known["IR_IS_classification"] = subjects_IR_IS_known["IR_IS_classification"].astype(IR_IS_cats)

race_cats = pd.CategoricalDtype(categories = ["C", "A", "B", "H"], ordered = False)
subjects_IR_IS_known["Race"] = subjects_IR_IS_known["Race"].astype(race_cats)

subjects_IR_IS_known = subjects_IR_IS_known.rename({"IR_IS_classification": "IR_IS"}, axis = 1)
subjects_IR_IS_known = subjects_IR_IS_known.drop(columns = ["Study"])
subjects_IR_IS_known.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(64, 7)

In [6]:
subjects_IR_IS_known["IR_IS"].value_counts()

IR_IS
IR    34
IS    30
Name: count, dtype: int64

### Visits

In [7]:
# Create data frame from "Visits.txt" using [tab] as separator, top row as header
visits = pd.read_csv("Visits.txt", sep = "\t")
visits.shape

(1416, 8)

In [8]:
# Select only the visits from subjects listed in subjects_IR_IS_known
visits_known = visits[visits["SubjectID"].isin(subjects_IR_IS_known["SubjectID"])]
visits_known.shape

(1098, 8)

### Clinical Tests

In [9]:
# Create data frame from "clinical_tests.txt" using [tab] as separator, top row as header
clinicals = pd.read_csv("HMP/Clinical/clinical_tests.txt", sep = "\t")
clinicals.shape

(969, 57)

In [10]:
# Rename SubjectID to SubjectID_2 (reserve SubjectID for another column)
clinicals = clinicals.rename({"SubjectID": "SubjectID_2"}, axis = 1)

# New SubjectID column using VisitID (split using '-')
clinicals["SubjectID"] = clinicals["VisitID"].str.split(pat = '-')
clinicals.head()

Unnamed: 0,VisitID,A1C,AG,ALB,ALCRU,ALKP,ALT,AST,BASO,BASOAB,...,TP,UALB,UALBCR,WBC,SubjectID_2,CL1,CL2,CL3,CL4,SubjectID
0,ZOZOW1T-1013,6.0,8.0,4.0,,96.0,48.0,22.0,0.6,0.04,...,6.3,,,6.0,69-001,D7,,Infection_Late,Infection,"[ZOZOW1T, 1013]"
1,ZOZOW1T-1015,5.9,8.0,4.2,,103.0,77.0,120.0,0.9,0.04,...,6.5,,,5.0,69-001,D30,,Infection_Recovery_Late,Infection_L,"[ZOZOW1T, 1015]"
2,ZOZOW1T-1021,6.3,,,173.5,,,,1.0,0.09,...,,7.0,<30,8.9,69-001,D1,,Infection_Early,Infection,"[ZOZOW1T, 1021]"
3,ZOZOW1T-1022,6.1,7.0,4.2,278.2,69.0,40.0,27.0,0.5,0.05,...,6.6,16.0,<30,10.8,69-001,D3,,Infection_Middle,Infection,"[ZOZOW1T, 1022]"
4,ZOZOW1T-1023,6.3,13.0,4.2,412.8,66.0,53.0,31.0,0.6,0.04,...,6.7,18.0,<30,7.0,69-001,D15,,Infection_Recovery_Early,Infection_L,"[ZOZOW1T, 1023]"


In [11]:
# Only keep rows with corresponding data in subjects_IR_IS_known and visits_known
clinicals_known = clinicals[clinicals["VisitID"].isin(visits_known["VisitID"])]
clinicals_known.describe()

Unnamed: 0,A1C,ALB,ALKP,BASO,BASOAB,BUN,CA,CHOL,CHOLHDL,CL,...,NA.,NEUT,NEUTAB,NHDL,PLT,RBC,RDW,TGL,TP,WBC
count,695.0,702.0,702.0,697.0,697.0,702.0,702.0,700.0,700.0,702.0,...,702.0,698.0,698.0,700.0,698.0,698.0,698.0,700.0,702.0,698.0
mean,5.642158,3.97906,81.641026,0.704304,0.03944,16.390313,8.970085,196.371429,3.468571,103.679487,...,139.535613,56.190258,3.220401,135.994286,221.269341,4.679742,13.802006,103.964286,7.039174,5.684241
std,0.433262,0.282999,25.135738,0.34911,0.021284,5.260551,0.35686,33.812227,1.046069,2.511851,...,2.494676,8.565239,1.123932,35.161552,51.088936,0.448061,1.092309,58.065353,0.40375,1.550714
min,4.7,3.0,38.0,0.1,0.0,8.0,7.8,107.0,1.4,96.0,...,131.0,34.3,0.9,37.0,117.0,3.2,11.7,22.0,5.7,2.2
25%,5.3,3.8,63.0,0.5,0.03,13.0,8.7,175.0,2.7,102.0,...,138.0,50.825,2.49,114.0,187.0,4.33,13.1,64.75,6.8,4.6
50%,5.6,4.0,77.5,0.6,0.04,15.0,9.0,195.0,3.35,104.0,...,140.0,56.5,3.09,133.0,215.0,4.705,13.6,89.5,7.1,5.4
75%,5.9,4.2,95.0,0.8,0.05,18.75,9.2,216.0,4.1,105.0,...,141.0,62.175,3.7975,156.0,247.0,5.0,14.3,129.0,7.3,6.6
max,8.0,4.9,198.0,5.0,0.28,40.0,10.3,332.0,9.5,110.0,...,148.0,95.5,12.94,297.0,457.0,6.38,20.5,407.0,8.2,15.3


In [12]:
# Count values, sort by alphabetical order (Ant -> antibiotic, Imz -> immunization)
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6666404/
clinicals_known["CL4"].value_counts().sort_index()

CL4
Allergy            2
Ant               14
Ant_L              4
Colonoscopy        3
Colonoscopy_L      1
Fiber             31
Healthy          406
Imz               69
Imz_L             23
Infection         66
Infection_L       36
Post-Travel        2
Stress             2
Stress_L           1
Weight-gain       22
Weight-loss       23
Name: count, dtype: int64

### Exploratory Analysis

In [13]:
plot_list = list()

plot_list.append(ggplot(subjects_IR_IS_known, aes(fill = "IR_IS"))
                 + geom_histogram(aes(x = "Age"), bins = 5, position = "dodge") 
                 + ylab("Count") 
                 + scale_fill_manual(values = ("skyblue", "orange")) 
                 + theme_bw())

plot_list.append(ggplot(subjects_IR_IS_known, aes(fill = "IR_IS")) 
                 + geom_histogram(aes(x = "SSPG"), bins = 5, position = "dodge")
                 + ylab("Count") 
                 + scale_fill_manual(values = ("skyblue", "orange")) 
                 + theme_bw())

plot_list.append(ggplot(subjects_IR_IS_known, aes(fill = "IR_IS")) 
                 + geom_histogram(aes(x = "BMI"), bins = 5, position = "dodge") 
                 + ylab("Count") 
                 + scale_fill_manual(values = ("skyblue", "orange")) 
                 + theme_bw())

plot_list.append(ggplot(subjects_IR_IS_known, aes(fill = "IR_IS"))
                 + geom_bar(aes(x = "Sex")) 
                 + scale_fill_manual(values = ("skyblue", "orange")) 
                 + theme_bw())

plot_list.append(ggplot(subjects_IR_IS_known, aes(fill = "IR_IS")) 
                 + geom_bar(aes(x = "Race")) 
                 + scale_fill_manual(values = ("skyblue", "orange")) 
                 + theme_bw())

for i in range(len(plot_list)):
    plot_list[i].save(filename = "plot" + str(i) + ".png", dpi = 300)


