Imports

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mpl_dates
import pandas as pd
import random

## The bundled seaborn style sheets were renamed to 'seaborn-v0_8' in
## Matplotlib 3.6 and the bare 'seaborn' name was removed later.
## Use the new name when this Matplotlib provides it, else the legacy name.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

Load and Organize Blood Data

In [None]:
##  Map the verbose export headers to short column names
short_names = {
    'Blood Draw Data Total Protein (g/dL)': 'Protein',
    'Blood Draw Data Appointment Date': 'Date',
    'Blood Draw Data Participant ID': 'ID',
    'Blood Draw Data A/G Ratio': 'A/G Ratio',
}

##  Read the export, keep only the four columns of interest, order rows by ID
df = (
    pd.read_csv("Blood_2023-11-11T1110.csv")
    .rename(columns=short_names)
    [['ID', 'Date', 'Protein', 'A/G Ratio']]
    .sort_values(by=['ID'], ascending=True)
)

##  Turn the Date strings into datetime values
df['Date'] = pd.to_datetime(df['Date'])

print(df)
print(df.dtypes)  ## Show data types

Inspect questionable data (candidates for exclusion)

In [None]:
## Some entries are text or NaN, others are numeric.  pd.to_numeric returns a
## NEW Series, so the result must be assigned back to the column; otherwise the
## comparisons below still run against the unconverted values.
## Unparseable entries are kept as NaN rather than replaced by 0: a 0 would
## look like a real measurement, whereas NaN compares False in .lt()/.gt().
for col in ['Protein', 'A/G Ratio']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

## Rows whose Protein value is missing or could not be parsed as a number
print(df[df['Protein'].isna()])

df1 = df[df.Protein.lt(6)] ## View all protein values less than 6
print(df1)
df2 = df[df.Protein.gt(10)]  ## View all protein values greater than 10
print(df2)
df3 = df[df['A/G Ratio'].gt(2.5)]  ##  Same for A/G ratios: greater than 2.5
print(df3)
df4 = df[df['A/G Ratio'].lt(0.5)]  ##  A/G ratios less than 0.5
print(df4)

Exclude Protein values >= 10 and A/G Ratios >= 2.5

In [None]:
## Keep only rows with Protein below 10, then only rows with A/G Ratio below 2.5
df = (
    df
    .loc[lambda frame: frame['Protein'] < 10]
    .loc[lambda frame: frame['A/G Ratio'] < 2.5]
)
print(df)

Group data by participant ID and get grouped stats

In [None]:
## Per-participant summary of Protein: number of draws, mean, min and max
result = df.groupby('ID').agg({'Protein': ['count', 'mean', 'min', 'max']})

print("Count, mean, min, and max values of Protein by ID")
print(result)

## Same summary for the A/G Ratio column
result1 = df.groupby('ID').agg({'A/G Ratio': ['count', 'mean', 'min', 'max']})

## Caption corrected: this table describes A/G Ratio, not Protein
print("Count, mean, min, and max values of A/G Ratio by ID")
print(result1)

Plot histograms of the per-participant Protein and A/G Ratio statistics

In [None]:
## DataFrame.hist() draws one histogram per column (count, mean, min, max).
## plt.xlabel()/plt.ylabel() would label only the current (last) subplot, so
## each panel is labelled explicitly.  The 'count' column holds a number of
## draws, not a concentration, so it gets its own x label.
protein_axes = result.hist()  ##Protein values
for ax, (_, stat) in zip(np.ravel(protein_axes), result.columns):
    ax.set_xlabel("number of draws" if stat == "count" else "g/dl")
    ax.set_ylabel("count")
plt.tight_layout()  ## keep the added labels from overlapping neighbouring panels
plt.show()

ag_axes = result1.hist()  ## A/G ratios
for ax, (_, stat) in zip(np.ravel(ag_axes), result1.columns):
    ax.set_xlabel("number of draws" if stat == "count" else "A/G")
    ax.set_ylabel("count")
plt.tight_layout()
plt.show()
## NOTE(review): to save a figure, call savefig before plt.show() -- confirm
## with the backend in use that the figure is still available afterwards.
# plt.savefig("protein.jpg")  ## Use VSCode to save as png

Sort data in time series and plot trends

In [None]:
df = df.sort_values(by=['ID','Date'], ascending=True) ##  Sort by ID then Date
df = df.groupby('ID').filter(lambda x : len(x)>5)  # Reduce data to IDs with six or more blood draws

## plt.plot_date is deprecated in recent Matplotlib; plt.plot handles datetime
## x values directly.  marker='o' reproduces plot_date's default marker.
## All rows are drawn as one series in (ID, Date) order, so the line also
## joins the last draw of one ID to the first draw of the next.
plt.plot(df['Date'], df['Protein'], marker = 'o', color = 'red', linestyle = 'solid')
plt.gcf().autofmt_xdate()

Plot each ID in a separate trend line 

In [None]:
## One Protein-vs-Date line per participant, all drawn on a single shared axes
fig, ax = plt.subplots(figsize=(10, 8))

for n, g in df.groupby('ID'):
    g.plot(
        x="Date", y="Protein",
        ## Labels corrected to match the plotted columns (were "Cycle"/"Salary")
        xlabel="Date", ylabel="g/dl",
        label=f"id {n}",
        ax=ax,
    )

plt.show()

Subplot each participant's trend - Protein Level

In [None]:
## One panel per participant.  The grid is sized from the number of
## participants so none are silently dropped (a fixed 4x3 grid shows at most
## 12) and unused panels are hidden instead of being left as empty axes.
groups = list(df.groupby("ID"))
n_cols = 3
n_rows = max(1, -(-len(groups) // n_cols))  ## ceiling division, at least one row
fig, axs = plt.subplots(figsize=(10, 2 * n_rows), nrows=n_rows, ncols=n_cols, squeeze=False)
panels = axs.flatten()

## Colours are taken in order and cycled, so the figure is identical on every run
c = ['purple', 'red', 'green', 'blue', 'black', 'brown', 'gold', 'silver', 'aqua']
for i, ((n, g), ax) in enumerate(zip(groups, panels)):
    g.plot(
        x="Date", y="Protein", marker='o',
        xlabel="Date", ylabel="g/dl",
        label=f"Participant {n}",
        color=c[i % len(c)],
        ax=ax,
    )

## Hide the panels left over when the count is not a multiple of n_cols
for ax in panels[len(groups):]:
    ax.set_visible(False)

fig.suptitle('Do proteins levels trend or plateau?', fontsize = 20, color = 'black')
plt.tight_layout()
plt.show()

Subplot each participant's trend - A/G Ratio

In [None]:
## One panel per participant.  The grid is sized from the number of
## participants so none are silently dropped (a fixed 4x3 grid shows at most
## 12) and unused panels are hidden instead of being left as empty axes.
groups = list(df.groupby("ID"))
n_cols = 3
n_rows = max(1, -(-len(groups) // n_cols))  ## ceiling division, at least one row
fig, axs = plt.subplots(figsize=(10, 2 * n_rows), nrows=n_rows, ncols=n_cols, squeeze=False)
panels = axs.flatten()

## Colours are taken in order and cycled, so the figure is identical on every run
c = ['purple', 'red', 'green', 'blue', 'black', 'brown', 'gold', 'silver', 'aqua']
for i, ((n, g), ax) in enumerate(zip(groups, panels)):
    g.plot(
        x="Date", y="A/G Ratio", marker='o',
        xlabel="Date", ylabel="A/G",
        label=f"Participant {n}",
        color=c[i % len(c)],
        ax=ax,
    )

## Hide the panels left over when the count is not a multiple of n_cols
for ax in panels[len(groups):]:
    ax.set_visible(False)

fig.suptitle('Do A/G ratios trend or plateau?', fontsize = 20, color = 'black')
plt.tight_layout()
plt.show()