## Import Libraries

In [184]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
import statsmodels.api as sm
from scipy.stats import f_oneway
from scipy import stats

## Load Data

In [52]:
clinical_data = pd.read_csv("/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/Metadata/clinical_metadata_v2_lates.csv")
plaque_counts = pd.read_csv("/gladstone/finkbeiner/steve/work/data/npsad_data/vivek/csv_merged_data/final_quantify_new.csv")

In [53]:
clinical_data.head(2)

Unnamed: 0,SubID,Brain_bank,Age,Sex,Sex_chr_aneuploidy,Ethnicity,Dx,pH,PMI,Death_Time,...,LewyDorsalVValue,CDR_Memory,CDR_Orientation,CDR_Judgement,CDR_Community,CDR_HomeHobbies,CDR_PersonalCare,CDR_SumBoxes,Cognitive_Resilience,Cognitive_and_Tau_Resilience
0,M00_1061,MSSM,40.0,Male,,Black,Harry's plaintext: CVD+BPD | CERAD_1: Vascular...,6.87,11.333333,,...,0.0,0.0,0.0,1.0,1.0,0.5,0.0,3.5,,
1,M00_1062,MSSM,60.0,Female,,Hispanic,Harry's plaintext: CVD | CERAD_1: Vascular dis...,6.27,2.9,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [54]:
# create a patient id column to map with clinical metadata
plaque_counts["XENum"]=plaque_counts["image_name"].apply(lambda l: l.split("_1_")[0])

In [56]:
#merge clinical metadata with model prredicted output
plaque_counts_merged = pd.merge(plaque_counts,clinical_data[["Dx","Imaging_XENum","Plq_Mn","Plq_Mn_MFG","MidPlaquesValue","MidPlaquesWCoresValue","CDRScore","prs_AD2"]], left_on="XENum", right_on="Imaging_XENum", how = "left")

## One way ANNOVA TEST

In [153]:
def plotly_box_plot(data, cerad_dict, title, x_axis_title, y_axis_title):
  """
  plots a box plot between variables in cerad_dict
  """
  fig = go.Figure()
  for i in range(3):
    fig.add_trace(go.Box(y=data[i],
                boxpoints='all', # can also be outliers, or suspectedoutliers, or False
                jitter=0.3, # add some jitter for a better separation between points
                pointpos=0, # relative position of points wrt box
                name =cerad_dict[i]))
  fig.update_layout(height=900, width = 1000)
  fig.update_layout(
    xaxis_title_text=x_axis_title, # xaxis label
    yaxis_title_text=y_axis_title) # yaxis label
  fig.update_layout(title=title)
  fig.update_layout( plot_bgcolor='white')
  fig.update_xaxes( mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
  fig.update_yaxes(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)

  fig.show()

In [182]:
def boxplot_plaque(plaque_counts_merged, var1, var2, title, x_axis_title, y_axis_title):
    plaque_counts_merged = plaque_counts_merged[(plaque_counts_merged["total_plaques"]<=5000)]
    sparse_count = plaque_counts_merged[plaque_counts_merged[var1]==1][var2].values
    moderate_count = plaque_counts_merged[plaque_counts_merged[var1]==3][var2].values
    freq_count = plaque_counts_merged[plaque_counts_merged[var1]==5][var2].values
    
    data = np.array([sparse_count,moderate_count,freq_count ])
    cerad_list = ["Sparse","Moderate","Frequent"]
    plotly_box_plot(data, cerad_list, title,x_axis_title, y_axis_title)
    
    _, p = stats.ttest_ind(sparse_count, moderate_count, nan_policy='omit')
    print("p-value for Sparse and Moderate", p)
    _, p = stats.ttest_ind(moderate_count, freq_count, nan_policy='omit')
    print("p-value for Moderate and Frequent", p)
    print("ANOVA for freq and mod :" ,f_oneway(moderate_count, freq_count))
    print("ANOVA for all :" ,f_oneway(sparse_count, moderate_count, freq_count))

In [183]:
title =  "Model Predicted - Total Neuritic plaques with Cored vs CERAD Ratings" 
y_axis_title = "Model Predicted - Total Neuritic plaques with Cored"  
x_axis_title = 'CERAD Ratings'
boxplot_plaque(plaque_counts_merged, "MidPlaquesWCoresValue","total_core_plaques", title,x_axis_title, y_axis_title )


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



p-value for Sparse and Moderate 0.6030511468614959
p-value for Moderate and Frequent 0.005822805881553885
ANOVA for freq and mod : F_onewayResult(statistic=8.029500572357074, pvalue=0.0058228058815538745)
ANOVA for all : F_onewayResult(statistic=4.616368872894282, pvalue=0.012629780959981865)


## Spearman Correlation

In [138]:
def calculate_spearman_coeff(x, y):
    """
    returns correlation coeff and p-val
    """
    
    res = stats.spearmanr(x, y)
    return res

In [141]:
df = plaque_counts_merged[~plaque_counts_merged["MidPlaquesWCoresValue"].isna()]
x = df["MidPlaquesWCoresValue"]
y = df["Plq_Mn_MFG_x"]
calculate_spearman_coeff(x, y)

SpearmanrResult(correlation=0.5727800387407334, pvalue=1.0161411714189147e-08)

In [144]:
x = plaque_counts_merged["total_core_plaques"]
y = plaque_counts_merged["Plq_Mn_x"]
calculate_spearman_coeff(x, y)

SpearmanrResult(correlation=0.45639271611464477, pvalue=1.0031855768356292e-05)

In [145]:
x = plaque_counts_merged["total_core_plaques"]
y = plaque_counts_merged["Plq_Mn_MFG_x"]
calculate_spearman_coeff(x, y)

SpearmanrResult(correlation=0.45639271611464477, pvalue=1.0031855768356292e-05)

In [187]:
plaque_counts_merged.columns

Index(['Unnamed: 0', 'image_name', 'total_core_plaques',
       'total_diffuse_plaques', 'total_caa_plaques', 'avg_area',
       'avg_diameter', 'avg_eccentricity', 'age', 'sex', 'ethinicity', 'Dx_x',
       'Plq_Mn_x', 'Plq_Mn_MFG_x', 'CDRScore_x', 'prs_AD2_x', 'total_plaques',
       'XENum', 'Dx_y', 'Imaging_XENum', 'Plq_Mn_y', 'Plq_Mn_MFG_y',
       'MidPlaquesValue', 'MidPlaquesWCoresValue', 'CDRScore_y', 'prs_AD2_y',
       'CERAD Ratings', 'core_diffuse_plaques'],
      dtype='object')

In [188]:
df = plaque_counts_merged[~plaque_counts_merged["prs_AD2_x"].isna()]
x = df["total_core_plaques"]
y = df["prs_AD2_x"]
calculate_spearman_coeff(x, y)

SpearmanrResult(correlation=0.2649451419527489, pvalue=0.04845325257008303)

## Scatter Plots with Trendline

In [158]:
df = plaque_counts_merged[(plaque_counts_merged["total_plaques"]<=2000) & (~plaque_counts_merged["MidPlaquesWCoresValue"].isna())]
df["MidPlaquesWCoresValue"] = df["MidPlaquesWCoresValue"].astype(int)
df["MidPlaquesWCoresValue_desc"] =  np.where(df["MidPlaquesWCoresValue"]==1,"Sparse",np.where(df["MidPlaquesWCoresValue"]==3,"Moderate", np.where(df["MidPlaquesWCoresValue"]==5,"Frequent","None")))
df["CERAD Ratings"] = df["MidPlaquesWCoresValue_desc"]
df  = df.sort_values(by="CERAD Ratings")
val_y = "Plq_Mn_MFG_x"
val_x= "total_core_plaques"
title = "Model Predicted - Total Neuritic plaques with Cores vs Average density of Neuritic plaques"
fig = px.scatter(df, x=val_x, y=val_y, color="CERAD Ratings", trendline="ols", title=title, category_orders={"CERAD Ratings": ["Sparse","Moderate","Frequent"]},trendline_scope="overall",trendline_color_override="black")
fig.update_layout(height = 700, width = 900)
fig.update_layout( plot_bgcolor='white')
fig.update_xaxes( mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.update_yaxes(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey'
)
fig.add_annotation(x=1000, y=20,
            text="R-squared : 0.415",
            showarrow=True,
            arrowhead=1,
            font=dict(
                color="green",
                size=12
            ),arrowcolor="green")
fig.update_layout(
    xaxis_title_text='Model Predicted - Total Neuritic plaques with Cores', # xaxis label
    yaxis_title_text="Average density of Neuritic plaques") # yaxis label
fig.show()


#results = px.get_trendline_results(fig)
#print(results.sumaary())

output = sm.OLS(df[val_y], df[val_x]).fit() 
print(output.summary())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



                                 OLS Regression Results                                
Dep. Variable:           Plq_Mn_MFG_x   R-squared (uncentered):                   0.415
Model:                            OLS   Adj. R-squared (uncentered):              0.407
Method:                 Least Squares   F-statistic:                              55.26
Date:                Tue, 16 May 2023   Prob (F-statistic):                    1.17e-10
Time:                        13:47:55   Log-Likelihood:                         -304.13
No. Observations:                  79   AIC:                                      610.3
Df Residuals:                      78   BIC:                                      612.6
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------