In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import re
%matplotlib inline

In [2]:
# Gather all radiomic data from primary and recurred tumors.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
rootdir = '/Users/shuang/Documents/Proj_Radiomics/Data/her2'

# Load the radiomics table of all the primary tumor data.
fname1 = '{}/her2_Analysis/PETMRI/PETbinwidth0.1_MRItp2_binwidth5/data_all.csv'.format(rootdir)
df_prim_all = pd.read_csv(fname1)

# Patterns are compiled once and written as raw strings so that '\w' reaches
# the regex engine untouched instead of being an invalid string escape.
# NOTE(review): neither pattern is anchored at the end of the name, so a name
# with '_pet' / '_avg' in the middle is cut at that point -- confirm that this
# cannot happen with the real column names.
pet_pat = re.compile(r'([\w.]+)_pet')
avg_pat = re.compile(r'([\w.]+)_avg')

# Find all PET radiomics: keep the columns matching '<name>_pet' and remember
# the '<name>' part (search each column only once).
pet_matches = [(col, pet_pat.search(col)) for col in df_prim_all.columns.tolist()]
feat_names = [col for col, found in pet_matches if found]
new_feat_names = [found.group(1) for col, found in pet_matches if found]

# Additionally strip an '_avg' suffix from the names that carry one.
newer_feat_names = []
for name in new_feat_names:
    found = avg_pat.search(name)
    newer_feat_names.append(found.group(1) if found else name)

the_col_names = feat_names + ['ptid_side']

# Map original column name -> cleaned feature name.
col_dict = dict(zip(feat_names, newer_feat_names))

# rename() returns a new frame, so the column assignment below works on an
# independent object rather than on a slice of df_prim_all (no in-place
# mutation, no SettingWithCopyWarning).
df_prim = df_prim_all.loc[:, the_col_names].rename(columns=col_dict)
df_prim['tumor_type'] = 'Primary'

# Load the recurrent-tumor radiomics from every JSON file in the folder.
json_dir = '{}/her2_ImageFeatures/IsoVoxelSize'.format(rootdir)
# glob returns files in arbitrary order; sort so the row order is reproducible.
all_jsons = sorted(glob.glob('{}/*.json'.format(json_dir)))
if not all_jsons:
    # Fail here with a clear message instead of a KeyError further down.
    raise IOError('No JSON files found in {}'.format(json_dir))

# Read all files, then concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies every row each time.
df_recur = pd.concat([pd.read_json(jj) for jj in all_jsons], ignore_index=True)

# Split 'FOstats_minmax' into two scalar columns (element 0 -> min, 1 -> max).
df_recur['FOstats_min'] = df_recur['FOstats_minmax'].apply(lambda mm: mm[0])
df_recur['FOstats_max'] = df_recur['FOstats_minmax'].apply(lambda mm: mm[1])
df_recur = df_recur.drop(columns=['FOstats_minmax'])

# Get the average of texture features: each column starting with 'texture_'
# is replaced by a '<name>_avg' column holding np.mean of every cell.
pat = re.compile('texture_')
texture_cols = [ss for ss in df_recur.columns.tolist() if pat.match(ss)]
for tc in texture_cols:
    df_recur[tc + '_avg'] = df_recur[tc].apply(np.mean)
df_recur = df_recur.drop(columns=texture_cols)

df_recur['tumor_type'] = df_recur['tumor_tag'].map(lambda tag: '_'.join(['Recur', tag]))

# Build the '<pt_id>_<breast_side>' key directly from the two columns; the
# row-wise apply with x[0] / x[1] relied on positional access to a
# label-indexed Series, which recent pandas deprecates.
df_recur['ptid_side'] = ['{}_{}'.format(pt, side)
                         for pt, side in zip(df_recur['pt_id'], df_recur['breast_side'])]

# Strip the '_avg' suffix again so the feature names line up with the other
# tables. Raw string keeps '\w' from being an invalid string escape.
# NOTE(review): the pattern is not anchored at the end, so any column with
# '_avg' in the middle of its name is truncated there -- confirm none exist.
avg_pat = re.compile(r'([\w.]+)_avg')
newer_feat_names = []
for col in df_recur.columns.tolist():
    found = avg_pat.search(col)
    newer_feat_names.append(found.group(1) if found else col)
col_dict = dict(zip(df_recur.columns.tolist(), newer_feat_names))
df_recur = df_recur.rename(columns=col_dict)

# Restrict both tables to the columns of the primary-tumor frame so that
# they share one layout.
col_of_interest = df_prim.columns.tolist()
df_prim_oi = df_prim.loc[:, col_of_interest]
df_recur_oi = df_recur.loc[:, col_of_interest]

# Stack primary rows on top of recurrent rows with a fresh 0..n-1 index.
frames_to_stack = [df_prim_oi, df_recur_oi]
df_all = pd.concat(frames_to_stack, ignore_index=True)
print(df_all)


     FOstats_energy  FOstats_entropy  FOstats_kurtosis  FOstats_mean  \
0     197984.031250         5.536868          5.234993      3.525060   
1     211952.109375         5.470107          5.270049      3.174406   
2         57.455063         1.763697          2.370392      1.091445   
3       7258.604492         5.099894          2.806678      2.700307   
4      57107.488281         5.084719          7.894037      2.232625   
5        983.184448         3.355637          3.422115      1.594757   
6       8460.021484         5.427511          5.978713      2.856201   
7      17500.703125         4.632359          5.544690      2.255796   
8      48165.585938         5.266244          2.678233      2.761572   
9       2454.211670         4.033516          4.632780      1.364780   
10      2425.740234         2.297966          5.669534      0.857146   
11      1980.467773         4.311808          3.249635      1.943283   
12       173.288330         2.303275          3.030634      0.60

In [10]:
# Distinct 'ptid_side' keys present in the recurrent-tumor table.
ptid_sides = list(df_recur_oi['ptid_side'].unique())
print(ptid_sides)

# Only the first key is processed for now.
test = ptid_sides[:1] if ptid_sides else [ptid_sides[0]]
df_corr_final = pd.DataFrame()

# Find the feature column names: those starting with one of the three prefixes.
pat = re.compile('texture_|FOstats_|ShapeSize_')
texture_cols = [col for col in df_recur.columns.tolist() if pat.match(col)]

for ps in test:
    # Rows of the combined table that belong to this key.
    is_this_key = df_all['ptid_side'] == ps
    df_tmp = df_all.loc[is_this_key, :]
    print(df_tmp.columns)

# ptidsd_oi = ptid_sides[1]
# df1 = df_all[df_all['ptid_side'] == ptidsd_oi]
# val_vars = set(df1.columns.tolist()).symmetric_difference(['ptid_side','tumor_type'])

# # make an appropriate table
# df2 = pd.melt(df1, id_vars=['ptid_side','tumor_type'], value_vars=val_vars, var_name='Radiomics')
# # print(df2)
# # print(df2.columns.tolist())

# df3 = df2.pivot(index='Radiomics',columns='tumor_type',values='value')

# df3 = df3.reset_index()

# # make another column to categorize radiomic feature to FOstats, shape and size and texture
# df3['Radiomics_type'] = df3['Radiomics'].apply(lambda x: re.split('_+',x)[0] if re.split('_+',x) else np.nan)
# print(df3)

['116_L', '117_L', '123_L', '13_R', '25_L', '30_L', '69_L', '99_R']
Index([u'FOstats_energy', u'FOstats_entropy', u'FOstats_kurtosis',
       u'FOstats_mean', u'FOstats_min', u'FOstats_max', u'FOstats_skewness',
       u'FOstats_uniformity', u'FOstats_variance', u'ShapeSize_compactness1',
       u'ShapeSize_compactness2', u'ShapeSize_max_euc_dis',
       u'ShapeSize_spherical_disproportion', u'ShapeSize_sphericity',
       u'ShapeSize_surf_area_cm2', u'ShapeSize_surface2volratio',
       u'ShapeSize_vol_cm3', u'texture_autocorrelation',
       u'texture_cluster_prominence', u'texture_cluster_shade',
       u'texture_cluster_tendency', u'texture_contrast',
       u'texture_correlation', u'texture_diff_entropy',
       u'texture_dissimilarity', u'texture_energy', u'texture_entropy',
       u'texture_homogeneity1', u'texture_homogeneity2', u'texture_idmn',
       u'texture_idn', u'texture_inv_var', u'texture_maxprob',
       u'texture_sum_avg', u'texture_sum_entropy', u'texture_sum_var',
