In [None]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster
import seaborn as sns


class FormatFinalCSV():
    """Collect per-sample "final" report CSVs and merge them into one wide table.

    Report files are searched for with the glob pattern
    ``<source_dir>/<file_prefix>**/report/<file_prefix>**final**sv``.
    """

    def __init__(self, source_dir, file_prefix, output_dir=False):
        """Store the search location and naming convention.

        Args:
            source_dir: Directory below which the sample directories are searched.
            file_prefix: Prefix shared by the sample directories and report files.
            output_dir: Stored on the instance; not used by the methods of this class.
        """
        self.source_dir = source_dir
        self.output_dir = output_dir
        self.file_prefix = file_prefix

    @staticmethod
    def format_col_names(df_old_cols):
        """Normalise the column labels of ``df_old_cols`` in place.

        Labels are lower-cased, every non-word character (and any
        non-ASCII / ``m`` / non-ASCII triple) becomes ``_``, runs of ``_`` are
        collapsed, and one trailing ``_`` is removed.

        Args:
            df_old_cols: DataFrame whose columns are rewritten.

        Returns:
            The same DataFrame object, with its columns replaced.
        """
        def _clean(label):
            # str() lets non-string labels (e.g. numbers) pass through as well.
            cleaned = re.sub(r'\W|[^\x00-\x7F]m[^\x00-\x7F]', '_', str(label).lower())
            cleaned = re.sub('_+', '_', cleaned)
            return re.sub('_$', '', cleaned)

        df_old_cols.columns = [_clean(col) for col in df_old_cols.columns]
        return df_old_cols

    def get_file_list(self):
        """Return the matching report files as a sorted list of paths."""
        pattern = f'{self.source_dir}/{self.file_prefix}**/report/{self.file_prefix}**final**sv'
        # glob returns files in arbitrary order; sorting makes runs reproducible.
        return sorted(glob.glob(pattern, recursive=True))

    @staticmethod
    def get_final_stats_df(d):
        """Read one two-column report CSV and pivot it into a single sample column.

        Args:
            d: Path of a report file laid out as ``<sample_dir>/report/<file>``.

        Returns:
            DataFrame indexed by ``characteristic`` with the columns
            ``characteristic`` and ``<sample id>``.
        """
        # The sample id is the directory that holds ``report``. Deriving it
        # relative to the file (instead of taking the third path component)
        # keeps it correct for any depth of source_dir, and avoids
        # re.split(os.sep, ...), whose pattern is an invalid regular expression
        # when os.sep is a backslash.
        sample_id = os.path.basename(os.path.dirname(os.path.dirname(os.path.normpath(d))))
        dat_df = pd.read_csv(d, header=0).assign(sample_id=sample_id)
        # The file's own header is discarded in favour of fixed names.
        dat_df.columns = ['characteristic', 'values', 'sample_id']
        dat_p = dat_df.pivot(columns='sample_id', values='values', index='characteristic').reset_index()
        dat_p.index = dat_p.characteristic
        return dat_p

    def format_concat_files(self):
        """Merge every matching report into one row per sample.

        Returns:
            DataFrame with a ``sample_id`` column plus one cleaned column per
            characteristic, sorted by ``sample_id``; ``None`` when no report
            file matches.
        """
        dat = self.get_file_list()
        if not dat:
            return None

        # Side-by-side concat leaves one 'characteristic' column per file; drop them all.
        datall = pd.concat([self.get_final_stats_df(d) for d in dat], axis=1).drop(['characteristic'], axis=1)
        dat_all = datall.transpose()
        # reset_index() below must produce a 'sample_id' column, so name the index explicitly.
        dat_all.index.name = 'sample_id'
        dat_all.columns = dat_all.columns.to_flat_index()
        # Bare threshold labels get a descriptive prefix.
        # NOTE(review): "vesicularirty" looks like a typo for "vesicularity"; it is
        # kept because it becomes part of the output column names.
        dat_all.columns = [
            f'vesicularirty_{d}' if d in ["0.5", "0.6", "0.7", "0.8", "0.9"] else d.lower()
            for d in dat_all.columns
        ]
        dat_all = self.format_col_names(dat_all).reset_index().sort_values('sample_id')
        return dat_all
                

In [None]:
# Build the formatter for the 'pccm' reports below ./hne_2023_12_01.
am = FormatFinalCSV(source_dir='./hne_2023_12_01', file_prefix = 'pccm')
# Matching report files, kept for inspection.
dat = am.get_file_list()
# NOTE(review): format_concat_files() returns None when no file matches, in
# which case the index assignment below fails.
datall = am.format_concat_files()
# Index the rows by sample id (the 'sample_id' column itself is still present).
datall.index = datall.sample_id

In [None]:
# The index already carries the sample id, so the column is redundant.
# errors='ignore' keeps this cell re-runnable after the column is gone.
datall = datall.drop(columns=['sample_id'], errors='ignore')

# Strip '%' and convert every column to float. Casting to str first means
# values that were already numeric are kept instead of becoming NaN (or
# raising on a non-string column) under the .str accessor.
datall_perc = datall.apply(
    lambda column: column.astype(str).str.replace('%', '', regex=False).astype(float)
)
# datdesc = datall_perc.describe()
# datdesc

In [None]:
# Display the numeric table (percent signs stripped, values as float).
datall_perc

In [None]:
# Displays the same numeric table once more.
datall_perc

In [None]:
# Clustered heatmap of the numeric table: centroid linkage on Euclidean
# distances, with z_score=1 and row clustering switched on.
sns.clustermap(datall_perc, method='centroid', metric='euclidean', z_score=1, row_cluster=True)

In [None]:
# Folder holding the report data, resolved relative to source_dir.
# NOTE(review): `source_dir` is not assigned in any notebook-level cell visible
# here — confirm it is defined before this cell runs.
data_dir = 'hne_2023_12_01'
file_dir = os.path.join(source_dir, data_dir)

In [None]:
# Collect every PCCM "final" report file below file_dir.
# glob.glob() has no `case` parameter (passing one raises TypeError), so the
# argument is dropped; case handling follows the platform default.
# Sorting makes the positional lookups (dat[0], dat[4], ...) reproducible.
dat = sorted(glob.glob(f'{file_dir}/pccm**/report/pccm**final**sv', recursive=True))

In [None]:
def get_final_stats_df(d):
    """Read one two-column report CSV and pivot it into a single sample column.

    Args:
        d: Path of a report file laid out as ``<sample_dir>/report/<file>``.

    Returns:
        DataFrame indexed by ``characteristic`` with the columns
        ``characteristic`` and ``<sample id>``.
    """
    # The sample id is the directory that holds ``report``. Deriving it relative
    # to the file (instead of taking the third path component) keeps it correct
    # at any nesting depth, and avoids re.split(os.sep, ...), whose pattern is an
    # invalid regular expression when os.sep is a backslash.
    sample_id = os.path.basename(os.path.dirname(os.path.dirname(os.path.normpath(d))))
    dat_df = pd.read_csv(d, header=0).assign(sample_id=sample_id)
    # The file's own header is discarded in favour of fixed names.
    dat_df.columns = ['characteristic', 'values', 'sample_id']
    dat_p = dat_df.pivot(columns='sample_id', values='values', index='characteristic').reset_index()
    dat_p.index = dat_p.characteristic
    return dat_p

In [None]:
# datall = pd.concat([get_final_stats_df(d) for d in dat], axis = 1).drop(['characteristic'], axis=1)
# dat_all = datall.transpose()
# dat_all.columns = dat_all.columns.to_flat_index()
# dat_all.columns = [f'vesicularirty_{d}' if d in ["0.5", "0.6", "0.7", "0.8", "0.9"] else d.lower() for d in dat_all.columns]
# dat_all = format_col_names(dat_all)
# [d for d in dat_all.columns if re.findall('til',d)]


In [None]:
# Inspect the pivoted table for the first matched file.
get_final_stats_df(dat[0])


In [None]:
# Ad-hoc inspection of individual report files; only the value of the last
# expression in the cell is displayed.
get_final_stats_df(dat[4])
# Raw CSV of the fifth file, then the third component of its path.
pd.read_csv(dat[4]); dat[4].split(os.sep)[2]
# Raw CSV of the third file, then all components of its path.
pd.read_csv(dat[2]); dat[2].split(os.sep)

In [None]:
# Reshape the merged table and attach a 'sample' column.
# NOTE(review): at notebook level `dat_all` is only assigned inside
# commented-out code, and `format_col_names` exists only as a static method of
# FormatFinalCSV — confirm both names are defined before this cell runs.
df_datall = dat_all.transpose()
# Column labels are taken from the row labels of the untransposed table.
df_datall.columns = dat_all.index.tolist()
df_datall = format_col_names(df_datall)
sample_id = dat_all.index.tolist()
# df_datall.index = df_datall.index.droplevel(0)
# NOTE(review): assign() requires len(sample_id) to equal the number of rows of
# df_datall — verify this holds after the transpose above.
df_datall = df_datall.assign(sample = sample_id).reset_index(inplace=False)
# df_datall.to_csv('pccm_amaranth_hne_2023_11_30_dk.csv', sep=',')
# df_datall

In [None]:
# Display the reshaped table.
df_datall

In [None]:
# List the column labels of the reshaped table.
df_datall.columns.to_list()

In [None]:
# Normalise column labels: anything other than letters, digits or '%' becomes
# '_', then everything is lower-cased.
df_datall.columns = [re.sub('[^A-Za-z0-9%]', '_', col).lower() for col in df_datall.columns]

# Keep the sample id plus every column whose name mentions 'til'.
tils = [col for col in df_datall.columns if 'til' in col]
col_to_take = ['sample'] + tils
# .copy() makes tils_df an independent frame, so the later in-place edits of
# tils_df neither trigger SettingWithCopyWarning nor depend on df_datall.
tils_df = df_datall[col_to_take].copy()
tils_df

## Amaranth-PCCM Key

In [None]:
# Show what source_dir contains (used to locate the key file).
os.listdir(source_dir)

In [None]:
# Load the sample key from the second entry returned by os.listdir(source_dir).
# NOTE(review): os.listdir() returns entries in arbitrary order, so index 1 is
# not guaranteed to be the same file on every machine — consider naming the
# key file explicitly.
keyfile = pd.read_excel(os.path.join(source_dir, os.listdir(source_dir)[1]))
keyfile

In [None]:
# Sample id = the file name up to (not including) its first dot.
keyfile['sample'] = [
    re.split('\\.', filename, maxsplit=1)[0]
    for filename in keyfile['pccm_amaranth_hne_filename'].to_list()
]

In [None]:
# Display the derived sample ids.
keyfile['sample']

In [None]:
# Keep only the key columns used downstream and give the two long
# annotation columns shorter names; then show the resulting dtypes.
keyfile = (
    keyfile
    .filter(items=[
        'sample',
        'subtype',
        'lymphocytes_invasion_score_low_less_10_moderate_10_40_high_greater_40_an',
        'tils_score_percent',
        'grade_an',
    ])
    .rename(columns={
        'lymphocytes_invasion_score_low_less_10_moderate_10_40_high_greater_40_an': 'pathologist_tils_classification',
        'grade_an': 'pathologist_grade',
    })
)
keyfile.dtypes

In [None]:
# Translate the grades 1/2/3 to Roman numerals; any other value is left as is.
# Series.map is used instead of np.select because np.select has to find one
# common dtype for the string choices and the numeric default, which either
# stringifies the untouched values (NaN becomes the text 'nan') or raises,
# depending on the numpy version.
grade_labels = {1: 'I', 2: 'II', 3: 'III'}
grade = keyfile['pathologist_grade']
keyfile['pathologist_grade'] = grade.map(grade_labels).fillna(grade)

# Mark every remaining missing value in the key with the text 'NA'.
keyfile = keyfile.fillna('NA')

In [None]:
# Join the per-sample measurements with the sample key on 'sample', write the
# combined table to the working directory, and display it.
dat = df_datall.merge(keyfile, on='sample')
dat.to_csv('pccm_amaranth_summary_and_sample_key.csv', sep=',')
dat

In [None]:
# List the columns of the merged table.
dat.columns

In [None]:
# Make sure the area measurement is numeric before it is plotted.
dat['intra_tumoral_tils_area_in_mm_'] = dat['intra_tumoral_tils_area_in_mm_'].astype('float')

# 'pathologist_score' is the hue used by the plots below. The key file's
# lymphocyte-invasion column was renamed to 'pathologist_tils_classification'
# when the key was trimmed, so that is the column to read here; the original
# long column name no longer exists in `dat`.
dat['pathologist_score'] = dat['pathologist_tils_classification']
dat


In [None]:

# First ten TILs scores from the key, as a plain list.
# NOTE(review): this rebinds `tils`, which earlier held the list of TIL column names.
tils= dat.tils_score_percent.to_list()[0:10]
# Not displayed: only the last expression of a cell is shown.
tils
dat.columns

In [None]:
# Intra-tumoral TILs area (x) against the key file's TILs score (y).
g = sns.scatterplot(
    data=dat,
    x='intra_tumoral_tils_area_in_mm_',
    y='tils_score_percent',
    hue='pathologist_score',
)
# The label describes the x variable, so it belongs on the x axis (it was
# previously set on the y axis, mislabelling the TILs score).
g.set_xlabel('amaranth_intra_tumoral_tils_area (mm)')

In [None]:
# Cast the peri-tumoral TILs area to float, then plot it (x) against the key
# file's TILs score (y), coloured by pathologist score.
dat['peri_tumoral_tils_area_in_mm_'] = dat['peri_tumoral_tils_area_in_mm_'].astype('float')
g = sns.scatterplot(
    data=dat,
    x='peri_tumoral_tils_area_in_mm_',
    y='tils_score_percent',
    hue='pathologist_score',
)
g.set_xlabel('amaranth_peri_tumoral_tils_area (mm)')

In [None]:
# Peri-tumoral TILs area per pathologist score: bars with the individual
# samples overlaid as points.
dat['peri_tumoral_tils_area_in_mm_'] = dat['peri_tumoral_tils_area_in_mm_'].astype('float')

# Both layers must use the same category order, otherwise the points can land
# on a different category than the bar they belong to.
score_order = ['Low', 'Moderate', 'High']
g = sns.barplot(
    data=dat,
    y='peri_tumoral_tils_area_in_mm_',
    x='pathologist_score',
    dodge=True,
    order=score_order,
)
g = sns.stripplot(
    data=dat,
    y='peri_tumoral_tils_area_in_mm_',
    x='pathologist_score',
    hue='pathologist_score',
    order=score_order,
    hue_order=score_order,
)
# The area is on the y axis, so its label goes there (it was previously set on
# the x axis, which shows the pathologist score).
g.set_ylabel('amaranth_peri_tumoral_tils_area (mm)')

In [None]:
# Cast the stromal TILs area to float, then plot it (x) against the key file's
# TILs score (y), coloured by pathologist score.
dat['stromal_tils_area_in_mm_'] = dat['stromal_tils_area_in_mm_'].astype('float')
g = sns.scatterplot(
    data=dat,
    x='stromal_tils_area_in_mm_',
    y='tils_score_percent',
    hue='pathologist_score',
)
g.set_xlabel('amaranth_stromal_tumoral_tils_area (mm)')

In [None]:
# Cast the TIL density itself to float. The previous version assigned the
# stromal TILs *area* to 'til_per_mm2' (copy-paste slip), overwriting the
# density and plotting the wrong quantity.
dat['til_per_mm2'] = dat['til_per_mm2'].astype('float')
g = sns.scatterplot(
    data=dat,
    x='til_per_mm2',
    y='tils_score_percent',
    hue='pathologist_score',
)
g.set_xlabel('amaranth_til_per_mm2')

In [None]:
# Cast TILs per 1000 epithelial cells to float, then plot it (x) against the
# key file's TILs score (y), coloured by pathologist score.
dat['til_per_1000_epithelial_cells'] = dat['til_per_1000_epithelial_cells'].astype('float')
g = sns.scatterplot(
    data=dat,
    x='til_per_1000_epithelial_cells',
    y='tils_score_percent',
    hue='pathologist_score',
)
g.set_xlabel('amaranth_til_per_1000_epithelial_cells')

In [None]:
import numpy as np

# Same plot as for TILs per 1000 epithelial cells, on a log10 scale.
dat['log_til_per_1000_epithelial_cells'] = np.log10(dat['til_per_1000_epithelial_cells'])
g = sns.scatterplot(
    data=dat,
    x='log_til_per_1000_epithelial_cells',
    y='tils_score_percent',
    hue='pathologist_score',
)
g.set_xlabel('log_amaranth_til_per_1000_epithelial_cells')

In [None]:
# Strip the '%' sign from the intra-tumoral density strings, then store the
# value divided by 100 in a separate column for plotting.
dat['intra_tumoral_tils_tils_density'] = [
    re.sub('%', '', density)
    for density in dat['intra_tumoral_tils_tils_density'].to_list()
]
dat['intra_tumoral_tils_percent'] = dat['intra_tumoral_tils_tils_density'].astype('float') / 100
g = sns.scatterplot(
    data=dat,
    x='intra_tumoral_tils_percent',
    y='tils_score_percent',
    hue='pathologist_score',
)
g.set_xlabel('amaranth_intra_tumoral_tils_tils_density')

In [None]:
# Strip the '%' sign from the stromal density strings, then store the value
# divided by 100 in a separate column for plotting.
dat['stromal_tils_tils_density'] = [
    re.sub('%', '', density)
    for density in dat['stromal_tils_tils_density'].to_list()
]
dat['stromal_tumoral_tils_percent'] = dat['stromal_tils_tils_density'].astype('float') / 100
g = sns.scatterplot(
    data=dat,
    x='stromal_tumoral_tils_percent',
    y='tils_score_percent',
    hue='pathologist_score',
)
# The x axis shows the stromal density; the label previously said
# "intra_tumoral" (copied from the intra-tumoral plot).
g.set_xlabel('amaranth_stromal_tils_tils_density')

tils_df


In [None]:
# Treat missing TIL measurements as 0. Re-binding the name (instead of
# inplace=True) avoids modifying a frame that was sliced out of df_datall and
# the SettingWithCopyWarning that comes with it.
tils_df = tils_df.fillna(0)

In [None]:
# Convert every measurement column (all but the first, 'sample') to float.
for col in tils_df.columns[1:]:
    raw_values = tils_df[col].to_list()
    # Local names only: the loop previously reused `dat`, overwriting the merged
    # DataFrame of the same name.
    with_percent = [value for value in raw_values if '%' in str(value)]
    if with_percent:
        print(with_percent)
        # str() is required because the column can mix '12%' strings with plain
        # numbers (e.g. the 0 inserted for missing values); re.sub on a number
        # raises TypeError.
        tils_df[col] = [float(str(value).replace('%', '')) for value in raw_values]
        print(tils_df[col].to_list())
    else:
        tils_df[col] = tils_df[col].astype('float')
In [None]:
def _with_pathologist_columns(frame):
    """Join the sample key onto ``frame`` and expose the pathologist columns.

    The lymphocyte-invasion classification is made available under both
    'pathologist_tils_classification' and 'pathologist_score', because later
    cells use either name. Renaming the original long column name alone is not
    enough: the key has already been trimmed and renamed, so that rename
    silently does nothing and 'pathologist_score' is never created.
    """
    return (
        pd.merge(frame, keyfile, on='sample')
        .rename(columns={
            # No-op when the key has already been renamed.
            'lymphocytes_invasion_score_low_less_10_moderate_10_40_high_greater_40_an': 'pathologist_tils_classification',
            'tils_score_percent': 'pathologist_tils_score',
        })
        .assign(pathologist_score=lambda merged: merged['pathologist_tils_classification'])
    )


# Long format: one row per (sample, feature) with the value in 'measure'.
tils_long = tils_df.melt(value_name='measure', id_vars='sample', var_name='feature')
til = _with_pathologist_columns(tils_long).sort_values('feature')
# Wide format: one row per sample, one column per feature.
tils_dfkey = _with_pathologist_columns(tils_df)
til

In [None]:
# Bare reference: displays the function object only, nothing is plotted.
sns.boxplot

In [None]:
# Distribution of one measurement per pathologist classification, with the
# individual samples overlaid.
col = 'intra_tumoral_tils_area_in_mm_'
# The key's classification column is named 'pathologist_tils_classification'
# after the key was trimmed and renamed; 'pathologist_score' is not created by
# the rename in the merge cell, so the plots are keyed on the former.
sns.boxplot(data=tils_dfkey, x='pathologist_tils_classification', y=col)
sns.stripplot(data=tils_dfkey, x='pathologist_tils_classification', y=col, color='black')

In [None]:
# Bare reference: displays the function object only, nothing is plotted.
sns.swarmplot

In [None]:
# One box-plot panel per feature. 'measure' and 'feature' only exist in the
# long-format table `til` (the wide `tils_dfkey` has one column per feature),
# so that is the frame to plot. `ci` is dropped: it does not apply to box plots.
score_col = 'pathologist_tils_classification'
# One shared category order keeps boxes and points aligned in every panel.
score_order = til[score_col].drop_duplicates().tolist()
g = sns.catplot(
    data=til,
    x=score_col,
    y='measure',
    col='feature',
    col_wrap=2,
    hue=score_col,
    order=score_order,
    kind='box',
    sharey=False,
)
# Overlay the individual samples on every panel; a plain sns.stripplot call
# would draw all features onto the last panel only.
g.map_dataframe(sns.stripplot, x=score_col, y='measure', order=score_order, color='black')

In [None]:
# Display the long-format TIL table. `tils_longdf` is not defined anywhere in
# the notebook; `tils_long` is the long-format frame built from tils_df.
tils_long

In [None]:
# Distinct feature names in the long-format table, in order of first appearance.
features = til.feature.drop_duplicates().to_list()
features

In [None]:
# Print the measurements of every feature in turn.
features = til.feature.drop_duplicates().tolist()
for feature in features:
    print(feature)
    # Select the rows of the *current* feature; the loop previously always
    # filtered on features[0], printing the same rows under every heading.
    dfplot = til[til['feature'] == feature]
    print(dfplot.filter(['feature', 'measure']))
    

In [None]:
# One figure per feature: box plot per pathologist classification with the
# individual samples overlaid.
features = til.feature.drop_duplicates().tolist()
for feature in features:
    # Select the rows of the *current* feature; the loop previously always
    # filtered on features[0], so every figure showed the same data under a
    # different title.
    dfplot = til[til['feature'] == feature]
    sns.boxplot(data=dfplot, x='pathologist_tils_classification', y='measure')
    sns.stripplot(data=dfplot, x="pathologist_tils_classification", y="measure", hue='pathologist_tils_classification', color='black')
    plt.title(feature)
    plt.show()

In [None]:
# Display the TIL density column of the TIL table.
tils_df['til_per_mm2']