In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pingouin as pg
from scipy.stats import ttest_ind

In [10]:

def teste_normalidade(x):
    """Run a visual and numerical normality check on a 1-D numeric sample.

    Draws a histogram with a KDE overlay next to a normal QQ-plot on a new
    8x4-inch figure, then prints the sample size, skewness, excess kurtosis
    and the results of the Shapiro-Wilk, Kolmogorov-Smirnov and
    Anderson-Darling tests against the normal distribution.

    Parameters
    ----------
    x : array-like
        Numeric observations (pandas Series, NumPy array or list). Missing
        values are not removed here; drop them before calling.

    Returns
    -------
    None
        Results are printed and plotted, not returned.

    Raises
    ------
    ValueError
        If ``x`` holds fewer than 3 observations (the minimum the
        Shapiro-Wilk test accepts).
    """
    # Work on a flat float array so .mean()/.std() behave the same for
    # Series, ndarray and list inputs.
    values = np.asarray(x, dtype=float).ravel()
    n_obs = values.size
    if n_obs < 3:
        raise ValueError(
            'teste_normalidade needs at least 3 observations, got {0}'.format(n_obs)
        )

    # Create a dedicated figure with the intended size. Setting
    # rcParams["figure.figsize"] after the axes exist has no effect on the
    # current figure and silently changes every later figure instead.
    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(8, 4))

    # Histogram with KDE line (x is passed as-is so a Series name still
    # labels the axis).
    sns.histplot(x, kde=True, ax=ax_hist)
    for side in ('right', 'top', 'left'):
        ax_hist.spines[side].set_visible(False)
    ax_hist.spines['bottom'].set_linewidth(0.5)
    ax_hist.spines['bottom'].set_color('gray')
    ax_hist.tick_params(axis='x', width=0.5, color='gray')
    ax_hist.tick_params(axis='y', width=0.5, color='gray')

    # QQ-plot against the normal distribution, drawn on the second panel.
    pg.qqplot(values, dist='norm', ax=ax_qq)
    fig.tight_layout()

    # Skewness and (excess) kurtosis
    print('Sample size:', n_obs)
    print('Skewness:', stats.skew(values), '(The farther from zero, the less normal)')
    print('Kurtosis:', stats.kurtosis(values))

    # Shapiro-Wilk test (may not work well with many identical values)
    W, p = stats.shapiro(values)
    print('Shapiro-Wilk: W={0}, p={1}'.format(W, p), '(if p > 0.05, it is normal)')

    # Kolmogorov-Smirnov test against a normal with the sample's own mean and
    # sample standard deviation (ddof=1).
    # NOTE: because the parameters are estimated from the same data, this
    # p-value is too lenient (the Lilliefors correction addresses this);
    # read it together with the other tests.
    D, p = stats.kstest(values, 'norm', args=(values.mean(), values.std(ddof=1)))
    print('Kolmogorov-Smirnov: D={0}, p={1}'.format(D, p), '(if p > 0.05, it is normal)')

    # Anderson-Darling test. SciPy returns the statistic plus critical values
    # at fixed significance levels (in percent) - not a p-value. Normality is
    # rejected at a level when the statistic exceeds that level's critical value.
    ad_result = stats.anderson(values, dist='norm')
    ad_stat = ad_result.statistic
    ad_critical = ad_result.critical_values
    ad_levels = ad_result.significance_level
    # Look the 5% level up instead of hard-coding its position.
    idx_5pct = int(np.argmin(np.abs(np.asarray(ad_levels, dtype=float) - 5.0)))
    print('Anderson-Darling: T={0}, critical value (5%)={1}'.format(ad_stat, ad_critical[idx_5pct]),
          '(if T < critical value, it is normal)')
    print('Anderson-Darling critical values:', ad_critical)
    print('Anderson-Darling percentages:', ad_levels)

In [11]:
# Show the full content of each cell (up to 1000 characters) and every column
# when a DataFrame is displayed.
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = None


In [12]:
# Load the three tab-separated result tables (read_table splits on tabs by default).
df_ku80 = pd.read_table('data/jacob_results ku80.tsv')
df_dnapk_nocluster = pd.read_table('data/jacob_results dnapk no cluster.tsv')
df_dnapk_cluster = pd.read_table('data/jacob_results dnapk cluster.tsv')

### Image A = channel C3 = FAK
### Image B = channel C1 = gH2AX

In [30]:
# Preview the first five rows of the Ku80 table to check the column layout.
df_ku80.head(n=5)

Unnamed: 0,Unnamed: 1,Image A,Image B,ROI,Auto Threshold A,Auto Threshold B,Using Stack Histogram,Pearson's Coefficient,M1,M2,Threshold A,Threshold B,Thresholded M1,Thresholded M2,Random Pearson Costes 2D,Random Pearson Costes 2D pValueCorrelated,Random Pearson Costes 2D pValueAntiCorrelated,Area tot,Area A,Area B,Area Overlap,Timepoint
0,377,C3-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_0(11).tif T1,C1-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_0(11).tif T1,nucleus,Otsu,Otsu,True,0.114,0.893,0.981,2122,9804,0.319,0.229,18.514,1.0,2.09e-151,415.083,71.189,104.526,23.37,1
1,378,C3-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_0(11).tif T1,C1-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_0(11).tif T1,combine,Otsu,Otsu,True,0.043,1.0,0.985,2217,19019,0.261,0.228,1.728,0.993,0.007,103.95,21.616,25.273,5.727,1
2,379,C3-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_0(11).tif T1,C1-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_0(11).tif T1,nucleus without clusters,Otsu,Otsu,True,0.077,0.848,0.974,2122,4000,0.423,0.179,7.261,1.0,4.87e-25,311.133,47.901,118.676,21.046,1
3,380,C3-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_1(11).tif T1,C1-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_1(11).tif T1,nucleus,Otsu,Otsu,True,0.207,0.909,0.986,2141,10239,0.335,0.358,32.866,1.0,0.0,383.57,91.84,87.864,30.367,1
4,381,C3-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_1(11).tif T1,C1-doxo12h-h2ax-647-ku80-564-fak-488-dapi-01_Out_Channel Alignment.czi_nucleo_1(11).tif T1,combine,Otsu,Otsu,True,0.084,1.0,0.992,2470,20061,0.272,0.32,4.187,1.0,1.59e-09,87.327,24.073,20.784,6.496,1


In [16]:
# Stack the three result tables into a single frame. Each table was read with
# its own 0..n-1 index, so ignore_index=True is needed to give the combined
# frame unique row labels; otherwise label-based selection (.loc) would match
# rows from several source files at once.
df_total = pd.concat(
    [df_ku80, df_dnapk_nocluster, df_dnapk_cluster],
    ignore_index=True,
)

In [19]:
# Per-ROI summary statistics of the overlap area.
df_total['Area Overlap'].groupby(df_total['ROI']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ROI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
combine,92.0,1.668391,2.404199,0.002,0.2255,0.6495,2.11075,11.336
nucleus,92.0,7.782804,10.051269,0.006,1.47475,3.35,9.4835,47.843
nucleus without clusters,92.0,8.819761,8.380041,1.166,3.683,5.547,10.3,42.315


In [20]:
# Overlap area expressed as a fraction of 'Area tot'.
df_total['overlap ratio'] = df_total['Area Overlap'].div(df_total['Area tot'])

In [22]:
# Per-ROI summary statistics of the overlap / total-area ratio.
df_total['overlap ratio'].groupby(df_total['ROI']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ROI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
combine,92.0,0.056335,0.030562,0.006431,0.031164,0.054874,0.078004,0.125188
nucleus,92.0,0.024503,0.02537,2.7e-05,0.006304,0.01507,0.031193,0.100482
nucleus without clusters,92.0,0.033507,0.027177,0.004442,0.014496,0.024629,0.04391,0.121894


In [23]:
# Per-ROI summary statistics of Pearson's correlation coefficient.
df_total["Pearson's Coefficient"].groupby(df_total['ROI']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ROI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
combine,92.0,0.103539,0.122259,-0.248,0.02775,0.084,0.1865,0.462
nucleus,92.0,0.184641,0.121725,0.014,0.088,0.151,0.26075,0.457
nucleus without clusters,92.0,0.134402,0.073381,0.005,0.07425,0.129,0.1965,0.265


In [28]:
# Overlap area expressed as a fraction of 'Area A' (image A, noted as FAK in this notebook).
df_total['overlap ratio FAK'] = df_total['Area Overlap'].div(df_total['Area A'])

In [29]:
# Per-ROI summary statistics of the overlap / Area A ratio.
df_total["overlap ratio FAK"].groupby(df_total['ROI']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ROI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
combine,92.0,0.254407,0.096812,0.051546,0.186948,0.261282,0.335184,0.479167
nucleus,92.0,0.16008,0.119918,0.000202,0.061976,0.126681,0.248956,0.460022
nucleus without clusters,92.0,0.237568,0.116462,0.051692,0.155705,0.21904,0.29615,0.556739


# Image A = channel C3 = FAK
# Image B = channel C1 = gH2AX

# M1 = CH-Both / auto-thresholded CH-FAK
# M2 = CH-Both / auto-thresholded CH-gH2AX


In [31]:
# Per-ROI summary statistics of the thresholded M1 coefficient.
df_total["Thresholded M1"].groupby(df_total['ROI']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ROI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
combine,92.0,0.256511,0.104988,0.038,0.1855,0.2595,0.34125,0.529
nucleus,92.0,0.168545,0.128066,0.000172,0.06325,0.128,0.26425,0.472
nucleus without clusters,92.0,0.244935,0.118107,0.054,0.15475,0.2265,0.3105,0.551


In [32]:
# Per-ROI summary statistics of the thresholded M2 coefficient.
df_total["Thresholded M2"].groupby(df_total['ROI']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ROI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
combine,92.0,0.270576,0.128371,0.03,0.161,0.256,0.36675,0.6
nucleus,92.0,0.314228,0.150394,0.058,0.18175,0.282,0.4285,0.699
nucleus without clusters,92.0,0.217196,0.07949,0.047,0.155,0.216,0.28725,0.376


In [None]:
# https://www.youtube.com/watch?v=cOrCz4qc8DI&t=1847s