# scipy is a library for fundamental analysis algorithms
* [Statistics](https://scipy.github.io/devdocs/reference/stats.html#module-scipy.stats)
* [Distance calculation](https://docs.scipy.org/doc/scipy/reference/spatial.distance.html)
* [Contingency table](https://docs.scipy.org/doc/scipy/reference/stats.contingency.html)
* [Optimization](https://scipy.github.io/devdocs/tutorial/optimize.html)

## [statsmodels](https://www.statsmodels.org/stable/examples/index.html) provides more advanced analyses

In [None]:
# !pip install scipy statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats
import statsmodels

## Load gene expression and mutation data
And merge them together into a single DataFrame

In [None]:
# Both tables live in the same workbook; each sheet uses its first row as the
# header and its first column as the row index.
workbook = 'CRC_sample_data.xlsx'
exp_data, mut_data = (
    pd.read_excel(workbook, sheet_name=sheet, header=0, index_col=0)
    for sheet in ('expression', 'mutation')
)

# Put the mutation columns next to the expression columns (joined on the row index).
data = pd.concat([exp_data, mut_data], axis=1)
data.head()

## Check whether data is normally distributed
An option is [normaltest](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html) from scipy.stats

What does this p-value mean?

In [None]:
# Test the null hypothesis that the FAP values come from a normal distribution.
fap_normality = scipy.stats.normaltest(data['FAP'])
print(fap_normality)

### Visualize the best normal distribution fit for the data
**norm.fit** computes the maximum likelihood estimates of the mean & SD from the data

**norm.pdf** calculates the theoretical density

In [None]:
# Fit a normal distribution to FAP, then evaluate its density on 100 evenly
# spaced points spanning three standard deviations either side of the mean.
mean, std = scipy.stats.norm.fit(data['FAP'])
x = np.linspace(mean - 3 * std, mean + 3 * std, 100)
fitted_density = scipy.stats.norm.pdf(x, mean, std)

# Overlay the fitted curve on the density-normalised histogram of the data.
plt.figure()
plt.hist(data['FAP'], bins=10, density=True, label='Observed')
plt.plot(x, fitted_density, label='Fitted')
plt.xlabel('FAP')
plt.ylabel('Density')
plt.legend()
plt.show()

## Q-Q plot (drawn here on the percentile scale, i.e. a P-P plot)

In [None]:
mean, std = scipy.stats.norm.fit(data['FAP'])
n_samples = data.shape[0]

# Empirical percentile of the k-th smallest value is k / n.
sorted_fap = sorted(data['FAP'].to_numpy())
true_prc = [rank / n_samples for rank in range(1, n_samples + 1)]

# Percentile of each sorted value under the fitted normal distribution.
est_prc = scipy.stats.norm.cdf(sorted_fap, mean, std)

plt.figure(figsize=(5, 5))
plt.scatter(est_prc, true_prc)
plt.plot([0, 1], [0, 1], '--k')  # dashed diagonal: where the two percentiles agree
plt.xlabel('Estimated percentile')
plt.ylabel('True percentile')
plt.show()

In [None]:
# Show the empirical percentiles (rank / number of samples) used in the plot.
print(true_prc)

In [None]:
# Show the percentiles of the sorted FAP values under the fitted normal CDF.
print(est_prc)

## Compare AGR2 expression across CMS groups

In [None]:
# AGR2 expression split by CMS subtype (one Series per subtype).
cms_label = data['CMS']
agr2_expr = data['AGR2']

cms1_agr2 = agr2_expr[cms_label == 'CMS1']
cms2_agr2 = agr2_expr[cms_label == 'CMS2']
cms3_agr2 = agr2_expr[cms_label == 'CMS3']

In [None]:
# Horizontal box plots of AGR2, one box per CMS subtype.
agr2_by_cms = {'CMS1': cms1_agr2, 'CMS2': cms2_agr2, 'CMS3': cms3_agr2}

plt.figure()
plt.boxplot(list(agr2_by_cms.values()), labels=list(agr2_by_cms), vert=False)
plt.xlabel('AGR2')
plt.show()

### Use t-test or Mann-Whitney U test

In [None]:
# Two-sided comparison of AGR2 between CMS1 and CMS2: a t-test that does not
# assume equal variances, followed by the rank-based Mann-Whitney U test.
print('CMS1 vs CMS2')
for two_sample_test, extra_opts in ((scipy.stats.ttest_ind, {'equal_var': False}),
                                    (scipy.stats.mannwhitneyu, {})):
    print(two_sample_test(cms1_agr2, cms2_agr2, alternative='two-sided', **extra_opts))

In [None]:
# Two-sided comparison of AGR2 between CMS1 and CMS3: a t-test that does not
# assume equal variances, followed by the rank-based Mann-Whitney U test.
print('CMS1 vs CMS3')
for two_sample_test, extra_opts in ((scipy.stats.ttest_ind, {'equal_var': False}),
                                    (scipy.stats.mannwhitneyu, {})):
    print(two_sample_test(cms1_agr2, cms3_agr2, alternative='two-sided', **extra_opts))

In [None]:
# Two-sided comparison of AGR2 between CMS2 and CMS3: a t-test that does not
# assume equal variances, followed by the rank-based Mann-Whitney U test.
print('CMS2 vs CMS3')
for two_sample_test, extra_opts in ((scipy.stats.ttest_ind, {'equal_var': False}),
                                    (scipy.stats.mannwhitneyu, {})):
    print(two_sample_test(cms2_agr2, cms3_agr2, alternative='two-sided', **extra_opts))

## ANOVA
Can be performed with [f_oneway](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html)

In [None]:
# One-way ANOVA on AGR2 across the three CMS subtypes.
agr2_groups = [data.loc[data['CMS'] == subtype, 'AGR2']
               for subtype in ('CMS1', 'CMS2', 'CMS3')]
print(scipy.stats.f_oneway(*agr2_groups))

## Correlation scores
* [Pearson](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html)
* [Spearman](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html)
* [Kendall](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kendalltau.html)

In [None]:
# Plot REG4 against AGR2 with seaborn's lmplot. The returned object is bound
# to `_` because only the drawn figure is of interest here.
_ = sns.lmplot(data = data, x = 'AGR2', y = 'REG4')

In [None]:
# Report three correlation measures between AGR2 and REG4.
agr2_values, reg4_values = data['AGR2'], data['REG4']
for label, corr_func in (('Pearson:', scipy.stats.pearsonr),
                         ('Spearman:', scipy.stats.spearmanr),
                         ('Kendall:', scipy.stats.kendalltau)):
    print(label, corr_func(agr2_values, reg4_values))

In [None]:
# Plot FAP against SLC5A6 with seaborn's lmplot. The returned object is bound
# to `_` because only the drawn figure is of interest here.
_ = sns.lmplot(data = data, x = 'SLC5A6', y = 'FAP')

In [None]:
# Report three correlation measures between SLC5A6 and FAP.
slc5a6_values, fap_values = data['SLC5A6'], data['FAP']
for label, corr_func in (('Pearson:', scipy.stats.pearsonr),
                         ('Spearman:', scipy.stats.spearmanr),
                         ('Kendall:', scipy.stats.kendalltau)):
    print(label, corr_func(slc5a6_values, fap_values))

## Paired data
This is a toy dataset of the performance (AUC) of models A and B across 7 datasets

In [None]:
# Toy AUC table: one row per model, one column per dataset ("Dataset 1" .. "Dataset 7").
model_aucs = {
    'Model A': [0.701, 0.503, 0.991, 0.827, 0.623, 0.728, 0.596],
    'Model B': [0.691, 0.478, 0.905, 0.739, 0.589, 0.719, 0.508],
}
dataset_names = [f'Dataset {number}' for number in range(1, 8)]

auc_df = pd.DataFrame(list(model_aucs.values()), index=list(model_aucs), columns=dataset_names)
auc_df.head()

### Visualize with box and scatter plots

In [None]:
# Side-by-side box plots of the AUCs of the two models.
model_names = ['Model A', 'Model B']

plt.figure()
plt.boxplot([auc_df.loc[model, :] for model in model_names], labels=model_names)
plt.ylabel('AUC')
plt.show()

In [None]:
# One point per dataset: Model A's AUC on the x axis, Model B's on the y axis.
auc_a = auc_df.loc['Model A', :]
auc_b = auc_df.loc['Model B', :]

plt.figure(figsize=(4, 4))
plt.scatter(auc_a, auc_b)
plt.xlabel('Model A')
plt.ylabel('Model B')
plt.title('AUC')
plt.plot([0.5, 1], [0.5, 1], '--k')  # dashed diagonal: equal AUC for both models
plt.show()

### What happens if you switch paired and unpaired tests?

In [None]:
# Run the same two AUC rows through unpaired tests and their paired counterparts.
auc_a = auc_df.loc['Model A', :]
auc_b = auc_df.loc['Model B', :]

# Parametric pair: independent-samples vs related-samples t-test.
print('unpaired t-test:', scipy.stats.ttest_ind(auc_a, auc_b))
print('paired t-test:', scipy.stats.ttest_rel(auc_a, auc_b))

print('---------------------------------------')
# Rank-based pair: Mann-Whitney U vs Wilcoxon signed-rank.
print('Mann-Whitney U test:', scipy.stats.mannwhitneyu(auc_a, auc_b))
print('Wilcoxon signed rank test:', scipy.stats.wilcoxon(auc_a, auc_b))

## Test for association between CMS and KRAS mutation

In [None]:
# Contingency table: CMS subtype in the rows, KRAS value in the columns.
cms_vs_kras = pd.crosstab(data['CMS'], data['KRAS'])
cms_vs_kras.head()

### Use [fisher_exact](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html)
CMS2 vs non-CMS2

In [None]:
# Table of "is CMS2" (False/True) against KRAS, then Fisher's exact test
# with its default alternative.
is_cms2 = data['CMS'] == 'CMS2'
cms2_vs_kras = pd.crosstab(index=is_cms2, columns=data['KRAS'])
display(cms2_vs_kras)

cms2_fisher = scipy.stats.fisher_exact(cms2_vs_kras)
print("two-tailed Fisher's exact:", cms2_fisher)

CMS3 vs other CMS

In [None]:
# Table of "is CMS3" (False/True) against KRAS, then Fisher's exact test
# with its default alternative.
is_cms3 = data['CMS'] == 'CMS3'
cms3_vs_kras = pd.crosstab(index=is_cms3, columns=data['KRAS'])
display(cms3_vs_kras)

cms3_fisher = scipy.stats.fisher_exact(cms3_vs_kras)
print("two-tailed Fisher's exact:", cms3_fisher)

## Use looping to perform all CMS vs mutation tests

In [None]:
# Fisher's exact test for every (CMS subtype, gene) pair.
#
# The p-values are collected in a dict and turned into a DataFrame once,
# instead of growing an empty frame one row at a time with `.loc`. This
# guarantees a float 'p-value' column and avoids re-allocating the frame on
# every insertion.
mutation_genes = ['KRAS', 'BRAF', 'APC', 'TP53', 'PIK3CA', 'PTEN']

fisher_pvalues = {}
# Missing CMS labels are skipped: a NaN label cannot be concatenated into the
# "<subtype>:<gene>" test name.
for cms in data['CMS'].dropna().unique():
    in_subtype = data['CMS'] == cms  # same mask for every gene, so compute it once
    for gene in mutation_genes:
        test_name = cms + ':' + gene
        temp = pd.crosstab(index=in_subtype, columns=data[gene])
        # fisher_exact returns (statistic, p-value); keep only the p-value.
        fisher_pvalues[test_name] = scipy.stats.fisher_exact(temp)[1]

all_fishers = pd.DataFrame({'p-value': pd.Series(fisher_pvalues, dtype=float)})

# Most significant associations first.
all_fishers = all_fishers.sort_values('p-value')
all_fishers.head(10)

## Correction for multiple testing
Starting with Bonferroni. We are using a cutoff of 0.1 just to show the effect

In [None]:
# Significance threshold applied directly to the raw p-values.
cutoff = 0.1

# True where the uncorrected p-value is at or below the cutoff.
all_fishers['No correction'] = all_fishers['p-value'].le(cutoff)
all_fishers.head()

In [None]:
# Bonferroni: divide the cutoff by the number of tests (rows of the table).
n_tests = all_fishers.shape[0]
bonferroni_cutoff = cutoff / n_tests

all_fishers['Bonferroni'] = all_fishers['p-value'] <= bonferroni_cutoff
all_fishers.head()

## Benjamini-Hochberg and more
[multipletests](https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html)

In [None]:
from statsmodels.stats.multitest import multipletests

# Benjamini-Hochberg FDR correction of the Fisher p-values at the same cutoff.
fisher_pvals = all_fishers['p-value'].to_numpy()
benjamini_hochberg = multipletests(
    fisher_pvals,
    alpha=cutoff,
    method='fdr_bh',
)
print(benjamini_hochberg)

In [None]:
# Keep only the first two outputs of multipletests and lay them out as two
# labelled rows with one column per test.
reject_and_fdr = benjamini_hochberg[:2]
pd.DataFrame(reject_and_fdr, index=['Test result', 'FDR'], columns=all_fishers.index)

## Permutation test with numpy.random
1. Shuffle expression values for REG4
2. Recalculate correlation between AGR2 and shuffled REG4
3. Show the distribution of shuffled correlations compared to the actual value

In [None]:
np.random.seed(4649)  # fixed seed so the shuffles are reproducible
n_samples = data.shape[0]

# Null distribution: Pearson correlation of AGR2 with 1000 random re-orderings of REG4.
all_corrs = [
    scipy.stats.pearsonr(data['AGR2'],
                         data['REG4'].iloc[np.random.permutation(n_samples)])[0]
    for _ in range(1000)
]

# Correlation actually observed in the un-shuffled data.
obs_corr = scipy.stats.pearsonr(data['AGR2'], data['REG4'])[0]

plt.hist(all_corrs, bins=20, density=True)
plt.plot([obs_corr] * 2, [0, 2], c='tab:red')  # vertical marker at the observed value
plt.xlabel('Correlation between shuffled data')
plt.title('AGR2 vs REG4')
plt.show()

In [None]:
np.random.seed(4649)  # fixed seed so the shuffles are reproducible
n_samples = data.shape[0]

# Null distribution: Pearson correlation of SLC5A6 with 1000 random re-orderings of FAP.
all_corrs = [
    scipy.stats.pearsonr(data['SLC5A6'],
                         data['FAP'].iloc[np.random.permutation(n_samples)])[0]
    for _ in range(1000)
]

# Correlation actually observed in the un-shuffled data.
obs_corr = scipy.stats.pearsonr(data['SLC5A6'], data['FAP'])[0]

plt.hist(all_corrs, bins=20, density=True)
plt.plot([obs_corr] * 2, [0, 2], c='tab:red')  # vertical marker at the observed value
plt.xlabel('Correlation between shuffled data')
plt.title('SLC5A6 vs FAP')
plt.show()

## Bootstrapping with numpy.random.choice
Randomly select 40% of the patients (with or without repetition)

In [None]:
# Demo: draw 6 values from 0..9; replace = False means no value is drawn twice.
np.random.choice(range(10), size = 6, replace = False)

In [None]:
# Resample 40% of the patients 1000 times and compare the resulting REG4/AGR2
# correlations with the shuffled (null) correlations for the SAME gene pair.
np.random.seed(4649)
subset_size = int(data.shape[0] * 0.4)
reg4_agr2_corrs = []

for i in range(1000):
    # replace = False: each patient appears at most once per resample.
    bootstrap = data.loc[np.random.choice(data.index, size = subset_size, replace = False), :]
    reg4_agr2_corrs.append(scipy.stats.pearsonr(bootstrap['REG4'], bootstrap['AGR2'])[0])

# The 'Shuffled' histogram must be the AGR2-vs-REG4 null. The global `all_corrs`
# cannot be used for that: the last permutation cell overwrote it with the
# SLC5A6-vs-FAP shuffles. Recompute the matching null with a dedicated generator
# so the global random stream used above and `all_corrs` are both left untouched.
null_rng = np.random.RandomState(4649)
shuffled_reg4_agr2_corrs = [
    scipy.stats.pearsonr(data['AGR2'],
                         data['REG4'].iloc[null_rng.permutation(data.shape[0])])[0]
    for _ in range(1000)
]

plt.hist(shuffled_reg4_agr2_corrs, bins = 20, density = True, label = 'Shuffled', alpha = 0.5)
plt.hist(reg4_agr2_corrs, bins = 20, density = True, facecolor = 'tab:red', label = 'Bootstrap', alpha = 0.5)
plt.xlabel('Pearson correlation'); plt.title('AGR2 vs REG4'); plt.legend()
plt.show()

In [None]:
# Resample 40% of the patients 1000 times and compare the resulting FAP/SLC5A6
# correlations with the shuffled correlations held in `all_corrs`.
# NOTE(review): `all_corrs` is expected to hold the SLC5A6-vs-FAP shuffles
# produced by the permutation cell for that gene pair - confirm it ran last.
np.random.seed(4649)
subset_size = int(data.shape[0] * 0.4)

# Stored under a name that matches the gene pair. The previous name,
# `reg4_agr2_corrs`, was a copy-paste leftover and overwrote the AGR2/REG4 results.
fap_slc5a6_corrs = []

for i in range(1000):
    # replace = False: each patient appears at most once per resample.
    bootstrap = data.loc[np.random.choice(data.index, size = subset_size, replace = False), :]
    fap_slc5a6_corrs.append(scipy.stats.pearsonr(bootstrap['FAP'], bootstrap['SLC5A6'])[0])

plt.hist(all_corrs, bins = 20, density = True, label = 'Shuffled', alpha = 0.5)
plt.hist(fap_slc5a6_corrs, bins = 20, density = True, facecolor = 'tab:red', label = 'Bootstrap', alpha = 0.5)
plt.xlabel('Pearson correlation'); plt.title('FAP vs SLC5A6'); plt.legend()
plt.show()