# ANOVA - Analysis of variance

In [1]:
# Imports
from src.statgenex.entity import Project
from src.statgenex.expression import Anova
import pandas as pd

## Import the project

In [2]:
# Identify the project by its name and the root directory that contains it.
# NOTE(review): root_dir is an absolute local path - adjust it for your machine.
project_options = dict(
    name='LysOnc',
    root_dir='C:/WORK/PROJECTS/',
)

# Build the Project object, restore it, and print its summary
# (project name, project directory, and the groups of each dataset).
# NOTE(review): restore() presumably reloads previously saved project state - confirm in statgenex.
project = Project(**project_options)
project.restore()
project.print_summary()

Project LysOnc C:/WORK/PROJECTS/LysOnc/
Dataset TCGA-BRCA: NT (113), All-tumours (1113), Stage-I (179), Stage-II (619), Stage-III (244), Stage-IV (18), Luminal-A (547), Luminal-B (202), HER2-enriched (82), Basal-like (193), T1N0 (151), N0 (449), N1 (295), N2 (98), N3 (50), M1 (22), Claudin-low (33), Young_N_and_T (641), Old_N_and_T (566), SMYD2- (557), SMYD2+ (556), BCAR3- (557), BCAR3+ (556), Young (577), Old (520)


## Select the dataset and groups for ANOVA

In [3]:
# Dataset to analyse; must be one of the datasets listed in the project summary.
dataset_name = 'TCGA-BRCA'

# Groups of the dataset that are compared with each other in the ANOVA.
group_names = [
    'Luminal-A',
    'Luminal-B',
    'HER2-enriched',
    'Basal-like',
]

In [4]:
# List every group that is available in the selected dataset
all_available_groups = [*project.datasets[dataset_name].groups.keys()]
print(
    'All groups available in the dataset',
    dataset_name,
    all_available_groups,
)

All groups available in the dataset TCGA-BRCA ['NT', 'All-tumours', 'Stage-I', 'Stage-II', 'Stage-III', 'Stage-IV', 'Luminal-A', 'Luminal-B', 'HER2-enriched', 'Basal-like', 'T1N0', 'N0', 'N1', 'N2', 'N3', 'M1', 'Claudin-low', 'Young_N_and_T', 'Old_N_and_T', 'SMYD2-', 'SMYD2+', 'BCAR3-', 'BCAR3+', 'Young', 'Old']


## Define the list of genes

Provide a list of genes for which ANOVA should be performed.

In [5]:
# Option 1: read the gene list from an Excel file in the project data directory.
# The first sheet is loaded and its first column becomes the index;
# the index values are then used as the gene names.
gene_data = pd.read_excel(
    project.data_dir + 'Candidate KMTs.xlsx',
    sheet_name=0,
    index_col=0,
)
gene_names = gene_data.index.tolist()

In [6]:
# Option 2: define the gene list manually.
# NOTE: this assignment replaces any gene_names value set earlier
# (e.g. the list read from the Excel file).
gene_names = [
    'SMYD2',
    'BIRC3',
]

## Perform ANOVA

The analysis of variance is performed with the parametric ANOVA and the non-parametric Kruskal-Wallis (KW) statistical tests. The p-values of the ANOVA and KW tests are adjusted for the false discovery rate (FDR) using the Benjamini-Hochberg method.

In [7]:
# Collect the settings for the analysis: the project, the dataset and
# groups selected above, and the genes to test (passed as 'features').
# NOTE(review): generate_plots=True presumably also writes plots to the results directory - confirm.
anova_options = dict(
    project=project,
    dataset_name=dataset_name,
    group_names=group_names,
    features=gene_names,
    generate_plots=True,
)

# Create the analysis object, run it, and report where the results were generated.
anova = Anova(**anova_options)
print('Anova is processing. Please wait...')
anova.perform()
print('Anova is calculated.')
print(
    'The results of ANOVA are generated in',
    anova.results_dir,
)

Anova is processing. Please wait...
Anova is calculated.
The results of ANOVA are generated in C:/WORK/PROJECTS/LysOnc/results/2023.04.28_Anova/


In [8]:
# Preview the first rows of the ANOVA results table (rendered as the cell output).
anova.results.head()

Unnamed: 0_level_0,pval_anova,pval_kw,fdr_anova,fdr_kw,significant
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SMYD2,1.261037e-39,8.096529e-37,2.522073e-39,1.619306e-36,1.0
BIRC3,1.26068e-14,1.135086e-11,1.26068e-14,1.135086e-11,1.0
