# Statistical Tests Frequency

In [2]:
import numpy as numpy
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact

## Chi2 Test
- checks whether the amino acid composition of epitope residues differs significantly across different immune checkpoints (CTLA4, PD1,...)

- The contingency table contains amino acids as columns and checkpoints as rows.
- The null hypothesis: residue distributions are the same across all proteins (no association).
- A significant p-value suggests that some proteins have distinct amino acid patterns in their epitopes.

- The standardized residuals show which specific residues are over- or under-represented in each protein (values ≈ ±2 are notable).
This helps identify functionally important residues, potential mutagenesis targets, or protein-specific signatures.

In [None]:
var = 'epitope'
# Dictionary with file paths for each checkpoint
files = {
    'CTLA4': f'data/ctla4/ctla4_{var}_residues.csv',
    'KIR': f'data/kir/kir_{var}_residues.csv',
    'LAG3': f'data/lag3/lag3_{var}_residues.csv',
    'PD1': f'data/pd1/pd1_{var}_residues.csv',
    'PDL1': f'data/pdl1/pdl1_{var}_residues.csv',
    'TIM3': f'data/tim3/tim3_{var}_residues.csv',
}

# Create frequency tables for each checkpoint
dfs = []
for checkpoint, path in files.items():
    df = pd.read_csv(path)
    counts = df['residue_name'].value_counts()  # Count each amino acid
    counts.name = checkpoint                    # Label with checkpoint name
    dfs.append(counts)

# Create contingency table (rows = checkpoints, columns = residues)
contingency_table = pd.DataFrame(dfs).fillna(0).astype(int)

print("Contingency table:\n", contingency_table)

# Perform Chi2-Test
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi2", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)

# Calculate expected frequencies as a DataFrame
expected_df = pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns)
# Compute standardized residuals: (observed - expected) / sqrt(expected)
residuals = (contingency_table - expected_df) / (expected_df**0.5)

print("\nStandardized residuals:\n", residuals)

Contingency table:
 residue_name  TYR  PRO  GLU  LEU  MET  THR  ILE  LYS  VAL  GLN  ...  ARG  SER  \
CTLA4          35   32   31   29   22   18   18   13   11   11  ...   10    8   
KIR             2    1    0    4    0    1    2    2    0    0  ...    1    2   
LAG3            1    4    4    1    0    0    0    0    2    5  ...    4    7   
PD1             8   37   26   21    2   13   30   31   15   27  ...   16   43   
PDL1           22    0   16    5   15    3   11    8   14    7  ...   15    6   
TIM3            0    3    2    0    1    0    3    2    2    1  ...    2    0   

residue_name  ALA  HIS  ASN  PHE  ASP  CYS  TRP  NAG  
CTLA4           7    6    3    2    2    1    0    0  
KIR             0    3    0    2    2    0    0    0  
LAG3            3    0    0    2    0    0    0    0  
PD1            36    0   17   15   21    0    3    1  
PDL1           21    7    6    3   22    0    1    0  
TIM3            1    0    2    1    1    2    0    0  

[6 rows x 21 columns]

Chi

### Chi2 Test Results
- Chi2 = 340.86, p = 5e-28 -> Highly significant difference.
- Amino acid patterns are not random — some proteins clearly prefer certain residues.

- Overrepresented residues (standardized residual ≥ +2):
    - CTLA4: TYR, MET, LEU
    - PD1: SER, ALA, ASN
    - KIR: HIS
    - PDL1: ASP
    - TIM3: CYS (very strong, +6.55)
- Underrepresented residues (residual ≤ –2):
    - PDL1: PRO
    - CTLA4: ALA, ASP
    - PD1: TYR, MET

## Fisher Exact Test

- Tests whether two categorical variables in a 2x2 contingency table are independent of each other
- Determines whether there is a statistically significant association between them
- Answers the question: Does amino acid X occur significantly more often in the paratope than in the epitope, or is the difference purely due to chance?

In [None]:
var = 'PDL1'
epitope_path = f"data/{var}/{var}_epitope_residues.csv"
paratope_path = f"data/{var}/{var}_paratope_residues.csv"

# load csvs
df_epi = pd.read_csv(epitope_path)
df_para = pd.read_csv(paratope_path)

# Only select aminoacid-information which is residue_name
residue_col = "residue_name"

aa_list = sorted(set(df_epi[residue_col]) | set(df_para[residue_col]))

# save results
results = []

for aa in aa_list:
    # Count aminoacids
    epi_aa = (df_epi[residue_col] == aa).sum()
    epi_not_aa = (df_epi[residue_col] != aa).sum()
    para_aa = (df_para[residue_col] == aa).sum()
    para_not_aa = (df_para[residue_col] != aa).sum()

    # 2x2-table
    table = [[para_aa, para_not_aa],
             [epi_aa, epi_not_aa]]

    # Fisher-Test
    _, p = fisher_exact(table)

    results.append({
        "amino_acid": aa,
        "paratope_count": para_aa,
        "epitope_count": epi_aa,
        "p_value": p
    })
    
results_df = pd.DataFrame(results)

# sort from lowest to highest p-value (significance)
results_df = results_df.sort_values("p_value")

print(results_df)

   amino_acid  paratope_count  epitope_count   p_value
14        SER              34              6  0.000001
11        MET               0             15  0.000048
17        TRP              15              1  0.000163
13        PRO               8              0  0.003042
0         ALA               6             21  0.004588
3         ASP               8             22  0.012849
4         GLN               0              7  0.014910
16        THR              12              3  0.016861
19        VAL               4             14  0.027700
5         GLU               6             16  0.046899
8         ILE               4             11  0.112399
12        PHE               8              3  0.132959
18        TYR              30             22  0.182333
6         GLY              17             11  0.239557
9         LEU               8              5  0.407307
1         ARG              10             15  0.411093
15        SO4               1              0  0.489418
7         

### Fisher Test Results

- CTLA4:
- SER, TRP, GLY, TYR → häufiger im Paratope → wichtig für Antikörperbindung.
- PRO, MET, GLU → häufiger im Epitope → typisch für das Antigen.
- to-do!!