# add p‑values to correlation results for selected smoothing windows

this notebook:

1. reads one **`all_windows_summary.txt`** file (contains the rₑ and rₗ columns).  
2. reads one or more per‑window **`summary_window*.txt`** files (those list every phrase label → lets us infer *n*).  
3. for each requested window it computes two‑tailed *p*-values for the entropy (**rₑ**) and duration (**rₗ**) correlations using the exact sample size *n* extracted from the window file (Student's *t* test with *n* − 2 degrees of freedom, $t = |r|\sqrt{(n-2)/(1-r^2)}$).  
4. spits out a tidy dataframe and a `p_values_by_window.csv` for easy copy‑pasting into figure legends or SI.

> **assumptions**  
> * the window id is the integer in the file‑name (`summary_window200.txt` → 200).  
> * each line of the form `Label <int>:` (non‑negative integer; `Label -1` rows are skipped) inside the window file corresponds to one phrase label ⇒ sample size *n* is the total number of such lines across all birds.


In [17]:
import re, math, pathlib, pandas as pd
from typing import Dict

def r_to_p(r: float, n: int) -> float:
    """
    convert a Pearson correlation coefficient to a two-tailed p-value

    the p-value comes from Student's t distribution with ``n - 2`` degrees of
    freedom, using the statistic ``|r| * sqrt((n - 2) / (1 - r**2))``.

    parameters
    ----------
    r : float
        Pearson correlation coefficient.
    n : int
        sample size used to compute r (must be > 2 for the test to be defined).

    returns
    -------
    float
        two-tailed p-value (0-1). returns ``nan`` when n ≤ 2, because the test
        has no degrees of freedom left and is undefined; returns 0.0 when
        |r| ≥ 1 (and n > 2).
    """
    if n <= 2:
        # a 0.0 here would present an undefined test as maximally significant
        return math.nan
    if abs(r) >= 1.0:
        # perfect correlation: the t statistic diverges, so the tail area is 0
        return 0.0

    # (1 - r)(1 + r) is algebraically 1 - r**2 but loses less precision near |r| = 1
    t_stat = abs(r) * math.sqrt((n - 2) / ((1.0 - r) * (1.0 + r)))

    # imported here so the edge cases above work even without scipy installed
    from scipy.stats import t as t_dist

    # sf is the upper-tail probability; doubling gives the two-tailed value
    return float(2.0 * t_dist.sf(t_stat, df=n - 2))


def parse_all_windows_summary(path: pathlib.Path) -> Dict[int, Dict[str, float]]:
    """
    parse `all_windows_summary.txt` and collect r_e / r_l per window

    a data row has the shape ``<window int> | <r_e> | <r_l> | ...``; every
    line that does not fit (headers, rulers, blank lines) is skipped. the two
    numeric fields may be plain decimals (``0.658``, ``.5``, ``1.``) or use
    scientific notation (``-1.5e-3``). when a window occurs more than once,
    the last row wins.

    parameters
    ----------
    path : pathlib.Path
        location of the combined summary file.

    returns
    -------
    dict[int, dict[str,float]]
        {window_size: {'r_e': <float>, 'r_l': <float>}}
    """
    # signed decimal with an optional exponent, so tiny correlations written
    # as e.g. 1.2e-05 are not silently dropped from the table
    num = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    # match:  optional leading spaces → window int → pipe → r_e → pipe → r_l → pipe
    row_re = re.compile(rf'^\s*(\d+)\s*\|\s*({num})\s*\|\s*({num})\s*\|')
    table: Dict[int, Dict[str, float]] = {}

    for line in path.read_text().splitlines():
        m = row_re.match(line)
        if m is None:
            continue
        window, r_e, r_l = m.groups()
        table[int(window)] = {'r_e': float(r_e), 'r_l': float(r_l)}
    return table


def sample_size_from_window(path: pathlib.Path) -> int:
    """
    count the phrase-label rows in one `summary_window*.txt` file

    a row counts when, after optional leading whitespace, it starts with
    'Label <digits>:'. a negative id such as 'Label -1:' has no digit directly
    after the whitespace, so those rows are left out of the count.

    parameters
    ----------
    path : pathlib.Path
        the per-window summary file to scan.

    returns
    -------
    int
        how many label rows the file contains; used as sample size n
    """
    label_row = re.compile(r'^\s*Label\s+\d+:')
    count = 0
    for row in path.read_text().splitlines():
        if label_row.match(row) is not None:
            count += 1
    return count


In [21]:
# ==== user inputs ====
# combined summary file; parsed for the per-window r_e / r_l values
all_windows_summary_path = '/media/george-vengrovski/Desk SSD/TweetyBERT/proxy_metrics/all_windows_summary.txt'
# per-window summary files; the window id is read from each file name
# (`summary_window<id>`) and n is inferred from the file's `Label <int>:` lines
summary_window_files = [
    '/media/george-vengrovski/Desk SSD/TweetyBERT/proxy_metrics/window_50/summary_window50.txt',
    '/media/george-vengrovski/Desk SSD/TweetyBERT/proxy_metrics/window_150/summary_window150.txt',
    '/media/george-vengrovski/Desk SSD/TweetyBERT/proxy_metrics/window_200/summary_window200.txt'
    ]
# =====================

In [22]:
# parse the combined summary once to fetch the r-values of every window
aws_path = pathlib.Path(all_windows_summary_path).expanduser()
r_table = parse_all_windows_summary(aws_path)

# fixed column order; also lets an empty file list yield an empty, sortable frame
columns = ['window', 'n', 'r_entropy', 'p_entropy', 'r_length', 'p_length']

records = []
for sw in summary_window_files:
    sw_path = pathlib.Path(sw).expanduser()

    # the window id is the integer embedded in the file name
    id_match = re.search(r'summary_window(\d+)', sw_path.name)
    if id_match is None:
        # fail with a readable message instead of AttributeError on None
        raise ValueError(f'cannot infer window id from file name: {sw_path.name}')
    window_id = int(id_match.group(1))
    n = sample_size_from_window(sw_path)

    if window_id not in r_table:
        raise ValueError(f'window {window_id} not found in all_windows_summary.txt')
    r_e = r_table[window_id]['r_e']
    r_l = r_table[window_id]['r_l']

    # both correlations of a window share the same sample size n
    records.append({
        'window': window_id,
        'n': n,
        'r_entropy': r_e,
        'p_entropy': r_to_p(r_e, n),
        'r_length': r_l,
        'p_length': r_to_p(r_l, n),
    })

df = pd.DataFrame(records, columns=columns).sort_values('window')
df

Unnamed: 0,window,n,r_entropy,p_entropy,r_length,p_length
0,50,57,0.658,2.669144e-08,0.904,5.959747e-22
1,150,57,0.745,3.026518e-11,0.947,8.466665e-29
2,200,56,0.771,3.622371e-12,0.957,1.068345e-30


In [23]:
# destination of the results table (absolute, machine-specific path)
# NOTE(review): the parent directory presumably has to exist already — confirm
out_csv = '/home/george-vengrovski/Documents/projects/tweety_bert_paper/results/proxy_metrics/p_values_by_window.csv'
# index=False keeps the dataframe's row index out of the csv
df.to_csv(out_csv, index=False)
# echo the destination as a confirmation
print('saved', out_csv)

saved /home/george-vengrovski/Documents/projects/tweety_bert_paper/results/proxy_metrics/p_values_by_window.csv
