In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from aging.plotting import format_plots, PlotConfig, save_factory, figure, legend, format_pizza_plots, COLORMAPS
from collections import Counter
from matplotlib.lines import Line2D
from aging.organization.dataframes import load_male_long_df, load_female_long_df, DF_PATHS
from tqdm import tqdm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.tools import pinv_extended  
from statsmodels.stats.anova import anova_lm
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from tabulate import tabulate

In [2]:
# Call the formatting helper imported from aging.plotting with its defaults
# (presumably sets notebook-wide figure styling — confirm in aging.plotting).
format_plots()

In [3]:
# prepare long_v2 data

In [4]:
# Per-sex colormaps from the project's COLORMAPS registry.
cmm, cmf = COLORMAPS.ont_male, COLORMAPS.ont_female

# Load each sex's usage table and average it within every (age, mouse) pair.
m_df = (
    load_male_long_df(
        average_weeks=True,
        merge_size=False,
        merge_ages=True,
        df_path=DF_PATHS.usage_male,
    )
    .groupby(['age', 'mouse'])
    .mean()
)
f_df = (
    load_female_long_df(
        average_weeks=True,
        merge_size=False,
        filter_female=True,
        merge_ages=True,
        df_path=DF_PATHS.usage_female,
    )
    .groupby(['age', 'mouse'])
    .mean()
)

# Label each table with its sex and append that label to the row index,
# giving an (age, mouse, sex) index on both tables.
m_df = m_df.assign(sex='m').set_index('sex', append=True)
f_df = f_df.assign(sex='f').set_index('sex', append=True)

# Stack male and female rows into a single DataFrame.
combined_df = pd.concat([m_df, f_df])

In [5]:
## joint data
# Independent working copies, so the analyses below never touch the loaded tables.
m_df_long, f_df_long = m_df.copy(), f_df.copy()
data_long = combined_df.copy()

## change in syllable usage over age, per sex - mixed-effects linear model

In [6]:
def fit_1mixed_linear_models(data, dependent_var, factor1, group):
    """Fit a mixed linear model of one column against a single fixed effect.

    The dependent column is duplicated under the fixed name
    ``ranked_response`` (the values are copied as-is; no ranking is applied)
    so the formula can refer to it whatever the original column label is.
    The model ``ranked_response ~ {factor1}`` is then fit with
    ``smf.mixedlm``, grouping rows by the ``group`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the dependent column, the factor and the group column.
        It is not modified; a copy is used.
    dependent_var : hashable
        Label of the column to model.
    factor1 : str
        Right-hand side term inserted into the formula.
    group : str
        Name of the column passed to ``mixedlm`` as ``groups``.

    Returns
    -------
    The fitted result object returned by ``model.fit()``.
    """
    # `assign` works on a copy, so the caller's frame is left untouched.
    frame = data.assign(ranked_response=data[dependent_var])
    rhs = f'{factor1}'

    mixed_model = smf.mixedlm(
        'ranked_response ~ ' + rhs,
        frame,
        groups=str(group),
    )
    return mixed_model.fit()


def fit_2mixed_linear_models(data, dependent_var, factor1,factor2, group):
    """Fit a mixed linear model with two interacting fixed effects.

    The dependent column is duplicated under the fixed name
    ``ranked_response`` (the values are copied as-is; no ranking is applied).
    The model ``ranked_response ~ {factor1}*C({factor2})`` is then fit with
    ``smf.mixedlm``: both main effects plus their interaction, with
    ``factor2`` treated as categorical and rows grouped by ``group``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the dependent column, both factors and the group
        column. It is not modified; a copy is used.
    dependent_var : hashable
        Label of the column to model.
    factor1 : str
        First right-hand side term of the formula.
    factor2 : str
        Second term, wrapped in ``C(...)`` in the formula.
    group : str
        Name of the column passed to ``mixedlm`` as ``groups``.

    Returns
    -------
    The fitted result object returned by ``model.fit()``.
    """
    # `assign` works on a copy, so the caller's frame is left untouched.
    frame = data.assign(ranked_response=data[dependent_var])
    rhs = f'{factor1}*C({factor2})'

    mixed_model = smf.mixedlm(
        'ranked_response ~ ' + rhs,
        frame,
        groups=str(group),
    )
    return mixed_model.fit()

def extract_summary_to_dataframe(results, num_tests):
    """Flatten fitted model results into one row per (variable, parameter).

    Parameters
    ----------
    results : dict
        Maps a dependent-variable label to a fitted result exposing
        ``params`` and ``pvalues`` (Series indexed by parameter name) and
        ``conf_int()`` (DataFrame indexed by parameter name with the lower
        bound in column ``0`` and the upper bound in column ``1``).
    num_tests : int
        Bonferroni factor; every raw p-value is multiplied by it and the
        product is capped at 1.0.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Dependent Variable', 'Parameter', 'Coefficient',
        'P-value', 'Corrected P-value', 'CI Lower', 'CI Upper'.
        The columns are present even when ``results`` is empty, so anything
        written from this table keeps a stable header.
    """
    columns = [
        'Dependent Variable',
        'Parameter',
        'Coefficient',
        'P-value',
        'Corrected P-value',
        'CI Lower',
        'CI Upper',
    ]
    rows = []

    for var, result in results.items():
        # Only these three accessors are needed; the formatted summary table
        # is not built because nothing here reads it.
        coefs = result.params
        pvalues = result.pvalues
        conf_int = result.conf_int()

        for param in coefs.index:
            pvalue = pvalues[param]
            rows.append({
                'Dependent Variable': var,
                'Parameter': param,
                'Coefficient': coefs[param],
                'P-value': pvalue,
                # Bonferroni correction, capped at 1.0 (a NaN p-value stays NaN).
                'Corrected P-value': min(pvalue * num_tests, 1.0),
                'CI Lower': conf_int.loc[param, 0],
                'CI Upper': conf_int.loc[param, 1],
            })

    return pd.DataFrame(rows, columns=columns)

In [7]:
# males: test every column of the male table for a change with age
data = m_df_long
a = 0.05                 # significance threshold applied to corrected p-values
n = len(data.columns)    # Bonferroni factor: one test per column of `data`
s, p = [], []            # columns passing the threshold / their corrected p-values
results = {}

# Turn the index levels into ordinary columns so the model can use them by name.
temp = data.reset_index()
if 'level_0' in temp.columns:
    temp = temp.drop(columns=['level_0'])

syll = data.columns
for i in syll:
    result = fit_1mixed_linear_models(temp, i, 'age', 'mouse')
    results[i] = result
    cp = result.pvalues['age'] * n   # Bonferroni-scaled p-value (not capped at 1)
    if cp < a:
        s.append(i)
        p.append(cp)
print('number of syllables changing over time for males:', len(s))

# Per-parameter summary table with Bonferroni-corrected p-values;
# the number of tests equals the number of dependent variables.
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Save the summary table as CSV.
summary_df.to_csv('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/stats/mxlm_male_longv2_syllable_over_time.csv', index=False)





number of syllables changing over time for males: 50


In [8]:
## females: test every column of the female table for a change with age
data = f_df_long
a = 0.05                 # significance threshold applied to corrected p-values
n = len(data.columns)    # Bonferroni factor: one test per column of `data`
s, p = [], []            # columns passing the threshold / their corrected p-values
results = {}

# Turn the index levels into ordinary columns so the model can use them by name.
temp = data.reset_index()
if 'level_0' in temp.columns:
    temp = temp.drop(columns=['level_0'])

syll = data.columns
for i in syll:
    result = fit_1mixed_linear_models(temp, i, 'age', 'mouse')
    results[i] = result
    cp = result.pvalues['age'] * n   # Bonferroni-scaled p-value (not capped at 1)
    if cp < a:
        s.append(i)
        p.append(cp)
print('number of syllables changing over time for females:', len(s))

# Per-parameter summary table with Bonferroni-corrected p-values;
# the number of tests equals the number of dependent variables.
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Show the summary table, then save it as CSV.
print(summary_df)
summary_df.to_csv('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/stats/mxlm_female_longv2_syllable_over_time.csv', index=False)





number of syllables changing over time for females: 47
     Dependent Variable  Parameter  Coefficient       P-value  \
0                     0  Intercept     0.001132  1.428968e-02   
1                     0        age     0.000094  2.142344e-94   
2                     0  mouse Var     1.091763  3.600999e-03   
3                     2  Intercept     0.043389  2.087986e-65   
4                     2        age    -0.000357  2.216866e-39   
..                  ...        ...          ...           ...   
169                  97        age     0.000005  4.179221e-01   
170                  97  mouse Var     1.178156  3.541300e-03   
171                  98  Intercept     0.029064  1.422848e-54   
172                  98        age    -0.000213  1.292744e-24   
173                  98  mouse Var     0.832927  3.812332e-03   

     Corrected P-value  CI Lower  CI Upper  
0         8.288017e-01  0.000226  0.002038  
1         1.242560e-92  0.000085  0.000103  
2         2.088579e-01  0.356

In [9]:
## sex-specific change in syllable usage over age - mixed-effects linear model with an age x sex interaction

In [10]:
# Joint male + female analysis: for every column of the combined table, fit
# a mixed model with age, sex and their interaction (fit_2mixed_linear_models
# builds `ranked_response ~ age*C(sex)` grouped by mouse).
data = combined_df.copy()

# Significance threshold for the Bonferroni-scaled p-values. Defined here so
# the cell does not rely on `a` left over from an earlier cell.
a = 0.05

# Turn the index levels into ordinary columns so the model can use them by name.
temp = data.reset_index()
if 'level_0' in temp.columns:
    temp = temp.drop(columns=['level_0'])

results = {}
syll = data.columns
n = len(syll)            # Bonferroni factor: one test per column of `data`
ss, ps = [], []          # columns with a significant sex term / corrected p-values
ints, intp = [], []      # columns with a significant age:sex term / corrected p-values

for i in syll:
    result = fit_2mixed_linear_models(temp, i, 'age', 'sex', 'mouse')
    results[i] = result

    # Sex term. Because the model includes an age:sex interaction, this
    # coefficient is the male-vs-female offset at age == 0, not an
    # age-averaged sex difference.
    cp = result.pvalues['C(sex)[T.m]'] * n
    if cp < a:
        ss.append(i)
        ps.append(cp)

    # Interaction term: difference between the male and female age slopes.
    cp = result.pvalues['age:C(sex)[T.m]'] * n
    if cp < a:
        ints.append(i)
        intp.append(cp)
print('number of syllables different between males and females:', str(len(ss)))
print('number of syllables with sex/age interactions: ', str(len(ints)))

# Per-parameter summary table with Bonferroni-corrected p-values;
# the number of tests equals the number of dependent variables.
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Show the summary table, then save it as CSV.
print(summary_df)
summary_df.to_csv('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/stats/mxlm_fvm_longv2_syllable_divergence_over_time.csv', index=False)





number of syllables different between males and females: 5
number of syllables with sex/age interactions:  38
     Dependent Variable        Parameter  Coefficient        P-value  \
0                     0        Intercept     0.001132   3.033776e-03   
1                     0      C(sex)[T.m]     0.000133   8.026876e-01   
2                     0              age     0.000094  1.645127e-121   
3                     0  age:C(sex)[T.m]    -0.000056   2.123383e-23   
4                     0        mouse Var     0.952832   3.172566e-05   
..                  ...              ...          ...            ...   
285                  98        Intercept     0.029064   2.858152e-60   
286                  98      C(sex)[T.m]    -0.002157   3.843182e-01   
287                  98              age    -0.000213   1.651973e-25   
288                  98  age:C(sex)[T.m]    -0.000206   8.098421e-13   
289                  98        mouse Var     0.774757   3.517720e-05   

     Corrected P-value  C