In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from aging.plotting import format_plots, PlotConfig, save_factory, figure, legend, format_pizza_plots, COLORMAPS
from collections import Counter
from matplotlib.lines import Line2D
from aging.organization.dataframes import load_male_long_df, load_female_long_df, DF_PATHS
from tqdm import tqdm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.tools import pinv_extended  
from statsmodels.stats.anova import anova_lm
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from tabulate import tabulate

In [None]:
# Call the plot-formatting helper imported from aging.plotting.
# NOTE(review): presumably applies notebook-wide matplotlib styling — confirm in aging.plotting.
format_plots()

In [None]:
# prepare long_v2 data

In [None]:
# Colormaps taken from the project's COLORMAPS container
# (presumably the male / female ontogeny palettes — confirm in aging.plotting).
cmm = COLORMAPS.ont_male
cmf = COLORMAPS.ont_female

# Load each table, average within every (age, mouse) pair, then tag the rows
# with their sex and append it as a third index level.
# The frames are built with non-mutating chains (`assign` / `set_index`) rather
# than column assignment + `inplace=True`: assigning a column on the result of
# `.query(...)` can raise pandas' SettingWithCopyWarning, and in-place edits
# make the cell unsafe to re-run.
# NOTE(review): absolute cluster paths — consider a configurable data directory.
m_df = (
    pd.read_parquet('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/beh_age_df/2024-07-30-longtogeny_v2_males_raw_usage_df_beh_age.parquet')
    .groupby(['age', 'mouse'])
    .mean()
    .assign(sex='m')
    .set_index('sex', append=True)
)

f_df = (
    pd.read_parquet('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/beh_age_df/2024-07-30-longtogeny_v2_females_raw_usage_df_beh_age.parquet')
    .groupby(['age', 'mouse'])
    .mean()
    # Mouse F4_03 is excluded from the female table (reason not recorded here).
    .query('mouse!="F4_03"')
    .assign(sex='f')
    .set_index('sex', append=True)
)

# Stack males on top of females; both frames share the (age, mouse, sex) index.
combined_df = pd.concat([m_df, f_df])

In [None]:
## joint data
# Independent working copies of the male, female and combined tables, so the
# analysis cells can work on them without modifying the frames built above.
m_df_long, f_df_long, data_long = (
    frame.copy() for frame in (m_df, f_df, combined_df)
)

## Change in syllable usage over age (males, then females) - linear mixed-effects model (parametric, ANOVA-style analysis)

In [None]:
def fit_1mixed_linear_models(data, dependent_var, factor1, group):
    """Fit a mixed linear model of one column against a single fixed effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the dependent variable, the factor and the grouping
        column. It is copied, so the caller's frame is left untouched.
    dependent_var : hashable
        Label of the column to model. It is copied into a helper column named
        ``ranked_response`` so the formula never has to spell out the original
        label (which may not be a valid formula identifier).
    factor1 : str
        Name of the fixed-effect term inserted into the formula.
    group : str
        Name of the column passed to ``smf.mixedlm`` as ``groups``.

    Returns
    -------
    The fitted result object returned by ``smf.mixedlm(...).fit()``.
    No ``re_formula`` is passed, so statsmodels' default random-effects
    structure for ``groups`` applies.
    """
    fixed_effects = f'ranked_response ~ {factor1}'

    # Private copy carrying the response under a formula-safe column name.
    frame = data.copy()
    frame['ranked_response'] = frame[[dependent_var]]

    return smf.mixedlm(fixed_effects, frame, groups=f'{group}').fit()


def fit_2mixed_linear_models(data, dependent_var, factor1,factor2, group):
    """Fit a mixed linear model with two interacting fixed effects.

    The formula is ``ranked_response ~ factor1*C(factor2)``: ``factor2`` is
    wrapped in ``C(...)`` (treated as categorical) and ``*`` requests both main
    effects plus their interaction.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the dependent variable, both factors and the grouping
        column. It is copied, so the caller's frame is left untouched.
    dependent_var : hashable
        Label of the column to model; copied into a helper column named
        ``ranked_response`` that the formula refers to.
    factor1 : str
        Name of the first fixed-effect term.
    factor2 : str
        Name of the second fixed-effect term, entered as categorical.
    group : str
        Name of the column passed to ``smf.mixedlm`` as ``groups``.

    Returns
    -------
    The fitted result object returned by ``smf.mixedlm(...).fit()``.
    """
    fixed_effects = f'ranked_response ~ {factor1}*C({factor2})'

    # Private copy carrying the response under a formula-safe column name.
    frame = data.copy()
    frame['ranked_response'] = frame[[dependent_var]]

    return smf.mixedlm(fixed_effects, frame, groups=f'{group}').fit()

def extract_summary_to_dataframe(results, num_tests):
    """Flatten several fitted models into one long table of parameter estimates.

    Parameters
    ----------
    results : dict
        Maps a dependent-variable label to a fitted result object exposing
        ``params`` and ``pvalues`` (indexable by parameter name) and a
        ``conf_int()`` method whose return value has the lower bound under
        key ``0`` and the upper bound under key ``1``.
    num_tests : int or float
        Bonferroni factor: every raw p-value is multiplied by this number and
        capped at 1.0.

    Returns
    -------
    pandas.DataFrame
        One row per (dependent variable, parameter) with the columns
        'Dependent Variable', 'Parameter', 'Coefficient', 'P-value',
        'Corrected P-value', 'CI Lower' and 'CI Upper'. The columns are
        present even when ``results`` is empty, so downstream code and CSV
        headers do not depend on at least one model having been fitted.
    """
    columns = [
        'Dependent Variable',
        'Parameter',
        'Coefficient',
        'P-value',
        'Corrected P-value',
        'CI Lower',
        'CI Upper',
    ]
    summary_data = []

    for var, result in results.items():
        # Only the pieces that are actually tabulated are pulled from the
        # result; the full text summary is not needed and is no longer built.
        coefs = result.params
        pvalues = result.pvalues
        conf_int = result.conf_int()

        for param in coefs.index:
            raw_pvalue = pvalues[param]
            summary_data.append({
                'Dependent Variable': var,
                'Parameter': param,
                'Coefficient': coefs[param],
                'P-value': raw_pvalue,
                # Bonferroni correction, capped so it stays a valid probability.
                'Corrected P-value': min(raw_pvalue * num_tests, 1.0),
                'CI Lower': conf_int[0][param],
                'CI Upper': conf_int[1][param],
            })

    # Explicit columns keep the schema stable for an empty `results` dict.
    return pd.DataFrame(summary_data, columns=columns)

In [None]:
# Males: fit one mixed model per column of the male table and test the
# fixed effect of age, with mouse as the grouping variable.
data = m_df_long
syll = data.columns
n = len(syll)   # one model per column -> Bonferroni factor
a = 0.05        # threshold applied to the corrected p-values

# Turn the index levels into ordinary columns so the model can refer to
# 'age' and 'mouse' by name; drop a stray 'level_0' column if one appears.
temp = data.reset_index()
temp = temp.drop(columns=['level_0'], errors='ignore')

results = {}
s, p = [], []   # columns passing the threshold, and their corrected p-values
for i in syll:
    result = fit_1mixed_linear_models(temp, i, 'age', 'mouse')
    results[i] = result
    cp = result.pvalues['age'] * n
    if cp < a:
        s.append(i)
        p.append(cp)
print('number of syllables changing over time for males:', len(s))

# Tabulate every parameter of every model with Bonferroni-corrected p-values.
num_tests = n
summary_df = extract_summary_to_dataframe(results, num_tests)

# Write the table to disk (nothing is printed here).
summary_df.to_csv('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/stats/mxlm_male_longv2_syllable_over_time_beh_age.csv', index=False)

In [None]:
## females
# Fit one mixed model per column of the female table and test the fixed
# effect of age, with mouse as the grouping variable.
data = f_df_long
syll = data.columns
n = len(syll)   # one model per column -> Bonferroni factor
a = 0.05        # threshold applied to the corrected p-values

# Turn the index levels into ordinary columns so the model can refer to
# 'age' and 'mouse' by name; drop a stray 'level_0' column if one appears.
temp = data.reset_index()
temp = temp.drop(columns=['level_0'], errors='ignore')

results = {}
s, p = [], []   # columns passing the threshold, and their corrected p-values
for i in syll:
    result = fit_1mixed_linear_models(temp, i, 'age', 'mouse')
    results[i] = result
    cp = result.pvalues['age'] * n
    if cp < a:
        s.append(i)
        p.append(cp)
print('number of syllables changing over time for females:', len(s))

# Tabulate every parameter of every model with Bonferroni-corrected p-values.
num_tests = n
summary_df = extract_summary_to_dataframe(results, num_tests)

# Write the table to disk (nothing is printed here).
summary_df.to_csv('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/stats/mxlm_female_longv2_syllable_over_time_beh_age.csv', index=False)

In [None]:
## Sex-specific change in syllable usage over age - mixed linear model with an age x sex interaction (two-way-ANOVA-style analysis)

In [None]:
# Threshold applied to the Bonferroni-corrected p-values. It is set here
# explicitly so this cell runs on its own; previously it silently relied on
# `a` being left behind by the male/female cells.
a = 0.05

# Combined male + female table. The index levels are moved into ordinary
# columns so the model can refer to 'age', 'sex' and 'mouse' by name.
data = combined_df.copy()
temp = data.reset_index()

if 'level_0' in temp.columns:
    temp = temp.drop(columns=['level_0'])

results = {}
syll = data.columns
n = len(syll)        # one model per column -> Bonferroni factor
ss, ps = [], []      # columns with a significant sex term, and corrected p
ints, intp = [], []  # columns with a significant age:sex term, and corrected p

for i in syll:
    # Model: response ~ age * C(sex), grouped by mouse.
    result = fit_2mixed_linear_models(temp, i, 'age', 'sex', 'mouse')
    results[i] = result

    # Sex term (level 'm' contrasted against the reference level).
    # NOTE(review): with an interaction in the model this coefficient is the
    # sex difference at age == 0, not an age-averaged main effect — confirm
    # this is the intended reading of "different between males and females".
    cp = result.pvalues['C(sex)[T.m]'] * n
    if cp < a:
        ss.append(i)
        ps.append(cp)

    # Age-by-sex interaction term.
    cp = result.pvalues['age:C(sex)[T.m]'] * n
    if cp < a:
        ints.append(i)
        intp.append(cp)

print('number of syllables different between males and females:', str(len(ss)))
print('number of syllables with sex/age interactions: ', str(len(ints)))

# Tabulate every parameter of every model with Bonferroni-corrected p-values;
# the number of tests is the number of modelled columns.
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Write the table to disk (nothing is printed here).
summary_df.to_csv('/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/stats/mxlm_fvm_longv2_syllable_divergence_over_time_beh_age.csv', index=False)