In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib as mpl
import os
from toolz import concat
from matplotlib.lines import Line2D
from collections import Counter
import math
from tqdm import tqdm
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
import random
import scipy
from aging.plotting import format_plots, PlotConfig, save_factory, figure, legend, format_pizza_plots
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.tools import pinv_extended  
from statsmodels.stats.anova import anova_lm
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from tabulate import tabulate

In [2]:
# Apply the shared plot formatting imported from aging.plotting before any figure is drawn.
# NOTE(review): presumably this configures matplotlib defaults - confirm in aging.plotting.
format_plots()
#format_pizza_plots()

In [3]:
def mm_norm_col(column):
    """
    Min-max rescale ``column``.

    Parameters:
    column: Object exposing ``.min()`` / ``.max()`` and elementwise arithmetic
        (for example a pandas Series or a numpy array).

    Returns:
    The values of ``column`` shifted by its minimum and divided by
    (maximum - minimum), so the smallest value maps to 0 and the largest to 1.
    """
    lowest = column.min()
    value_range = column.max() - lowest
    return (column - lowest) / value_range

In [4]:
def tw_anova(data, dependent_var, factor1, factor2):
    """
    Fit an OLS model with two interacting factors and return its ANOVA table.

    The fitted formula is ``ranked_response ~ factor1 * C(factor2)``:
    ``factor2`` is always wrapped in ``C(...)``, while ``factor1`` is inserted
    into the formula exactly as given.

    Parameters:
    data (pd.DataFrame): The input data frame (a copy is modified, not the input).
    dependent_var (str): The name of the dependent variable column.
    factor1 (str): The name of the first factor column.
    factor2 (str): The name of the second factor column.

    Returns:
    pd.DataFrame: The ANOVA results (``anova_lm`` called with ``typ=3``).
    """
    # Expose the response under a fixed column name so the formula never has to
    # spell out ``dependent_var`` itself. Despite the name, no rank transform is
    # applied here: the column holds the raw response values.
    frame = data.copy()
    frame['ranked_response'] = frame[[dependent_var]]

    formula = f'ranked_response ~ {factor1} * C({factor2})'
    fitted = ols(formula, data=frame).fit()

    # NOTE(review): typ=3 is used with the default contrast coding - confirm this
    # is the intended sums-of-squares type for the interaction model.
    table = sm.stats.anova_lm(fitted, typ=3)

    return pd.DataFrame(table)

def ow_anova(data, dependent_var, factor1):
    """
    Perform one-way ANOVA.

    Fits the OLS model ``ranked_response ~ factor1`` and returns the ANOVA
    table produced by ``anova_lm`` with ``typ=3``.

    Parameters:
    data (pd.DataFrame): The input data frame (a copy is modified, not the input).
    dependent_var (str): The name of the dependent variable column.
    factor1 (str): The name of the factor column; inserted into the formula as given.

    Returns:
    pd.DataFrame: The ANOVA results.
    """
    # Expose the response under a fixed column name so the formula never has to
    # spell out ``dependent_var`` itself. Despite the name, no rank transform is
    # applied: the column holds the raw response values.
    model_data = data.copy()
    model_data['ranked_response'] = model_data[dependent_var]

    model = ols(f'ranked_response ~ {factor1}', data=model_data).fit()
    anova_results = sm.stats.anova_lm(model, typ=3)

    return pd.DataFrame(anova_results)

In [5]:
# prepare long_v2 data

In [6]:
# Integer syllable ids to retain, read from a plain-text file.
keep_syllables = np.loadtxt(
    '/n/groups/datta/win/longtogeny/data/ontogeny/version_11/to_keep_syllables_raw.txt',
    dtype=int,
)

# Female usage matrix: cast to float, keep the selected syllable columns and
# average rows that share the same (age, uuid, mouse) index entry.
df = pd.read_parquet(
    '/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/longtogeny_v2_females_raw_usage_matrix_v00.parquet'
).astype(float)
df = df[keep_syllables].groupby(['age', 'uuid', 'mouse']).mean()

# take only first sample from each week
# (ages * 7) // 7 rounds each age down to a whole number (up to float rounding);
# presumably ages are expressed in weeks - TODO confirm.
ages = df.index.get_level_values('age')
weeks = (ages * 7) // 7
df['binned_age'] = weeks

# GroupBy.first() keeps the first non-null value per column within each
# (mouse, binned_age) group.
sample = df.groupby(['mouse', 'binned_age']).first().reset_index()

# Re-key on the binned age so downstream cells see an (age, mouse) index.
df_female = sample.rename(columns={'binned_age': 'age'}).set_index(['age', 'mouse'])
f_df_long = df_female.groupby(['age', 'mouse']).mean()

In [7]:
# Integer syllable ids to retain, read from a plain-text file.
keep_syllables = np.loadtxt(
    '/n/groups/datta/win/longtogeny/data/ontogeny/version_11/to_keep_syllables_raw.txt',
    dtype=int,
)

# Male usage matrix: cast to float, keep the selected syllable columns and
# average rows that share the same (age, uuid, mouse) index entry.
df = pd.read_parquet(
    '/n/groups/datta/win/longtogeny/data/ontogeny/version_11-1/longtogeny_v2_males_raw_usage_matrix_v00.parquet'
).astype(float)
df = df[keep_syllables].groupby(['age', 'uuid', 'mouse']).mean()

# (ages * 7) // 7 rounds each age down to a whole number (up to float rounding);
# presumably ages are expressed in weeks - TODO confirm.
ages = df.index.get_level_values('age')
weeks = (ages * 7) // 7
df['binned_age'] = weeks

# GroupBy.first() keeps the first non-null value per column within each
# (mouse, binned_age) group.
sample = df.groupby(['mouse', 'binned_age']).first().reset_index()

# Re-key on the binned age so downstream cells see an (age, mouse) index.
df_male = sample.rename(columns={'binned_age': 'age'}).set_index(['age', 'mouse'])
m_df_long = df_male.groupby(['age', 'mouse']).mean()

In [8]:
## joint data
# Tag each per-sex table with a constant 'sex' label and append it to the
# existing index, so both tables can be stacked without losing their origin.
m_data_long = m_df_long.assign(sex='m').set_index('sex', append=True)
f_data_long = f_df_long.assign(sex='f').set_index('sex', append=True)

# Males first, then females, in one long table.
data_long = pd.concat([m_data_long, f_data_long])

## change in syllable over age males - mixed effect linear model parametric anova

In [9]:
import statsmodels.formula.api as smf



# NOTE(review): ow_anova is also defined in an earlier cell of this notebook;
# once this cell runs, this later definition is the one in effect. Consider
# keeping a single definition.
def ow_anova(data, dependent_var, factor1):
    """
    Perform one-way ANOVA.

    Fits the OLS model ``ranked_response ~ factor1`` and returns the ANOVA
    table produced by ``anova_lm`` with ``typ=3``.

    Parameters:
    data (pd.DataFrame): The input data frame (a copy is modified, not the input).
    dependent_var (str): The name of the dependent variable column.
    factor1 (str): The name of the factor column; inserted into the formula as given.

    Returns:
    pd.DataFrame: The ANOVA results.
    """
    # Expose the response under a fixed column name so the formula never has to
    # spell out ``dependent_var`` itself. Despite the name, no rank transform is
    # applied: the column holds the raw response values.
    model_data = data.copy()
    model_data['ranked_response'] = model_data[dependent_var]

    model = ols(f'ranked_response ~ {factor1}', data=model_data).fit()
    anova_results = sm.stats.anova_lm(model, typ=3)

    return pd.DataFrame(anova_results)



def fit_1mixed_linear_models(data, dependent_var, factor1, group):
    """
    Fit a mixed linear model of one response on a single fixed-effect term.

    The fixed-effects formula is ``ranked_response ~ factor1``; the ``group``
    column name is passed to ``smf.mixedlm`` as ``groups``. No random-effects
    formula is supplied, so the library default is used.

    Parameters:
    data (pd.DataFrame): The input data frame (a copy is modified, not the input).
    dependent_var: Label of the response column in ``data``.
    factor1 (str): Name of the fixed-effect column, inserted into the formula as given.
    group (str): Name of the grouping column.

    Returns:
    The fitted result object returned by ``model.fit()``.
    """
    # Expose the response under a fixed column name so the formula never has to
    # spell out ``dependent_var`` itself; the values are copied unchanged.
    frame = data.copy()
    frame['ranked_response'] = frame[[dependent_var]]

    fixed_effects = f'ranked_response ~ {factor1}'
    return smf.mixedlm(fixed_effects, frame, groups=f'{group}').fit()


def fit_2mixed_linear_models(data, dependent_var, factor1,factor2, group):
    """
    Fit a mixed linear model of one response on two interacting fixed effects.

    The fixed-effects formula is ``ranked_response ~ factor1*C(factor2)``:
    ``factor2`` is wrapped in ``C(...)`` while ``factor1`` is inserted as given.
    The ``group`` column name is passed to ``smf.mixedlm`` as ``groups``; no
    random-effects formula is supplied, so the library default is used.

    Parameters:
    data (pd.DataFrame): The input data frame (a copy is modified, not the input).
    dependent_var: Label of the response column in ``data``.
    factor1 (str): Name of the first fixed-effect column.
    factor2 (str): Name of the second fixed-effect column.
    group (str): Name of the grouping column.

    Returns:
    The fitted result object returned by ``model.fit()``.
    """
    # Expose the response under a fixed column name so the formula never has to
    # spell out ``dependent_var`` itself; the values are copied unchanged.
    frame = data.copy()
    frame['ranked_response'] = frame[[dependent_var]]

    fixed_effects = f'ranked_response ~ {factor1}*C({factor2})'
    return smf.mixedlm(fixed_effects, frame, groups=f'{group}').fit()

def extract_summary_to_dataframe(results, num_tests):
    """
    Flatten a collection of fitted model results into one long table.

    Parameters:
    results (dict): Maps a dependent-variable label to a fitted result object
        exposing ``params``, ``pvalues`` (both indexed by parameter name) and
        ``conf_int()`` (a frame indexed by parameter name with columns 0 and 1).
    num_tests (int): Multiplier used for the Bonferroni correction.

    Returns:
    pd.DataFrame: One row per (dependent variable, parameter) with the columns
    'Dependent Variable', 'Parameter', 'Coefficient', 'P-value',
    'Corrected P-value', 'CI Lower' and 'CI Upper'. The corrected p-value is
    ``min(p * num_tests, 1.0)`` and is computed for every parameter row. An
    empty ``results`` yields an empty frame that still has these columns.
    """
    columns = [
        'Dependent Variable',
        'Parameter',
        'Coefficient',
        'P-value',
        'Corrected P-value',
        'CI Lower',
        'CI Upper',
    ]
    rows = []

    for var, result in results.items():
        coefs = result.params
        pvalues = result.pvalues
        conf_int = result.conf_int()

        for param in coefs.index:
            pvalue = pvalues[param]
            rows.append({
                'Dependent Variable': var,
                'Parameter': param,
                'Coefficient': coefs[param],
                'P-value': pvalue,
                # Bonferroni correction, capped at 1.
                'Corrected P-value': min(pvalue * num_tests, 1.0),
                # Column 0 is the lower bound, column 1 the upper bound.
                'CI Lower': conf_int.loc[param, 0],
                'CI Upper': conf_int.loc[param, 1],
            })

    # Passing the column list keeps the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [11]:
#males
data = m_df_long
a = 0.05                 # threshold applied to the Bonferroni-corrected p-values
n = len(data.columns)    # one test per column of the usage table
syll = data.columns

# Move the index levels into ordinary columns so the model formula can
# reference 'age' and 'mouse' by name.
temp = data.reset_index()
temp = temp.drop(columns=['level_0'], errors='ignore')

results = {}
s = []   # columns whose corrected 'age' p-value falls below ``a``
p = []   # the matching corrected p-values
for i in syll:
    result = fit_1mixed_linear_models(temp, i, 'age', 'mouse')
    results[i] = result
    cp = result.pvalues['age'] * n
    if cp < a:
        s.append(i)
        p.append(cp)
print('number of syllables changing over time for males:', str(len(s)))

# Extract the summary to a DataFrame with Bonferroni correction
# (the number of tests is the number of dependent variables).
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Show the table and persist it next to the notebook.
print(summary_df)
summary_df.to_csv('mxlm_male_longv2_syllable_over_time.csv', index=False)





number of syllables changing over time for males: 50
     Dependent Variable  Parameter  Coefficient       P-value  \
0                     0  Intercept     0.001185  1.774368e-05   
1                     0        age     0.000040  9.021226e-28   
2                     0  mouse Var     0.534906  3.402537e-03   
3                     2  Intercept     0.038642  6.768722e-64   
4                     2        age    -0.000521  1.360294e-53   
..                  ...        ...          ...           ...   
169                  97        age    -0.000070  6.270593e-10   
170                  97  mouse Var     0.999508  2.868216e-03   
171                  98  Intercept     0.027346  2.866919e-63   
172                  98        age    -0.000413  2.678963e-78   
173                  98  mouse Var     0.519496  3.468842e-03   

     Corrected P-value  CI Lower  CI Upper  
0         1.029134e-03  0.000644  0.001727  
1         5.232311e-26  0.000033  0.000048  
2         1.973472e-01  0.17694

In [12]:
## females
data = f_df_long
a = 0.05                 # threshold applied to the Bonferroni-corrected p-values
n = len(data.columns)    # one test per column of the usage table
syll = data.columns

# Move the index levels into ordinary columns so the model formula can
# reference 'age' and 'mouse' by name.
temp = data.reset_index()
temp = temp.drop(columns=['level_0'], errors='ignore')

results = {}
s = []   # columns whose corrected 'age' p-value falls below ``a``
p = []   # the matching corrected p-values
for i in syll:
    result = fit_1mixed_linear_models(temp, i, 'age', 'mouse')
    results[i] = result
    cp = result.pvalues['age'] * n
    if cp < a:
        s.append(i)
        p.append(cp)
print('number of syllables changing over time for females:', str(len(s)))

# Extract the summary to a DataFrame with Bonferroni correction
# (the number of tests is the number of dependent variables).
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Show the table and persist it next to the notebook.
print(summary_df)
summary_df.to_csv('mxlm_female_longv2_syllable_over_time.csv', index=False)





number of syllables changing over time for females: 46
     Dependent Variable  Parameter  Coefficient       P-value  \
0                     0  Intercept     0.001164  9.253107e-03   
1                     0        age     0.000092  7.360140e-72   
2                     0  mouse Var     0.737040  3.051570e-03   
3                     2  Intercept     0.044224  8.289854e-39   
4                     2        age    -0.000272  4.431194e-16   
..                  ...        ...          ...           ...   
169                  97        age     0.000018  9.337706e-03   
170                  97  mouse Var     0.969113  2.845376e-03   
171                  98  Intercept     0.028933  1.444553e-52   
172                  98        age    -0.000157  4.853335e-10   
173                  98  mouse Var     0.523025  3.415128e-03   

     Corrected P-value  CI Lower  CI Upper  
0         5.366802e-01  0.000287  0.002040  
1         4.268881e-70  0.000082  0.000102  
2         1.769910e-01  0.249

In [None]:
## sex-specific change in syllable over age - mixed effect linear model with age x sex interaction

In [13]:
# Stack the male and female tables (each carries a 'sex' index level).
data = pd.concat([m_data_long,f_data_long])

# Move the index levels into ordinary columns so the model formula can
# reference 'age', 'sex' and 'mouse' by name.
temp = data.reset_index()
temp = temp.drop(columns=['level_0'], errors='ignore')

# Threshold applied to the Bonferroni-corrected p-values. Defined here so this
# cell does not silently depend on a value left behind by another cell.
a = 0.05

results = {}
syll = data.columns
n = len(syll)   # one test per column of the usage table
ss = []         # columns with a significant sex term
ps = []         # corrected p-values of those sex terms
ints = []       # columns with a significant age-by-sex interaction
intp = []       # corrected p-values of those interactions

for i in syll:
    result = fit_2mixed_linear_models(temp, i, 'age', 'sex', 'mouse')
    results[i] = result

    # Sex term (the parameter name carries the 'm' level, as seen in the fit output).
    cp = result.pvalues['C(sex)[T.m]'] * n
    if cp < a:
        ss.append(i)
        ps.append(cp)

    # Age-by-sex interaction term.
    cp = result.pvalues['age:C(sex)[T.m]'] * n
    if cp < a:
        ints.append(i)
        intp.append(cp)
print('number of syllables different between males and females:', str(len(ss)))
print('number of syllables with sex/age interactions: ', str(len(ints)))

# Extract the summary to a DataFrame with Bonferroni correction
# (the number of tests is the number of dependent variables).
num_tests = len(data.columns)
summary_df = extract_summary_to_dataframe(results, num_tests)

# Show the table and persist it next to the notebook.
print(summary_df)
summary_df.to_csv('mxlm_fvm_longv2_syllable_divergence_over_time.csv', index=False)





number of syllables different between males and females: 5
number of syllables with sex/age interactions:  34
     Dependent Variable        Parameter  Coefficient       P-value  \
0                     0        Intercept     0.001164  1.741480e-03   
1                     0      C(sex)[T.m]     0.000022  9.670265e-01   
2                     0              age     0.000092  1.128691e-94   
3                     0  age:C(sex)[T.m]    -0.000052  3.679766e-16   
4                     0        mouse Var     0.667040  2.960130e-05   
..                  ...              ...          ...           ...   
285                  98        Intercept     0.028933  3.038514e-60   
286                  98      C(sex)[T.m]    -0.001587  5.255168e-01   
287                  98              age    -0.000157  2.639469e-11   
288                  98  age:C(sex)[T.m]    -0.000256  2.688935e-14   
289                  98        mouse Var     0.520708  3.516919e-05   

     Corrected P-value  CI Lower  CI 

In [25]:
# Debug check: dtypes of the model inputs and of the last response column
# fitted (``i`` is left over from the preceding loop).
print(*(temp[col].values.dtype for col in ('age', 'mouse', 'sex', i)), sep='\n')

float32
object
object
float64
