In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib as mpl
import os
from matplotlib.lines import Line2D
from collections import Counter
import math
from sklearn.decomposition import PCA
from aging.behavior.syllables import relabel_by_usage
from tqdm import tqdm
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
import random
import scipy
from scipy import stats
from aging.plotting import format_plots, PlotConfig, save_factory, figure, legend, format_pizza_plots

In [2]:
# Apply the plotting defaults provided by aging.plotting.
# NOTE(review): presumably configures matplotlib styling for the whole notebook — confirm in aging.plotting.
format_plots()
# Alternative formatter from aging.plotting, currently disabled.
#format_pizza_plots()

In [3]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop linear gradient from '#c7eae5' to '#008C8D', sampled at 256 levels.
colors = ['#c7eae5', '#008C8D']
# `cmm` is a short alias bound to the same colormap object as `custom_cmap`.
cmm = custom_cmap = LinearSegmentedColormap.from_list(
    "custom_purples",
    colors,
    N=256,
)

In [4]:
## filter usage data: drop syllable columns above the cutoff and sessions whose name contains 'CRL'
def filter_df(df, max_syllable=39):
    """Return a filtered copy of a usage matrix without modifying the input.

    Two filters are applied:
    * columns whose label is greater than ``max_syllable`` are removed;
    * rows whose ``session_name`` index level contains ``'CRL'`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric column labels and an index that has a level
        named ``session_name``.
    max_syllable : int, default 39
        Largest column label to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.
    """
    # Select the column labels above the cutoff; drop() returns a new frame,
    # so the caller's object is not mutated (the previous version used inplace=True).
    extra_syllables = df.columns.values[df.columns.values > max_syllable]
    trimmed = df.drop(columns=extra_syllables)

    # na=False treats a missing session name as "not CRL", so the mask is
    # always boolean and can be inverted safely.
    is_crl = trimmed.index.get_level_values('session_name').str.contains('CRL', na=False)
    return trimmed[~is_crl]

In [5]:
# Read the two relabeled usage-matrix parquet files (going by the file names:
# the "dana" male ontogeny cohort and the main male ontogeny cohort).
path_dana = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_11/dana_ontogeny_males_relabeled_usage_matrix_v00.parquet')
path = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_11/ontogeny_males_relabeled_usage_matrix_v00.parquet')

df_dana, df = (pd.read_parquet(parquet_file) for parquet_file in (path_dana, path))

In [6]:
# Arrange data: for each cohort, cast to float where possible, apply filter_df,
# then average the rows sharing the same (age, uuid).
data = filter_df(df.astype(float, errors='ignore')).groupby(['age', 'uuid']).mean()
data_dana = filter_df(df_dana.astype(float, errors='ignore')).groupby(['age', 'uuid']).mean()

# Pool both cohorts and keep at most `n` rows per age, drawn without
# replacement using a fixed seed so the selection is reproducible.
random_seed = 0
n = 16
data_all = (
    pd.concat([data, data_dana])
    .groupby('age')
    .apply(lambda grp: grp.sample(n=min(n, len(grp)), replace=False, random_state=random_seed))
    # groupby.apply prepends the group key as an extra index level; remove it.
    .reset_index(level=0, drop=True)
)
# Order the columns by label.
data_all = data_all[sorted(data_all.columns)]

In [7]:
# Flat version of data_all: index levels become ordinary columns.
# reset_index() returns a new frame, so data_all itself is untouched.
temp = (
    data_all
    .reset_index()
    # Discard a 'level_0' column if one is present.
    .drop(columns=['level_0'], errors='ignore')
)

In [18]:
# Kruskal-Wallis H-test across age groups, run once per syllable column,
# with a Bonferroni correction (raw p-value multiplied by the number of tests).
syll = data_all.columns
n = len(syll)  # number of tests; note this rebinds the notebook-level `n`
s = []  # syllables significant after correction
p = []  # corrected p-values of those syllables
h = []  # H statistics of those syllables
a = 0.05  # significance threshold applied to the corrected p-value

for i in syll:
    # Long format with one row per observation: columns 'age' and 'value'.
    melted_df = pd.melt(temp, id_vars='age', value_vars=[i], value_name='value').drop(columns=['variable'])
    # One sample of values per age group.
    h_statistic, p_value = stats.kruskal(*[group['value'] for _, group in melted_df.groupby('age')])

    # Print the results
    print("H-statistic:", h_statistic)
    print("p-value:", p_value)
    cp = p_value * n
    if cp < a:
        s.append(i)
        p.append(cp)
        # Store the statistic itself (previously the list `h` was appended to itself).
        h.append(h_statistic)

H-statistic: 249.65669651486087
p-value: 2.026060799866141e-38
H-statistic: 212.9601322327182
p-value: 2.8479720534505984e-31
H-statistic: 110.31434543987984
p-value: 2.3352917555431583e-12
H-statistic: 193.10866499026133
p-value: 1.8232105703082772e-27
H-statistic: 115.19854968627783
p-value: 3.3783637279023696e-13
H-statistic: 141.54473979000932
p-value: 7.264744406363365e-18
H-statistic: 189.50682356154275
p-value: 8.830517728018593e-27
H-statistic: 134.6985252062413
p-value: 1.2411857129312394e-16
H-statistic: 148.12680221613493
p-value: 4.624304898978107e-19
H-statistic: 124.52418642297448
p-value: 7.96678080483509e-15
H-statistic: 234.2297604075466
p-value: 2.123967595074595e-35
H-statistic: 181.19388877700183
p-value: 3.3125410844982847e-25
H-statistic: 223.547432906981
p-value: 2.545908144648086e-33
H-statistic: 130.20081418772259
p-value: 7.882072490029997e-16
H-statistic: 146.93127368194152
p-value: 7.639733972444476e-19
H-statistic: 118.79911095176085
p-value: 8.016225505437

In [8]:
# Long-format table for syllable 0 only: one row per observation with
# columns 'age' and 'value'.
# `temp` is read but not modified here. The earlier in-place reset_index added
# an unused 'index' column to `temp` and made the cell depend on how many
# times it had been run (hence the old 'level_0' clean-up).
melted_df = (
    temp
    .melt(id_vars='age', value_vars=[0], value_name='value')
    .drop(columns=['variable'])
)

In [10]:
# Disabled code kept as a string literal: repeated-measures ANOVA attempt.
# The TypeError recorded for this cell shows AnovaRM also requires a `subject`
# argument, which this call does not pass. melted_df is built with
# id_vars='age' only, so a subject identifier column would need to be carried
# into it before this can run.
'''
from statsmodels.stats.anova import AnovaRM

#perform the repeated measures ANOVA
print(AnovaRM(data=melted_df, depvar='value', within=['age']).fit())
'''

TypeError: AnovaRM.__init__() missing 1 required positional argument: 'subject'

In [12]:
# Disabled code kept as a string literal: ANOVA table from an OLS fit of
# 'value' against age treated as a categorical factor.
# NOTE(review): the formula 'value ~ C(age)' contains no subject term, so the
# "repeated measures" wording in the inner comment looks inaccurate — confirm
# which model was intended before re-enabling.
'''
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit the repeated measures ANOVA model
model = ols('value ~ C(age)', data=melted_df).fit()

# Print the ANOVA table
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
'''

            sum_sq     df         F        PR(>F)
C(age)    0.245482   26.0  17.66126  4.211768e-51
Residual  0.211699  396.0       NaN           NaN
