In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib as mpl
import os
from toolz import concat
from matplotlib.lines import Line2D
from collections import Counter
import math
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from aging.behavior.syllables import relabel_by_usage
from tqdm import tqdm
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
import random
import scipy
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score
from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import jensenshannon
%matplotlib inline
from aging.plotting import format_plots, PlotConfig, save_factory, figure, legend, format_pizza_plots

In [2]:
format_plots()

In [3]:
def mm_norm_col(column):
    return (column - column.min()) / (column.max() - column.min())

In [4]:
from matplotlib.colors import LinearSegmentedColormap
# Define the color map
colors = ['#008C8D','white','#d94801']
custom_cmap = LinearSegmentedColormap.from_list("custom_purples", colors, N=256)
cmmf=custom_cmap

colors = ['#dadaeb','#6a51a3']
custom_cmap = LinearSegmentedColormap.from_list("custom_purples", colors, N=256)
cma=custom_cmap

colors = ['white','gray']
custom_cmap = LinearSegmentedColormap.from_list("custom_purples", colors, N=256)
cma=custom_cmap

colors = ['#ccece6','#99d8c9','#66c2a4','#41ae76','#238b45','#006d2c']
cml = sns.blend_palette(colors, n_colors=256, as_cmap=True)

# Define the color map
colors = ['#fee6ce','#d94801']
custom_cmap = LinearSegmentedColormap.from_list("custom_purples", colors, N=256)
cmf=custom_cmap

colors = ['#c7eae5','#008C8D']
custom_cmap = LinearSegmentedColormap.from_list("custom_purples", colors, N=256)
cmm=custom_cmap

In [5]:
## arrange data
keep_syllables = np.loadtxt('/n/groups/datta/win/longtogeny/data/ontogeny/version_11/to_keep_syllables_raw.txt', dtype=int)

male_df = pd.read_parquet('/n/groups/datta/win/longtogeny/data/ontogeny/version_11/ontogeny_males_raw_usage_matrix_v00.parquet').astype(float)
male_df = male_df[keep_syllables]

## arrange data
female_df = pd.read_parquet('/n/groups/datta/win/longtogeny/data/ontogeny/version_11/ontogeny_females_raw_usage_matrix_v00.parquet').astype(float)
female_df = female_df[keep_syllables]

In [6]:
# avarage per age or session
m_df= male_df.groupby(['age','uuid']).mean()
avg_m_df = male_df.groupby(['age']).mean()

# normalize the data
m_norm = mm_norm_col(m_df)
avg_m_norm = mm_norm_col(avg_m_df)

# for female:
f_df= female_df.groupby(['age','uuid']).mean()
avg_f_df = female_df.groupby(['age']).mean()

# normalize the data
f_norm = mm_norm_col(f_df)
avg_f_norm = mm_norm_col(avg_f_df)

#m_data= m_data.groupby('age').mean()
f_data= f_norm.groupby(['age','uuid']).mean()
m_data= m_norm.groupby(['age','uuid']).mean()

uuids = ['f1d5dce5-f5cf-4b03-b07d-d9b8c1f034b0','6fa50ac7-6d49-4ef9-9059-3d90bdd8c5d4','e9e05da8-fc59-40f3-a9c8-f57c492c6141','327e7fa1-2237-43d2-b25f-c1801912df33']
f_data.reset_index(inplace=True)

# Update age value to 52 for rows with uuid in the list
f_data.loc[f_data['uuid'].isin(uuids), 'age'] = 52

# Set the index back
f_data.set_index(['age', 'uuid'], inplace=True)

# use only common ages
f_ages = f_data.index.get_level_values('age').unique().to_numpy()
m_ages = m_data.index.get_level_values('age').unique().to_numpy()
cages = list(np.intersect1d(f_ages,m_ages))
if 90 in cages:
    cages.remove(90)

m_data = m_data.loc[m_data.index.get_level_values('age').isin(cages)]
m_data['sex'] = 'm'
m_data.set_index('sex',inplace=True,append=True)

f_data = f_data.loc[f_data.index.get_level_values('age').isin(cages)]
f_data['sex'] = 'f'
f_data.set_index('sex',inplace=True,append=True)

data=pd.concat([m_data,f_data])

In [7]:
# Group by 'sex' and 'age' and count the number of items
grouped_counts = data.groupby(['sex', 'age']).size().unstack(fill_value=0)

# Find the maximum count for each 'sex'
max_counts = grouped_counts.max(axis=1)

# Filter the DataFrame to ensure each 'sex' has the same number of items for each 'age'
data = data.groupby(['sex', 'age']).head(max_counts.min())

In [8]:
counts = f_data.index.get_level_values('age').value_counts()
print(counts)
counts = m_data.index.get_level_values('age').value_counts()
print(counts)

age
4     16
8     16
78    16
72    16
68    16
64    16
60    16
56    16
48    16
44    16
40    16
36    16
32    16
24    16
20    16
12    16
52    16
84    16
28    15
80    15
Name: count, dtype: int64
age
4     16
8     16
80    16
78    16
72    16
68    16
64    16
60    16
56    16
52    16
48    16
44    16
40    16
36    16
32    16
24    16
20    16
12    16
84    16
28    15
Name: count, dtype: int64


In [9]:
# Calculate p-values using scipy's pearsonr function
from scipy.stats import spearmanr
correlation_matrix = m_data.corrwith(f_data, axis=0, method='spearman')

p_values = m_data.apply(lambda col: spearmanr(col, f_data[col.name])[1]) #change by number of syllables used

#colors = ['purple' if value > 0 else 'orange' for value in new_diff]

# Combine results into a DataFrame
result_df = pd.DataFrame({'Correlation': correlation_matrix, 'P-Value': p_values})

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 319 and the array at index 1 has size 318

In [None]:
sorted_result_df = result_df.sort_values(by='Correlation', ascending=False)

In [None]:
sorted_result_df['P-Value']*len(keep_syllables)

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix.to_frame(), annot=True, cmap="coolwarm", cbar=True, square=True)
plt.title("Spearman Correlation Heatmap")
plt.show()

In [None]:
# Create a DataFrame from the correlation values list
from matplotlib.colors import TwoSlopeNorm
# Plotting
fig, ax = plt.subplots()
fig.set_size_inches(3,3)

# Create a DataFrame from the correlation values list
sorted_result_df = result_df.sort_values(by='Correlation', ascending=False)
df = sorted_result_df.copy()
df['category'] =  -df['Correlation'].rank()
df['value'] = df['Correlation']

# Assume you have your data in a DataFrame called 'df'
# with columns 'category' and 'value'

# Get the minimum and maximum values for normalization
#min_val = df['value'].min()
#max_val = df['value'].max()

min_val = -1
max_val = 1

# Create a TwoSlopeNorm object with the desired center value (0 in this case)
norm = TwoSlopeNorm(vmin=min_val, vcenter=0, vmax=max_val)

# Create a colormap object based on 'RdBu_r'
cmap = plt.cm.coolwarm

# Map the normalized values to colors using the colormap
colors = cmap(norm(df['value']))

# Create a custom color palette using the mapped colors
palette = sns.color_palette(colors)

# Create the barplot using Seaborn with the custom color palette
ax = sns.barplot(x=list(range(len(sorted_result_df))), y='value', data=df, palette=palette)

# Add labels and title
plt.xlabel('Category')
plt.ylabel('Value')
plt.title('Barplot with Custom Color Palette')
plt.ylim([-0.5,1])
sns.despine()
# Show the plot
plt.show()
fig.savefig(c.dana_save_path / "fig3"/ 'fvm_syll_crosscorr.pdf')