In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib as mpl
import os
from toolz import concat
from matplotlib.lines import Line2D
from collections import Counter
import math
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from aging.behavior.syllables import relabel_by_usage
from tqdm import tqdm
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
import random
import scipy
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score
from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import jensenshannon
%matplotlib inline
from aging.plotting import format_plots, PlotConfig, save_factory, figure, legend, format_pizza_plots

In [2]:
# Apply the plot-formatting helper imported from aging.plotting.
format_plots()
# Alternative formatting helper from the same module, currently disabled.
#format_pizza_plots()

In [3]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop linear colormaps (256 levels each) used as hue palettes by the
# plotting cells: `cmf` for the female data, `cmm` for the male data.
# NOTE(review): both maps are created under the name "custom_purples" even
# though the first pair of hex stops looks orange - presumably a copy-paste
# leftover; the name is kept unchanged here.
cmf = LinearSegmentedColormap.from_list("custom_purples", ['#fee6ce', '#d94801'], N=256)

colors = ['#dadaeb', '#6a51a3']
cmm = custom_cmap = LinearSegmentedColormap.from_list("custom_purples", colors, N=256)

In [4]:
## update data
def remove_crl_male(df, max_syllable=39):
    """Return a filtered copy of the male syllable-usage frame.

    Three filters are applied, in this order:

    1. rows whose index level 3 equals one hard-coded session uuid are dropped;
    2. columns whose label is greater than ``max_syllable`` are dropped;
    3. rows whose ``session_name`` index level contains ``'CRL'`` are dropped.

    The input frame is not modified; earlier versions dropped rows and columns
    from ``df`` in place as a side effect, which made re-running a cell on the
    same object fail or silently change it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex of at least four levels, one of them named
        ``session_name``, and column labels comparable to an integer.
    max_syllable : int, default 39
        Largest column label that is kept.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows and columns above removed.

    Raises
    ------
    KeyError
        If the hard-coded uuid is not present in index level 3
        (default ``errors='raise'`` behaviour of ``DataFrame.drop``).
    """
    excluded_uuid = '9c060c9b-9eee-4788-90be-803700bbacd8'
    # Non-inplace drops: each call returns a new frame and leaves `df` intact.
    kept = df.drop(excluded_uuid, level=3, axis=0)
    high_syllables = kept.columns.values[kept.columns.values > max_syllable]
    kept = kept.drop(high_syllables, axis=1)
    is_crl = kept.index.get_level_values('session_name').str.contains('CRL')
    return kept[~is_crl]

def remove_crl_female(df, max_syllable=39):
    """Return a filtered copy of the female syllable-usage frame.

    Two filters are applied, in this order:

    1. columns whose label is greater than ``max_syllable`` are dropped;
    2. rows whose ``session_name`` index level contains ``'CRL'`` are dropped.

    The input frame is not modified; earlier versions dropped the columns from
    ``df`` in place as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index has a level named ``session_name`` and whose column
        labels are comparable to an integer.
    max_syllable : int, default 39
        Largest column label that is kept.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns and rows above removed.
    """
    high_syllables = df.columns.values[df.columns.values > max_syllable]
    # Non-inplace drop: returns a new frame and leaves `df` intact.
    kept = df.drop(high_syllables, axis=1)
    is_crl = kept.index.get_level_values('session_name').str.contains('CRL')
    return kept[~is_crl]

In [5]:
# Assemble the male frame: syllable usages joined with pseudotime annotations.

# Male pseudotime table, reduced to the key and the three annotation columns.
pseudotime_columns = ['uuid', 'pseudotime_rank', 'age', 'pseudo_age']
m_psd = pd.read_parquet(
    Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_07/ontogeny_males_clean_pseudotime_v02.parquet')
).reset_index()[pseudotime_columns]

# The "clean v2" usage table is the left side of the join, so only its rows
# (after the CRL / excluded-session filtering) end up in the result.
path = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_07/ontogeny_males_clean_v2.parquet')
df_males = remove_crl_male(pd.read_parquet(path))

m_psd = (
    df_males
    .merge(m_psd, on='uuid', how='left')
    .rename(columns={'pseudotime_rank': 'psd'})
    .set_index(['psd', 'uuid', 'age'], append=True)
)

# Expose pseudo_age as an index level called `remapped_bins`, then keep only
# the columns whose label contains a digit (the syllable columns).
m_psd['remapped_bins'] = m_psd['pseudo_age']
m_psd = m_psd.set_index(['remapped_bins'], append=True).filter(regex=r'\d')

In [6]:
# Assemble the female frame: syllable usages joined with pseudotime
# annotations, plus pseudotime-derived age bins.

# Female pseudotime table, reduced to the key and the annotation columns.
path = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_07/ontogeny_females_clean_pseudotime_v02.parquet')
f_psd = pd.read_parquet(path)
f_psd = f_psd.reset_index()
f_psd = f_psd[['uuid', 'pseudotime_rank', 'age']]

# The "clean v2" usage table is the left side of the join, so only its rows
# (after the CRL filtering) end up in the result.
path = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_07/ontogeny_females_clean_v2.parquet')
df_females = pd.read_parquet(path)
df_females = remove_crl_female(df_females)
f_psd = df_females.merge(f_psd, on='uuid', how='left')

# Move the annotations into the index and keep only the columns whose label
# contains a digit (the syllable columns).
f_psd = (
    f_psd
    .rename(columns={'pseudotime_rank': 'psd'})
    .set_index(['psd', 'uuid', 'age'], append=True)
    .filter(regex=r'\d')
)

# Convert the female pseudotime rank into as many quantile bins as there are
# distinct ages, then relabel bin k with the k-th smallest age.
# The ages are sorted explicitly: `unique()` returns values in order of
# appearance, so without the sort the bin -> age mapping was only correct if
# the rows happened to be ordered by age.
ages = f_psd.index.get_level_values('age').unique().sort_values().to_numpy()
f_psd['bins'] = pd.qcut(f_psd.index.get_level_values('psd'), len(ages), labels=False)

mapdr_females = dict(zip(range(len(ages)), ages))
y_temp = [mapdr_females[item] for item in f_psd['bins'].to_numpy()]

f_psd['remapped_bins'] = y_temp
f_psd = f_psd.set_index(['bins', 'remapped_bins'], append=True)

In [7]:
# Fit a k-nearest-neighbours regressor from male syllable usages to male
# pseudotime rank, then use it to predict a pseudotime rank for every female row.
from sklearn.cross_decomposition import PLSRegression
from sklearn import neighbors

mdl = neighbors.KNeighborsRegressor(n_neighbors=8)
#mdl = PLSRegression(n_components=3)
x = m_psd.filter(regex=r'\d')
y = x.index.get_level_values('psd')
mdl.fit(x, y)
# In-sample score: computed on the same rows the model was fit on.
print(mdl.score(x, y))
train_columns = x.columns

#if you want to log transform
#x = np.log(x + 1e-6) # convert to linear

# Align the female features to the training columns (same set, same order).
# The estimator only sees positional columns, so a missing or reordered
# syllable column would otherwise fail or be silently matched to the wrong
# feature. Syllables absent from the female frame are filled with 0.
x = f_psd.filter(regex=r'\d').reindex(columns=train_columns, fill_value=0)
psd = mdl.predict(x)

In [8]:
# Attach the male-model predictions to the female frame and bin them the same
# way as the female pseudotime: one quantile bin per distinct age, relabelled
# through `mapdr_females`.
f_psd['proj_psd'] = psd
n_age_bins = len(f_psd.index.get_level_values('age').unique())
f_psd['proj_bins'] = pd.qcut(f_psd['proj_psd'], n_age_bins, labels=False)

y_temp = [mapdr_females[bin_id] for bin_id in f_psd['proj_bins'].to_numpy()]
f_psd['proj_remapped_bins'] = y_temp

# NOTE: the three new columns become index levels, so this cell cannot be
# re-run on the resulting frame without rebuilding `f_psd` first.
f_psd = f_psd.set_index(['proj_psd', 'proj_bins', 'proj_remapped_bins'], append=True)

In [9]:
# Female rows: actual age (x) against the age bin derived from the
# male-model projection (y).
fig = figure(2, 2)
ax = sns.scatterplot(data=f_psd, x="age", y="proj_remapped_bins", legend=False)
# Dashed black identity line from (0, 0) to (80, 80) for reference.
ax.plot([0, 80], [0, 80], color="k", linestyle="--")
ax.set_title('proj psd')
sns.despine()

In [10]:
# Female rows: actual age (x) against the age bin derived from the female
# pseudotime rank (y).
fig = figure(2, 2)
ax = sns.scatterplot(data=f_psd, x="age", y="remapped_bins", legend=False)
# Dashed black identity line from (0, 0) to (80, 80) for reference.
ax.plot([0, 80], [0, 80], color="k", linestyle="--")
ax.set_title('psd')
sns.despine()

In [11]:
# Disabled alternative kept for reference: everything below is a single string
# literal, so none of it executes (the cell only echoes the string).
'''
## project female data onto male pseudo time
ont_df = pd.read_parquet('/n/groups/datta/win/longtogeny/data/ontogeny/version_07/ontogeny_males_clean_pseudotime_v02.parquet')
count_map = ont_df.groupby('age')['0'].count()
ages = list(concat([i] * n for i, n in count_map.items()))
ont_sort = ont_df.sort_values(by='pseudotime_rank')
ont_sort['pseudo_age'] = ages

pipeline = make_pipeline(KNeighborsRegressor(n_neighbors=4, metric=jensenshannon))
pipeline = pipeline.fit(ont_sort.iloc[:, :-3], y=ont_sort["pseudo_age"])

psd = pipeline.predict(df_females.reindex(columns=list(map(int, ont_sort.columns[:-3]))).fillna(0))
psd_df = pd.Series(psd, index=df_females.index, name='pseudoage').reset_index()
'''

In [12]:
# Flat copies of the male and female frames: reset_index() turns every index
# level (age, psd, remapped_bins, ...) into an ordinary column so the plotting
# and correlation cells can address them by name.
# Disabled variant that would first exclude the rows with age == 90:
#m_data = m_psd.query('age != 90').reset_index()
m_data = m_psd.reset_index()
f_data = f_psd.reset_index()

In [13]:
#sns.scatterplot(data=f_psd, x='age', y='remapped_bins')
# Per-age mean +/- s.e. markers (no connecting line) for two female estimates
# drawn on the same axes: projected through the male model (palette `cmf`)
# and derived from the female pseudotime (palette 'Greens').
mean_se_style = dict(
    x='age',
    hue='age',
    estimator='mean',
    errorbar='se',
    err_style="bars",
    markers=True,
    marker='o',
    markersize=10,
    linestyle='',
    markeredgecolor='grey',
    markeredgewidth=0.25,
    legend=False,
)
sns.lineplot(data=f_data, y='proj_remapped_bins', palette=cmf, **mean_se_style)
sns.lineplot(data=f_data, y='remapped_bins', palette='Greens', **mean_se_style)
sns.despine()
# Dashed grey identity line (age against itself).
plt.plot(f_data['age'], f_data['age'], linestyle='--', color='gray')
plt.title(' predicted age vs actual age - orange projected males, green calculated female')

In [14]:
#sns.scatterplot(data=f_psd, x='age', y='remapped_bins')
# Per-age mean +/- s.e. markers of the female pseudotime-derived age bin
# against actual age.
# The figure is created and bound to `fig` explicitly: previously this cell
# drew on an implicit pyplot figure, so a later `fig.savefig(...)` wrote
# whatever older figure `fig` still pointed to instead of this plot.
fig, ax = plt.subplots()
sns.lineplot(data=f_data, x='age', y='remapped_bins', estimator='mean',
             errorbar='se',
             err_style="bars",
             markers=True,
             marker='o',
             markersize=10,
             palette=cmf,
             hue='age',
             linestyle='',
             markeredgecolor='grey', markeredgewidth=0.25,
             legend=False,
             ax=ax)
sns.despine()
# Dashed grey identity line (age against itself).
ax.plot(f_data['age'], f_data['age'], linestyle='--', color='gray')
# Title corrected: only the calculated female series is drawn here, using the
# `cmf` palette; the old text described a two-series orange/green plot.
ax.set_title(' predicted age vs actual age - orange calculated female')

In [15]:
# Write the figure currently bound to `fig` as a PDF under <dana_save_path>/fig4.
# NOTE(review): `fig` is whichever figure object an earlier cell last assigned
# to that name - confirm it is the plot this file name refers to.
c = PlotConfig()
pdf_path = c.dana_save_path / "fig4" / 'female_pred_vs_actual.pdf'
fig.savefig(pdf_path)

In [16]:
## panel for fig

In [17]:
#sns.scatterplot(data=f_psd, x='age', y='remapped_bins')
# Figure panel (3 x 3 in): per-age mean +/- s.e. of the female age bin
# projected through the male model, against actual age.
fig, ax = plt.subplots(figsize=(3, 3))
panel_style = dict(
    estimator='mean',
    errorbar='se',
    err_style="bars",
    markers=True,
    marker='o',
    markersize=8,
    hue='age',
    linestyle='',
    alpha=0.75,
    markeredgecolor='silver',
    markeredgewidth=0.25,
    legend=False,
)
sns.lineplot(data=f_data, x='age', y='proj_remapped_bins', palette=cmf, ax=ax, **panel_style)
sns.despine()
# Dashed grey identity line (age against itself).
ax.plot(f_data['age'], f_data['age'], linestyle='--', color='gray')
#plt.title(' predicted age vs actual age - orange females, purple males')
ax.set_ylim([0, 75])
ax.set_xlim([0, 75])

In [18]:
# Write the figure currently bound to `fig` as a PDF under <dana_save_path>/fig4.
c = PlotConfig()
pdf_path = c.dana_save_path / "fig4" / 'female_proj_pred_vs_actual.pdf'
fig.savefig(pdf_path)

In [19]:
#sns.scatterplot(data=f_psd, x='age', y='remapped_bins')
# Figure panel (3 x 3 in): per-age mean +/- s.e. of the pseudotime-derived age
# bin against actual age, females (palette `cmf`) and males (palette `cmm`)
# on the same axes.
fig, ax = plt.subplots(figsize=(3, 3))
panel_style = dict(
    x='age',
    y='remapped_bins',
    hue='age',
    estimator='mean',
    errorbar='se',
    err_style="bars",
    markers=True,
    marker='o',
    markersize=8,
    linestyle='',
    alpha=0.75,
    markeredgecolor='silver',
    markeredgewidth=0.25,
    legend=False,
)
sns.lineplot(data=f_data, palette=cmf, ax=ax, **panel_style)
#norm='log' was tried for the male series and is disabled
sns.lineplot(data=m_data, palette=cmm, ax=ax, **panel_style)
sns.despine()
# Dashed grey identity line over the male age range.
ax.plot(m_data['age'], m_data['age'], linestyle='--', color='gray')
#plt.title(' predicted age vs actual age - orange females, purple males')
ax.set_ylim([0, 95])
ax.set_xlim([0, 95])

In [20]:
# Write the figure currently bound to `fig` as a PDF under <dana_save_path>/fig4.
c = PlotConfig()
pdf_path = c.dana_save_path / "fig4" / 'male_female_pred_vs_actual.pdf'
fig.savefig(pdf_path)

In [21]:
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# Correlation between actual age and the pseudotime-derived age bin, males.
# Both coefficients are reported: previously the Pearson result was
# overwritten by the Spearman one before anything was displayed (a bare
# expression in the middle of a cell produces no output).
male_age = m_data.age.values
male_pseudo_age = m_data.remapped_bins.values

pearson_r, pearson_p = pearsonr(male_age, male_pseudo_age)
# `correlation_coefficient` / `p_value` end up holding the Spearman result,
# exactly as before.
correlation_coefficient, p_value = spearmanr(male_age, male_pseudo_age)

print(f"males, age vs remapped_bins: Pearson r = {pearson_r:.3f} (p = {pearson_p:.3g})")
print(f"males, age vs remapped_bins: Spearman rho = {correlation_coefficient:.3f} (p = {p_value:.3g})")
correlation_coefficient

In [22]:
# Spearman rank correlation between actual age and the female
# pseudotime-derived age bin; overwrites the previous cell's result.
correlation_coefficient, p_value = spearmanr(f_data.age.values, f_data.remapped_bins.values)
# Bare last expression so the notebook displays the coefficient.
correlation_coefficient

In [23]:
# Plot the male residuals (actual age minus pseudotime-derived age bin), one
# point per row of `m_data` in row order, and save the figure as a PDF.
res = m_data.age.values - m_data.remapped_bins.values
res = pd.DataFrame(res)
res['age'] = m_data.age.values
x_labels = m_data.age.values

# Bind `fig` to the real Figure: `fig = plt.plot(...)` used to store the
# returned list of Line2D objects under that name.
fig, ax = plt.subplots()
ax.plot(res[0], color='purple', lw=1, alpha=0.7)
# One tick per row, labelled with that row's actual age.
plt.xticks(range(len(x_labels)), x_labels, fontsize=3)
#plt.ylim([-20, 5])
sns.despine()
c = PlotConfig()
fig.savefig(c.dana_save_path / "fig4"/ 'male_age_behavioral_residues.pdf', bbox_inches='tight')

In [24]:
# Pearson correlation between actual age and the female age bin projected
# through the male model; overwrites the previous correlation result.
correlation_coefficient, p_value = pearsonr(f_data.age.values, f_data.proj_remapped_bins.values)
# Bare last expression so the notebook displays the coefficient.
correlation_coefficient

In [25]:
# Plot the female residuals (actual age minus pseudotime-derived age bin), one
# point per row of `f_data` in row order, and save the figure as a PDF.
res = f_data.age.values - f_data.remapped_bins.values
res = pd.DataFrame(res)
res['age'] = f_data.age.values
x_labels = f_data.age.values

# Bind `fig` to the real Figure: `fig = plt.plot(...)` used to store the
# returned list of Line2D objects under that name.
fig, ax = plt.subplots()
ax.plot(res[0], color='orange', lw=1, alpha=0.7)
# One tick per row, labelled with that row's actual age.
plt.xticks(range(len(x_labels)), x_labels, fontsize=3)
#plt.ylim([-20, 5])
sns.despine()
c = PlotConfig()
fig.savefig(c.dana_save_path / "fig4"/ 'female_pseudoage_residues.pdf', bbox_inches='tight')