In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from aging.behavior.syllables import relabel_by_usage
import colorcet as cc
from pathlib import Path
import math

In [2]:
def age_map(string) -> int:
    '''Return the age in weeks encoded in an age label.

    The label is scanned for ``<number>m`` (months) and ``<number>w`` (weeks)
    tokens. Whole numbers are parsed, so ``'13w'`` is read as 13 weeks and is
    not mistaken for a label containing ``'3w'``.

    Parameters
    ----------
    string : str
        Age label, e.g. ``'3m'``, ``'12m'``, ``'7w'`` or a bare number such
        as ``'52'``.

    Returns
    -------
    int
        Age in weeks. Known month labels are converted with a fixed table
        (3m=12, 6m=24, 9m=35, 12m=52, 18m=78, 22m=90); week labels return
        their own number; anything else falls back to ``int(string[:2])``.

    Raises
    ------
    ValueError
        If no month/week token is found and the first two characters are
        not an integer.
    '''
    import re  # local import keeps this cell independent of the import cell

    month_to_weeks = {3: 12, 6: 24, 9: 35, 12: 52, 18: 78, 22: 90}
    tokens = [
        (int(number), unit)
        for number, unit in re.findall(r'(\d+)([mw])', string)
    ]

    # Month labels take priority over week labels, checked in table order.
    for months, weeks in month_to_weeks.items():
        if (months, 'm') in tokens:
            return weeks

    # Any week label is already expressed in weeks.
    for number, unit in tokens:
        if unit == 'w':
            return number

    # No recognised token: interpret the first two characters as weeks.
    return int(string[:2])

In [3]:
def age_map_fun(age):
    """Convert an age label to an age in weeks.

    The text before the first ``"w"`` is parsed as an integer number of weeks.
    If that fails, the text before the first ``"m"`` is looked up in a fixed
    months-to-weeks table.

    Raises ``KeyError`` when the label is neither an integer week count nor
    one of the known month labels.
    """
    months_to_weeks = {"3": 12, "6": 24, "9": 35, "12": 52, "18": 78, "22": 90}
    week_part = age.partition("w")[0]
    try:
        weeks = int(week_part)
    except ValueError:
        # Not a week label: fall back to the month table.
        month_part = age.partition("m")[0]
        weeks = months_to_weeks[month_part]
    return weeks

## Load data and housekeeping

In [4]:
# Read the syllable dataframe and keep only rows with timestamps below 1200
# (presumably seconds, i.e. the first 20 minutes of a session -- TODO confirm units).
syllable_path = '/n/groups/datta/win/longtogeny/data/ontogeny/version_07/ontogeny_males_syllable_df_v00.parquet'
df = pd.read_parquet(syllable_path).query('timestamps<1200')

In [5]:
# Relabel syllables with relabel_by_usage (only the dataframe is returned, not the map).
df = relabel_by_usage(df, return_map=False)

# Exclude every session whose name contains 'CRL'.
is_crl_session = df['session_name'].str.contains('CRL')
df = df[~is_crl_session]

In [6]:
# Convert the raw age labels to integer ages in weeks.
# The lookup is built once per distinct label instead of once per row. It is
# named `age_to_weeks` so that it does not shadow the `age_map` function
# defined earlier in the notebook.
unique_ages = df['age'].unique()
age_to_weeks = {label: age_map_fun(label) for label in unique_ages}
df['age'] = df['age'].map(age_to_weeks).astype('int16[pyarrow]')

In [7]:
# Drop sessions (one per uuid) whose largest timestamp is below the threshold `t`.
t = 600  # minimum session length, in the same units as `timestamps`
session_length = df.groupby("uuid", sort=False)['timestamps'].max()

# Show the distribution of session lengths before filtering.
session_length.hist()

is_too_short = session_length < t
remove_uuids = session_length[is_too_short].index
df = df[~df['uuid'].isin(remove_uuids)]

In [8]:
# When a subject has more than one uuid on the same date, keep only the rows
# whose file path contains 'proc_cleaned'; groups with a single uuid pass through.
# NOTE(review): a group with several uuids and no 'proc_cleaned' file is dropped
# entirely -- confirm this is intended.
deduplicated = []
for _, session_rows in df.groupby(['date', 'subject_name'], sort=False):
    has_duplicates = session_rows['uuid'].nunique() > 1
    if has_duplicates:
        from_cleaned_file = session_rows['file'].str.contains('proc_cleaned')
        session_rows = session_rows[from_cleaned_file]
    deduplicated.append(session_rows)

df = pd.concat(deduplicated)

In [9]:
# compute usage

In [10]:
# Normalised syllable usage per session, counted over rows where `onsets` is true.
session_keys = ['age', 'subject_name', 'session_name', 'uuid', 'date']
usage_df = (
    df.query('onsets')
    .groupby(session_keys)['relabeled_syllables']
    .value_counts(normalize=True)
    .rename('usage')
    .reset_index()
)

# Wide form: one row per session, one column per syllable; syllables absent
# from a session get a usage of 0.
usage_mtx = usage_df.pivot_table(
    index=session_keys,
    columns='relabeled_syllables',
    values='usage',
).fillna(0)

In [11]:
# Mean of `velocity_2d_mm` per session, keyed like the rows of the usage matrix.
session_keys = ['age', 'subject_name', 'session_name', 'uuid', 'date']
v = df.groupby(session_keys)['velocity_2d_mm'].mean()

# Store velocity as an extra index level so the columns stay syllable usages only,
# then discard the age-35 sessions (35 is what age_map_fun assigns to '9m').
usage_mtx = (
    usage_mtx
    .assign(velocity=v)
    .set_index('velocity', append=True)
    .drop(35, level='age')
)

In [12]:
from collections import Counter

# Number of rows per value of index level 0 (age).
age_level = usage_mtx.index.get_level_values(0)
Counter(age_level)

In [13]:
# Overlay the usage profile of every row of usage_mtx (one line per row).
usage_profiles = usage_mtx.to_numpy().T
plt.plot(usage_profiles, color='k', alpha=0.2);

In [14]:
# Drop every row in which at least one syllable's usage exceeds the threshold `us`.
us = 0.2
has_extreme_usage = (usage_mtx > us).any(axis=1)
usage_mtx = usage_mtx[~has_extreme_usage]

# Re-plot the remaining usage profiles (one line per row).
usage_profiles = usage_mtx.to_numpy().T
plt.plot(usage_profiles, color='k', alpha=0.2);

In [15]:
# Sorted distinct values of index level 1 (subject_name).
subject_level = usage_mtx.index.get_level_values(1)
sorted(subject_level.unique())

In [16]:
from collections import Counter

# Number of rows per value of index level 0 (age) after the usage filter.
age_level = usage_mtx.index.get_level_values(0)
Counter(age_level)

In [17]:
# Write the cleaned usage matrix to the version_07 data folder.
data_folder = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_07')
clean_v1_path = data_folder / "ontogeny_males_clean_20mins_v1.parquet"
usage_mtx.to_parquet(clean_v1_path, engine='pyarrow')

In [18]:
# Cap every age group at `n` rows: groups with more than `n` rows are randomly
# subsampled without replacement; smaller groups keep all their rows (their
# order is still shuffled, as before).
# A fixed seed makes the subsample, and the v2 file written from it,
# reproducible across kernel restarts.
n = 16
seed = 0
usage_mtx = (
    usage_mtx
    .groupby('age')
    .apply(lambda x: x.sample(n=min(n, len(x)), replace=False, random_state=seed))
    # apply() prepends the group key as a new level 0; drop that duplicate.
    .reset_index(level=0, drop=True)
)

In [19]:
# Number of rows per value of index level 0 after subsampling.
age_level = usage_mtx.index.get_level_values(0)
Counter(age_level)

In [20]:
# Write the subsampled usage matrix next to the v1 file.
data_folder = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_07')
clean_v2_path = data_folder / "ontogeny_males_clean_20mins_v2.parquet"
usage_mtx.to_parquet(clean_v2_path, engine='pyarrow')