In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib as mpl
import os
from matplotlib.lines import Line2D
from collections import Counter
import math
from sklearn.decomposition import PCA
from aging.behavior.syllables import relabel_by_usage, compute_usage
%matplotlib inline
import datetime

In [2]:
# plot/colors definitions
cpath = '/n/groups/datta/win/longtogeny/code/notebooks/exploration/Dana'
data_loc=cpath+'/figs/'
try:
    os.mkdir(data_loc)
except FileExistsError:
    # directory already exists
    pass

sns.set_style('white')

In [3]:
def compute_usage(df):
    onsets = np.where(np.diff(df['syllables']) != 0)[0]
    usage = df.loc[df.index[onsets], 'syllables'].value_counts(normalize=True)
    return usage

In [4]:
# filter by syllable usage
def syll_index_above_threshold(mean_df, threshold = 0.01):
    syll_mean_usage = mean_df.groupby(['syllables']).mean()
    return syll_mean_usage[syll_mean_usage > threshold].index.to_list()

In [5]:
## for males

In [6]:
## upload data frame females
data_folder = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_02')
path = Path('/n/groups/datta/win/longtogeny/data/ontogeny/version_02/longtogeny_males.parquet')
df = pd.read_parquet(path)

In [7]:
# calculate age for males
first= pd.Timestamp(year=2021, month=3, day=30)
age = (df.date-first).dt.days+21
df.age=age

In [8]:
# calculate mouse name for males and replace with 1-16)
df['mouse'] = df.subject_name.str[:5]

# some manual curation

In [9]:
## keep only mice that are part of the male experiment
keep_mice = ['01_01',
 '01_02',
 '01_03',
 '01_04',
 '02_01',
 '02_02',
 '02_03',
 '02_04',
 '03_01',
 '03_02',
 '03_03',
 '03_04',
 '04_01',
 '04_02',
 '04_03',
 '04_04']
df = df[df['mouse'].isin(keep_mice)].reset_index()
#df['mouse'] = df['mouse'].replace(to_replace=df['mouse'].unique(), value=list(range(1,len(df['mouse'].unique())+1)))

In [10]:
# figure out if male and female data mixed by checking if for similar age we have a different session number
# calculate usage
temp = df.groupby(['age','uuid','subject_name','date'], observed=True, sort=False).apply(compute_usage)
temp = pd.pivot_table(temp.reset_index(), index=['age','uuid','subject_name','date'], columns='syllables', values='proportion').fillna(0).reset_index()

ages = temp.age.unique()
for age in ages:
    curr = temp[temp.age==age]
    sessions = curr.subject_name.str[-3:] #get session number
    if len(sessions.unique())>1: #if the same age has different session numbers
        curr_print = curr[['subject_name','age','date']]
        print(curr_print)
        print(len(curr_print))

syllables subject_name  age                date
28           04_03_023   23 2021-04-01 16:55:00
29           03_03_023   23 2021-04-01 16:11:12
30           04_02_023   23 2021-04-01 16:54:55
31           02_02_023   23 2021-04-01 15:28:56
32            01_01_23   23 2021-04-01 14:48:07
33            02_01_23   23 2021-04-01 15:28:49
34            03_01_23   23 2021-04-01 16:10:58
35           03_02_023   23 2021-04-01 16:11:07
36           02_03_023   23 2021-04-01 15:29:01
37           01_02_023   23 2021-04-01 14:48:19
38           01_03_023   23 2021-04-01 14:48:28
39            04_01_23   23 2021-04-01 16:54:47
12
syllables subject_name  age                date
52           02_01_025   25 2021-04-03 15:52:36
53         01_03_025_1   25 2021-04-03 18:48:02
54         02_02_025_1   25 2021-04-03 18:20:11
55           04_01_025   25 2021-04-03 17:46:19
56           02_04_025   25 2021-04-03 15:52:59
57           01_03_025   25 2021-04-03 14:40:48
58           03_03_025   25 2021-04-0

In [11]:
# replace a time stamp where mouse 04_03 is accidently marked as 03_03
t1 = pd.Timestamp('2021-04-03 17:46:45')
df['subject_name'].loc[df.date==t1]='04_03_025'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subject_name'].loc[df.date==t1]='04_03_025'


In [12]:
# find short sessions and filter them out
from collections import Counter
counter = Counter(df['subject_name'])
remove_sessions = ([k for k, v in counter.items() if v < 30000])
df = df[~df['subject_name'].isin(remove_sessions)]

In [13]:
counter = Counter(df['subject_name'])
counter

Counter({'02_04_032': 71960,
         '01_03_031': 71952,
         '01_02_060': 71939,
         '02_02_032': 71930,
         '01_01_030': 71930,
         '01_02_031': 71927,
         '03_01_200': 71926,
         '03_02_200': 71926,
         '03_04_032': 71923,
         '02_04_200': 71923,
         '03_02_060': 71920,
         '02_02_031': 71920,
         '02_03_031': 71919,
         '01_01_200': 71919,
         '03_01_031': 71919,
         '02_02_200': 71919,
         '03_04_200': 71918,
         '04_02_031': 71918,
         '01_04_200': 71918,
         '03_03_031': 71917,
         '04_02_200': 71917,
         '04_01_031': 71916,
         '02_01_032': 71916,
         '01_03_032': 71916,
         '04_04_200': 71915,
         '04_01_200': 71914,
         '02_02_060': 71914,
         '02_01_200': 71913,
         '01_04_030': 71912,
         '03_04_030': 71911,
         '03_01_030': 71911,
         '04_04_030': 71911,
         '03_01_032': 71910,
         '04_01_030': 71909,
         '03_0

In [14]:
# remove session copies that i think are females
# have the same date but different sessions number - 60 instead of 200
fsessions = [pd.Timestamp('2021-10-01 15:47:32'),
             pd.Timestamp('2021-10-01 16:46:10'),
             pd.Timestamp('2021-10-01 17:22:43'),
             pd.Timestamp('2021-10-01 16:17:56')]
df=df[~df.date.isin(fsessions)]

In [15]:
# double check all is good - each date should have less than 16 sessions
temp = df.groupby(['age','uuid','subject_name','date'], observed=True, sort=False).apply(compute_usage)
temp = pd.pivot_table(temp.reset_index(), index=['age','uuid','subject_name','date'], columns='syllables', values='proportion').fillna(0).reset_index()

ages = temp.age.unique()
for age in ages:
    curr = temp[temp.age==age]
    sessions = curr.subject_name.str[-3:] #get session number
    if len(sessions.unique())>1: #if the same age has different session numbers
        curr_print = curr[['subject_name','age','date']]
        print(curr_print)
        print(len(curr_print))

syllables subject_name  age                date
28           04_02_023   23 2021-04-01 16:54:55
29           02_02_023   23 2021-04-01 15:28:56
30            01_01_23   23 2021-04-01 14:48:07
31            02_01_23   23 2021-04-01 15:28:49
32            03_01_23   23 2021-04-01 16:10:58
33           03_02_023   23 2021-04-01 16:11:07
34           01_02_023   23 2021-04-01 14:48:19
35            04_01_23   23 2021-04-01 16:54:47
8
syllables subject_name  age                date
44           02_01_025   25 2021-04-03 15:52:36
45         01_03_025_1   25 2021-04-03 18:48:02
46         02_02_025_1   25 2021-04-03 18:20:11
47           04_01_025   25 2021-04-03 17:46:19
48           02_04_025   25 2021-04-03 15:52:59
49           04_02_025   25 2021-04-03 17:46:30
50           04_03_025   25 2021-04-03 17:46:45
51           01_02_025   25 2021-04-03 14:40:00
52           01_01_025   25 2021-04-03 14:39:28
53           03_04_025   25 2021-04-03 16:32:39
54           01_04_025   25 2021-04-03

In [16]:
# find sessions in which a single syllable is exceuted more than half the time and filter them out
# find ages where a syllable is used more than 30% of the time and remove them
thresh=0.35
temp = df.groupby(['age','uuid','subject_name','date'], observed=True, sort=False).apply(compute_usage).reset_index()
remove_sessions = temp.subject_name[temp['proportion']>thresh]
df = df[~df['subject_name'].isin(remove_sessions)]

In [None]:
# remove sessions when only 10 syllables are used

In [None]:
# find sessions in which a single syllable is exceuted more than half the time and filter them out
# find ages where a syllable is used more than 30% of the time and remove them
thresh=0.35
temp = df.groupby(['age','uuid','subject_name','date'], observed=True, sort=False).apply(compute_usage).reset_index()
remove_sessions = temp.subject_name[temp['proportion']>thresh]
df = df[~df['subject_name'].isin(remove_sessions)]

In [17]:
# find ages where less than 10 data points exists and filter them out
thresh=10
temp = df.groupby(['age','uuid','subject_name','date'], observed=True, sort=False).apply(compute_usage)
temp = pd.pivot_table(temp.reset_index(), index=['age','uuid','subject_name','date'], columns='syllables', values='proportion').fillna(0).reset_index()
counter = Counter(temp['age'])
remove_sessions = ([k for k, v in counter.items() if v < thresh])
df = df[~df['age'].isin(remove_sessions)]

In [18]:
#check
temp = df.groupby(['age','uuid','subject_name','date'], observed=True, sort=False).apply(compute_usage)
temp = pd.pivot_table(temp.reset_index(), index=['age','uuid','subject_name','date'], columns='syllables', values='proportion').fillna(0).reset_index()
counter = Counter(temp['age'])
counter

Counter({21: 16,
         25: 16,
         27: 16,
         48: 16,
         55: 16,
         58: 16,
         62: 16,
         66: 16,
         73: 16,
         76: 16,
         84: 16,
         87: 16,
         90: 16,
         93: 16,
         97: 16,
         107: 16,
         114: 16,
         121: 16,
         128: 16,
         135: 16,
         149: 16,
         157: 16,
         165: 16,
         183: 16,
         194: 16,
         199: 16,
         206: 16,
         215: 16,
         221: 16,
         228: 16,
         42: 15,
         45: 15,
         70: 15,
         250: 15,
         270: 15,
         285: 15,
         354: 15,
         360: 15,
         376: 15,
         381: 15,
         385: 15,
         395: 15,
         449: 15,
         26: 14,
         28: 14,
         80: 14,
         243: 14,
         366: 14,
         404: 14,
         411: 14,
         418: 14,
         431: 14,
         439: 14,
         460: 14,
         468: 14,
         479: 14,
         488:

## end curation

In [19]:
# define cage
df['cage'] = df.subject_name.str[:2]

In [20]:
# calculate usage
musages = df.groupby(['age','mouse','uuid','experiment','subject_name','cage'], observed=True, sort=False).apply(compute_usage)
musage_mtx = pd.pivot_table(musages.reset_index(), index=['age', 'mouse','cage','experiment','uuid'], columns='syllables', values='proportion').fillna(0)

#filter most used syllables just males
musage_mtx.to_parquet(data_folder / "longtogeny_musages_mtx_all.parquet", engine='pyarrow') 
include_syll = syll_index_above_threshold(musages)
musage_mtx = musage_mtx[include_syll]
musage_mtx.to_parquet(data_folder / "longtogeny_musages_mtx_most_used.parquet", engine='pyarrow') 