In [1]:
import os
import time
from pathlib import Path
import pandas as pd
import pyabf

In [2]:
# `summary_df` (defined below) builds a DataFrame of all .abf files found recursively
# within a directory. For each file it records the file name, the path to the file and
# the file's ctime (stored in the "created" column).

# Root directory of the raw recordings.
p = Path("./data/raw/recordings")
# Sub-directory searched for the 2019 human recordings.
human2019_abf_dir = Path("./data/raw/recordings/Human tissue/Human tissue-White noise_2019/")
# CSV with the metadata for the 2019 human recordings (read with pd.read_csv further down).
metadata_file = "./data/raw/Metadata/Human19_metadata.csv"

def summary_df(path):
    """Describe every ``.abf`` file found recursively under ``path``.

    Parameters
    ----------
    path : pathlib.Path
        Directory to search.

    Returns
    -------
    pandas.DataFrame
        One row per file, with columns ``file_name`` (base name), ``path``
        (the path as a ``str``) and ``created`` (``time.ctime`` of the file's
        ``st_ctime``).

    Notes
    -----
    ``st_ctime`` is the creation time on Windows only; on Unix it is the time
    of the last metadata change. Rows appear in the order yielded by
    ``Path.rglob``, which is not guaranteed to be sorted.
    """
    # A plain '*.abf' pattern is enough: '.DS_Store' does not end in '.abf', so it can
    # never match. The earlier pattern '*[!.DS_Store].abf' used a one-character class
    # and therefore skipped every file whose stem ends in '.', 'D', 'S', '_', 't',
    # 'o', 'r' or 'e' (e.g. 'note.abf').
    records = [
        (abf_path.name, str(abf_path), time.ctime(abf_path.stat().st_ctime))
        for abf_path in path.rglob('*.abf')
        if abf_path.is_file()  # ignore directories that happen to be named '*.abf'
    ]
    return pd.DataFrame.from_records(records, columns=["file_name", "path", "created"])

# ABF file information
Contains recordings from mouse and human. Protocols include long-square pulses, ramps, white noise, and others.

In [3]:
# File info for all ABF files found recursively under the recordings root `p`
df = summary_df(p)

In [4]:
# (rows, columns): one row per ABF file found under `p`
df.shape

(946, 3)

## Focus on human ABF recordings from 2019

In [5]:
# Same listing, restricted to the files found under `human2019_abf_dir`
human2019_file_info = summary_df(human2019_abf_dir)

In [6]:
# (rows, columns): one row per ABF file found under `human2019_abf_dir`
human2019_file_info.shape

(503, 3)

In [7]:
# Show the 2019 listing (pandas abbreviates long frames in the rendered output)
human2019_file_info

Unnamed: 0,file_name,path,created
0,19219023.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
1,19219021.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
2,19219005.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
3,19219004.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
4,19219016.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
...,...,...,...
498,19o10047.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
499,19o10043.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
500,19o10048.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
501,19o10051.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021


In [8]:
# Load the metadata. 'ABF File' is pinned to str so the concatenation below does not
# depend on dtype inference (some names, e.g. 19129000, consist of digits only).
human19_meta_df = pd.read_csv(metadata_file, dtype={'ABF File': str})
# 'ABF File' holds the name without extension; add '.abf' so it can be compared
# with the `file_name` column produced by `summary_df`.
human19_meta_df['file_name'] = human19_meta_df['ABF File'] + '.abf'

In [9]:
# (rows, columns) of the metadata table, including the added `file_name` column
human19_meta_df.shape

(536, 12)

In [10]:
# First rows of the metadata, to check the columns and the derived `file_name`
human19_meta_df.head()

Unnamed: 0.1,Unnamed: 0,ABF File,Data Type,Exp. Date,Cell #,Cell Layer,Stim Type,Threshold,DC,ZD,Hold,file_name
0,1,2019_11_04_0069,Human,Nov_04_2019,C1,L3C,White noise,n.a,n.a,After ZD,n.a,2019_11_04_0069.abf
1,2,2019_11_04_0069,Human,Nov_04_2019,C1,L3C,White noise,n.a,n.a,After ZD,n.a,2019_11_04_0069.abf
2,3,2019_11_04_0073,Human,Nov_04_2019,C1,L3C,White noise,n.a,n.a,After ZD,n.a,2019_11_04_0073.abf
3,4,2019_11_04_0073,Human,Nov_04_2019,C1,L3C,White noise,n.a,n.a,After ZD,n.a,2019_11_04_0073.abf
4,5,2019_11_04_0041,Human,Nov_04_2019,C1,L3C,White noise,n.a,n.a,Before ZD,n.a,2019_11_04_0041.abf


In [11]:
# 42 metadata rows share their 'ABF File' value with at least one other row.
# keep=False flags every member of a duplicate group, so this counts all rows
# involved in a duplication, not the number of surplus copies.
human19_meta_df[human19_meta_df.duplicated(subset='ABF File', keep=False)].shape

(42, 12)

In [12]:
# Mark every row whose 'ABF File' value occurs more than once in the metadata
# (keep=False marks all occurrences, not just the repeats).
human19_meta_df['is_duplicate'] = human19_meta_df['ABF File'].duplicated(keep=False)

## Focus on "intrinsic properties" experiments with long-square pulses

In [13]:
# Keep only the rows whose stimulus type is 'Intrinsic Properties'
is_intrinsic = human19_meta_df['Stim Type'].eq('Intrinsic Properties')
ip_meta_df = human19_meta_df.loc[is_intrinsic]

In [14]:
# Show the 'Intrinsic Properties' subset of the metadata
ip_meta_df

Unnamed: 0.1,Unnamed: 0,ABF File,Data Type,Exp. Date,Cell #,Cell Layer,Stim Type,Threshold,DC,ZD,Hold,file_name,is_duplicate
213,214,2019_11_04_0095,Human,Nov_04_2019,C2,L3C,Intrinsic Properties,n.a,n.a,ZD,n.a,2019_11_04_0095.abf,False
214,215,2019_11_04_0096,Human,Nov_04_2019,C2,L3C,Intrinsic Properties,n.a,n.a,ZD,n.a,2019_11_04_0096.abf,False
215,216,2019_11_04_0097,Human,Nov_04_2019,C2,L3C,Intrinsic Properties,n.a,n.a,ZD,n.a,2019_11_04_0097.abf,False
216,217,2019_11_04_0098,Human,Nov_04_2019,C2,L3C,Intrinsic Properties,n.a,n.a,ZD,n.a,2019_11_04_0098.abf,False
217,218,2019_11_04_0099,Human,Nov_04_2019,C2,L3C,Intrinsic Properties,n.a,n.a,ZD,n.a,2019_11_04_0099.abf,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,514,2019_09_03_0085,Human,Sep_03-2019,C9,L2,Intrinsic Properties,n.a,n.a,n.a,n.a,2019_09_03_0085.abf,False
514,515,2019_09_03_0086,Human,Sep_03-2019,C9,L2,Intrinsic Properties,n.a,n.a,n.a,n.a,2019_09_03_0086.abf,False
515,516,2019_09_03_0087,Human,Sep_03-2019,C9,L2,Intrinsic Properties,n.a,n.a,n.a,n.a,2019_09_03_0087.abf,False
516,517,2019_09_03_0088,Human,Sep_03-2019,C9,L2,Intrinsic Properties,n.a,n.a,n.a,n.a,2019_09_03_0088.abf,False


In [15]:
# Rows of the subset whose 'ABF File' value is repeated within the subset itself
# (keep=False shows every occurrence)
ip_meta_df[ip_meta_df.duplicated(subset='ABF File', keep=False)]

Unnamed: 0.1,Unnamed: 0,ABF File,Data Type,Exp. Date,Cell #,Cell Layer,Stim Type,Threshold,DC,ZD,Hold,file_name,is_duplicate
351,352,19129000,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129000.abf,True
352,353,19129002,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129002.abf,True
353,354,19129003,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129003.abf,True
354,355,19129004,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129004.abf,True
355,356,19129010,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129010.abf,True
378,379,19319019,Human,March_19_2019,C4,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319019.abf,True
379,380,19319020,Human,March_19_2019,C4,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319020.abf,True
380,381,19319019,Human,March_19_2019,C6,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319019.abf,True
381,382,19319020,Human,March_19_2019,C6,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319020.abf,True
480,481,19129000,Human,Sep_03-2019,C1,L2,Intrinsic Properties,n.a,n.a,n.a,n.a,19129000.abf,True


### Select file information for long-squares using metadata from above

In [16]:
# Reminder of the file listing's columns before matching it against the metadata
human2019_file_info.head()

Unnamed: 0,file_name,path,created
0,19219023.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
1,19219021.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
2,19219005.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
3,19219004.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021
4,19219016.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021


In [17]:
# Restrict the file listing to files named in the intrinsic-properties metadata
# (these are the long-square pulse recordings). Matching is by file name only.
named_in_meta = human2019_file_info['file_name'].isin(ip_meta_df['file_name'])
long_squares2019 = human2019_file_info.loc[named_in_meta]

In [18]:
# (rows, columns) of the matched listing; there is one row per matching path,
# so a file name found at several paths contributes several rows
long_squares2019.shape

(221, 3)

### Identify ABF version of files of interest

In [19]:
# Record the ABF format version of each file of interest as
# (file_name, version, path) tuples.
# loadData=False makes pyabf parse the header only; the sweep data is not needed
# to obtain the version string, so this avoids reading every recording in full.
version_info = [
    (file_name, pyabf.ABF(abf_path, loadData=False).abfVersionString, abf_path)
    for file_name, abf_path in zip(long_squares2019.file_name, long_squares2019.path)
]

In [20]:
# Tabulate the collected (file_name, version, path) tuples
version_columns = ['file_name', 'version', 'path']
version_df = pd.DataFrame.from_records(version_info, columns=version_columns)

In [21]:
# File names that occur more than once among the files of interest
# (keep=False shows every occurrence)
version_df[version_df.duplicated(subset='file_name', keep=False)]

Unnamed: 0,file_name,version,path
49,19129003.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
50,19129002.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
51,19129000.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
52,19129004.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
53,19129010.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
68,19319019.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
69,19319020.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
71,19319019.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
72,19319020.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...
181,19129003.abf,1.8.3.0,data/raw/recordings/Human tissue/Human tissue-...


In [22]:
# Attach the ABF version to the file listing. No join keys are given, so pandas
# joins on every column the two frames have in common.
long_squares2019 = pd.merge(long_squares2019, version_df)

In [23]:
# Flag the files I believe should be dropped from analysis: for each repeated
# file name, every occurrence except the last one is marked.
# Filtering on this flag also lets the listing be merged back to the original metadata
# on 'file_name' without ending up with extra rows.
# NOTE(review): which copy counts as "last" follows the current row order of the
# listing - confirm that this order selects the intended copy.
long_squares2019['to_drop'] = long_squares2019['file_name'].duplicated(keep='last')

In [24]:
# Subset rows flagged by `is_duplicate`; the flag was computed over the full
# metadata table, not over this subset alone
ip_meta_df[ip_meta_df.is_duplicate]

Unnamed: 0.1,Unnamed: 0,ABF File,Data Type,Exp. Date,Cell #,Cell Layer,Stim Type,Threshold,DC,ZD,Hold,file_name,is_duplicate
351,352,19129000,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129000.abf,True
352,353,19129002,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129002.abf,True
353,354,19129003,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129003.abf,True
354,355,19129004,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129004.abf,True
355,356,19129010,Human,Jan_29_2019,C1,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19129010.abf,True
378,379,19319019,Human,March_19_2019,C4,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319019.abf,True
379,380,19319020,Human,March_19_2019,C4,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319020.abf,True
380,381,19319019,Human,March_19_2019,C6,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319019.abf,True
381,382,19319020,Human,March_19_2019,C6,L5,Intrinsic Properties,n.a,n.a,n.a,n.a,19319020.abf,True
480,481,19129000,Human,Sep_03-2019,C1,L2,Intrinsic Properties,n.a,n.a,n.a,n.a,19129000.abf,True


In [25]:
# Rows flagged for dropping. There are fewer of these than rows listed as duplicate,
# because keep='last' leaves one copy of each repeated file name unflagged.
long_squares2019[long_squares2019.to_drop]

Unnamed: 0,file_name,path,created,version,to_drop
49,19129003.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True
50,19129002.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True
51,19129000.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True
52,19129004.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True
53,19129010.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True
68,19319019.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True
69,19319020.abf,data/raw/recordings/Human tissue/Human tissue-...,Fri Apr 9 16:26:54 2021,1.8.3.0,True


In [26]:
# Join the metadata with the file listing (path, version, to_drop) on 'file_name';
# the 'created' column is left out of the result.
file_columns = long_squares2019.drop(columns='created')
updated_meta = ip_meta_df.merge(file_columns, on='file_name')

In [27]:
# Write the merged metadata (still including the rows flagged `to_drop`) to disk,
# creating the output directory if it does not exist yet.
meta_output_path = Path('./data/processed/meta')
meta_output_path.mkdir(parents=True, exist_ok=True)
meta_csv = meta_output_path / 'human_2019_intrinsic_properties_meta.csv'
updated_meta.to_csv(meta_csv, index=False)

In [28]:
# Keep only the rows that are not flagged for dropping
updated_meta = updated_meta.loc[~updated_meta['to_drop']]

In [29]:
# Number of remaining rows per ABF format version
updated_meta.version.value_counts()

2.6.0.0    116
1.8.3.0    105
Name: version, dtype: int64

In [30]:
# Check to see whether the version is the same as Maggie found.
# The selection mirrors her `group_1`: rows whose 'Unnamed: 0' value lies
# between 312 and 414, both ends included.
in_group_1 = updated_meta['Unnamed: 0'].between(312, 414)
test = updated_meta.loc[in_group_1]

In [31]:
# ABF format versions present in the selected range of rows
test.version.value_counts()

1.8.3.0    100
Name: version, dtype: int64