In [4]:
import glob
import os


# Reminder to install s3fs to read files from aws
import s3fs

# import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import xarray as xr

import seaborn as sns

# import scanpy.api as sc

# import holoviews as hv
# hv.extension('bokeh')
# hv.archive.auto()

%matplotlib inline

In [5]:
prefix = 's3://olgabot-maca/facs/sourmash_compare_no_track_abundance/'
txt = 'aws_s3_ls_tabula_muris_compare_no_track_abundance.txt'

In [6]:
! aws s3 ls $prefix > $txt

In [7]:
aws_s3_ls = pd.read_table(txt, delim_whitespace=True, header=None, 
                          names=['date', 'time', 'bytes', 'basename'])
print(aws_s3_ls.shape)
aws_s3_ls.head()

(910, 4)


Unnamed: 0,date,time,bytes,basename
0,2018-09-23,16:46:50,19045449,trim=false_scaled=1000_dna_ksize=12.csv
1,2018-09-24,12:18:09,19873413,trim=false_scaled=1000_dna_ksize=15.csv
2,2018-09-24,00:00:49,20133019,trim=false_scaled=1000_dna_ksize=18.csv
3,2018-09-24,01:43:50,10499576,trim=false_scaled=1000_dna_ksize=21.csv
4,2018-09-24,03:23:29,20258815,trim=false_scaled=1000_dna_ksize=24.csv


In [None]:
aws_s3_ls.tail()

In [8]:
from tqdm import tqdm

In [9]:

def extract_cell_metadata(columns):
    cell_metadata = pd.Series(columns).str.split('|', expand=True)
#     cell_metadata.index = columns
    cell_metadata.columns = ['cell_id', 'tissue', 
                             'subtissue', 'cell_ontology_class', 
                             'free_annotation']
    cell_metadata = cell_metadata.set_index('cell_id')
    cell_metadata.index.name = 'cell_id'
    cell_metadata = cell_metadata.applymap(lambda x: x.split(':')[-1])
#     print('\tcell_metadata.shape', cell_metadata.shape)
#     cell_metadata = cell_metadata.sort_index()
#     cell_metadata.head()
    return cell_metadata



dfs = {}

metadatas = []

def to_key_value_pair(attribute):
    if len(attribute) > 1:
        try:
            return attribute[0], int(attribute[1])
        except ValueError:
            return attribute[0], attribute[1] 
    else:
        return 'comparison_sequence', attribute[0]



def extract_experiment_metadata(basename):
    key = basename.split('.csv')[0]
    split = key.split('_')
    attributes = [x.split('=') for x in split]
    attributes = dict(to_key_value_pair(x) for x in attributes)
    return key, attributes


experiment_metadatas = {}

cell_metadata = pd.DataFrame()

max_cells = 1000
tidy_dfs = []

for basename in tqdm(aws_s3_ls['basename']):
    csv = f'{prefix}{basename}'
    df = pd.read_csv(csv)
#     aprint(f'{basename}\tdf.shape: {df.shape}')
    
    key, attributes = extract_experiment_metadata(basename)
    experiment_metadatas[key] = attributes

    metadata = extract_cell_metadata(df.columns)
    
    mask = np.triu(np.ones(df.shape)).astype(np.bool)

    df.index = metadata.index.copy()
    df.columns = metadata.index.copy()
    
    df = df.where(mask)
    df.index.name = 'cell_id'
    df.columns.name = 'other_cell'
    tidy = df.stack().reset_index()
    tidy['experiment'] = key
#     tidy = tidy.rename(columns={'level_0': 'cell_id', 'level_1': 'other_cell'})
        
    tidy_dfs.append(tidy)

    # Add new cells to metadata
    if len(df.index.difference(cell_metadata.index)) > 0:
        cell_metadata = cell_metadata.append(metadata)
        cell_metadata = cell_metadata.loc[~cell_metadata.index.duplicated()]
        print(f'\tcell_metadata.shape, {cell_metadata.shape}')



  0%|          | 1/910 [00:02<30:43,  2.03s/it]

	cell_metadata.shape, (997, 4)


  2%|▏         | 21/910 [00:35<24:43,  1.67s/it]

	cell_metadata.shape, (1000, 4)


100%|██████████| 910/910 [26:45<00:00,  1.76s/it]


In [10]:
compare_results = pd.concat(tidy_dfs, ignore_index=True)
compare_results = compare_results.rename(columns={0: 'similarity'})
print(compare_results.shape)
compare_results.head()

(338632046, 4)


Unnamed: 0,cell_id,other_cell,similarity,experiment
0,B22-MAA000871-3_11_M-1-1,B22-MAA000871-3_11_M-1-1,1.0,trim=false_scaled=1000_dna_ksize=12
1,B22-MAA000871-3_11_M-1-1,L11-MAA000586-3_8_M-1-1,0.398906,trim=false_scaled=1000_dna_ksize=12
2,B22-MAA000871-3_11_M-1-1,O20-MAA001632-3_56_F-1-1,0.407873,trim=false_scaled=1000_dna_ksize=12
3,B22-MAA000871-3_11_M-1-1,D2-MAA000452-3_8_M-1-1,0.386677,trim=false_scaled=1000_dna_ksize=12
4,B22-MAA000871-3_11_M-1-1,P1-D042473-3_10_M-1-1,0.38418,trim=false_scaled=1000_dna_ksize=12


In [11]:
compare_results.tail()

Unnamed: 0,cell_id,other_cell,similarity,experiment
338632041,L16-MAA000913-3_9_M-1-1,A12-MAA000388-3_11_M-1-1,1.0,trim=true_scaled=900_dna_ksize=9
338632042,L16-MAA000913-3_9_M-1-1,N6-B002423-3_39_F-1-1,1.0,trim=true_scaled=900_dna_ksize=9
338632043,A12-MAA000388-3_11_M-1-1,A12-MAA000388-3_11_M-1-1,1.0,trim=true_scaled=900_dna_ksize=9
338632044,A12-MAA000388-3_11_M-1-1,N6-B002423-3_39_F-1-1,1.0,trim=true_scaled=900_dna_ksize=9
338632045,N6-B002423-3_39_F-1-1,N6-B002423-3_39_F-1-1,1.0,trim=true_scaled=900_dna_ksize=9


In [12]:
def combine_cell_ontology_free_annotation(row):
    if pd.notnull(row['free_annotation']):
        return '{cell_ontology_class} ({free_annotation})'.format(**row)
    else:
        return row['cell_ontology_class']

In [13]:
# cell_metadata = pd.concat(metadatas)
# print(cell_metadata.shape)
# cell_metadata = cell_metadata.loc[~cell_metadata.index.duplicated()]
cell_metadata = cell_metadata.sort_index()
cell_metadata['cell_ontology_free_annotation'] = cell_metadata.apply(combine_cell_ontology_free_annotation, axis=1)
print(cell_metadata.shape)
cell_metadata.head()

(1000, 5)


Unnamed: 0_level_0,tissue,subtissue,cell_ontology_class,free_annotation,cell_ontology_free_annotation
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1-B002427-3_39_F-1-1,Heart,LA,myofibroblast_cell,,myofibroblast_cell (NA)
A1-D041914-3_8_M-1-1,Bladder,,bladder_cell,Bladder_mesenchymal_cell,bladder_cell (Bladder_mesenchymal_cell)
A1-MAA000496-3_10_M-1-1,Trachea,,mesenchymal_cell,,mesenchymal_cell (NA)
A1-MAA000549-3_8_M-1-1,Skin,Anagen,epidermal_cell,Intermediate_IFE,epidermal_cell (Intermediate_IFE)
A1-MAA001869-3_38_F-1-1,Large_Intestine,Proximal,epithelial_cell_of_large_intestine,Lgr5-_amplifying_undifferentiated_cell,epithelial_cell_of_large_intestine (Lgr5-_ampl...


In [14]:
experiment_metadata = pd.DataFrame(experiment_metadatas).T
experiment_metadata['trim'] = experiment_metadata['trim'].map(lambda x: x == 'true')
experiment_metadata['ksize'] = experiment_metadata['ksize'].astype(int)
experiment_metadata['scaled'] = experiment_metadata['scaled'].astype(int)
print(experiment_metadata.shape)
experiment_metadata.head()

(910, 4)


Unnamed: 0,comparison_sequence,ksize,scaled,trim
trim=false_scaled=1000_dna_ksize=12,dna,12,1000,False
trim=false_scaled=1000_dna_ksize=15,dna,15,1000,False
trim=false_scaled=1000_dna_ksize=18,dna,18,1000,False
trim=false_scaled=1000_dna_ksize=21,dna,21,1000,False
trim=false_scaled=1000_dna_ksize=24,dna,24,1000,False


In [15]:
sorted(experiment_metadata.scaled.unique())

[100,
 200,
 300,
 400,
 500,
 600,
 700,
 800,
 900,
 1000,
 1100,
 1200,
 1300,
 1400,
 1500,
 1600,
 1700,
 1800,
 1900,
 2000,
 2500,
 3000,
 3500,
 4000,
 4500,
 5000]

In [16]:
csv_prefix = 's3://olgabot-maca/facs/sourmash_compare_no_track_abundance_combined/'

In [17]:
# with fs.open(f'{csv_prefix}experiment-metadata.csv', 'w') as f:
#     experiment_metadata.to_csv(f)
# # %time experiment_metadata.to_csv(f'{csv_prefix}experiment-metadata.csv')

In [18]:
# pd.read_csv(f'{csv_prefix}experiment-metadata.csv').head()

In [19]:
%%file s3_utils.py
import s3fs

def write_s3(df, filename, fmt='csv', **kwargs):
    fs = s3fs.S3FileSystem(anon=False)
    if fmt == 'csv':
        # csv is a text format
        with fs.open(filename, 'w') as f:
            return df.to_csv(f, **kwargs)
    elif fmt == 'parquet':
        # Parquet is a binary format and needs the "b" flag
        with fs.open(filename, 'wb') as f:
            return df.to_parquet(f, **kwargs)
        

Overwriting s3_utils.py


In [20]:
import s3fs

fs = s3fs.S3FileSystem(anon=False)

with fs.open(f'{csv_prefix}similarity.parquet', 'wb') as f:
    %time compare_results.to_parquet(f)

CPU times: user 2min 57s, sys: 2min 2s, total: 4min 59s
Wall time: 6min 50s


In [21]:
from s3_utils import write_s3

%time write_s3(cell_metadata, f'{csv_prefix}cell-metadata.csv', 'csv')
%time write_s3(experiment_metadata, f'{csv_prefix}experiment-metadata.csv', 'csv')


CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 74 ms
CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 76.8 ms


In [None]:
%time write_s3(compare_results, f'{csv_prefix}similarity.csv', 'csv')


In [None]:
! aws s3 ls --human-readable $csv_prefix

In [None]:
# write_s3(compare_results, f'{csv_prefix}-similarity.parquet', 'parquet')

In [None]:
# with fs.open(f'{csv_prefix}cell-metadata.csv', 'w') as f:
#     cell_metadata.to_csv(f)

In [None]:
# with fs.open(f'{csv_prefix}similarity.csv', 'w') as f:
#     %time compare_results.to_csv(f, index=False)

In [None]:
# ls -lha $csv_prefix*

In [32]:
compare_results.head()

Unnamed: 0,cell_id,other_cell,similarity,experiment
0,B22-MAA000871-3_11_M-1-1,B22-MAA000871-3_11_M-1-1,1.0,trim=false_scaled=1000_dna_ksize=12
1,B22-MAA000871-3_11_M-1-1,L11-MAA000586-3_8_M-1-1,0.398906,trim=false_scaled=1000_dna_ksize=12
2,B22-MAA000871-3_11_M-1-1,O20-MAA001632-3_56_F-1-1,0.407873,trim=false_scaled=1000_dna_ksize=12
3,B22-MAA000871-3_11_M-1-1,D2-MAA000452-3_8_M-1-1,0.386677,trim=false_scaled=1000_dna_ksize=12
4,B22-MAA000871-3_11_M-1-1,P1-D042473-3_10_M-1-1,0.38418,trim=false_scaled=1000_dna_ksize=12


In [34]:
scaled_subset = 500, 1000, 2000, 5000

scaled_subset_experiments = experiment_metadata.query('scaled in @scaled_subset')
print(scaled_subset_experiments.shape)
scaled_subset_experiments.head()

(146, 4)


Unnamed: 0,comparison_sequence,ksize,scaled,trim
trim=false_scaled=1000_dna_ksize=12,dna,12,1000,False
trim=false_scaled=1000_dna_ksize=15,dna,15,1000,False
trim=false_scaled=1000_dna_ksize=18,dna,18,1000,False
trim=false_scaled=1000_dna_ksize=21,dna,21,1000,False
trim=false_scaled=1000_dna_ksize=24,dna,24,1000,False


In [36]:
compare_results_scaled_subset = compare_results.query(
    'experiment in @scaled_subset_experiments.index')
print(compare_results_scaled_subset.shape)
compare_results_scaled_subset.head()

(56618998, 4)


Unnamed: 0,cell_id,other_cell,similarity,experiment
0,B22-MAA000871-3_11_M-1-1,B22-MAA000871-3_11_M-1-1,1.0,trim=false_scaled=1000_dna_ksize=12
1,B22-MAA000871-3_11_M-1-1,L11-MAA000586-3_8_M-1-1,0.398906,trim=false_scaled=1000_dna_ksize=12
2,B22-MAA000871-3_11_M-1-1,O20-MAA001632-3_56_F-1-1,0.407873,trim=false_scaled=1000_dna_ksize=12
3,B22-MAA000871-3_11_M-1-1,D2-MAA000452-3_8_M-1-1,0.386677,trim=false_scaled=1000_dna_ksize=12
4,B22-MAA000871-3_11_M-1-1,P1-D042473-3_10_M-1-1,0.38418,trim=false_scaled=1000_dna_ksize=12


In [38]:
%time write_s3(compare_results_scaled_subset, f'{csv_prefix}similarity-scaled-subset.parquet', 'parquet')

CPU times: user 36.9 s, sys: 19.5 s, total: 56.4 s
Wall time: 1min 22s


In [40]:
! aws s3 ls --human-readable $csv_prefix

2018-09-23 16:03:11  334.5 MiB -similarity.parquet
2018-10-15 19:03:59  102.3 KiB cell-metadata.csv
2018-10-15 19:03:59   46.4 KiB experiment-metadata.csv
2018-10-16 14:42:59  592.5 MiB similarity-scaled-subset.parquet
2018-10-15 19:04:00   34.4 GiB similarity.csv
2018-10-15 19:00:25    3.5 GiB similarity.parquet


In [41]:
csv_prefix

's3://olgabot-maca/facs/sourmash_compare_no_track_abundance_combined/'