In [1]:
import glob
import os


# Reminder to install s3fs to read files from aws
import s3fs

# import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import xarray as xr

import seaborn as sns

# import scanpy.api as sc

# import holoviews as hv
# hv.extension('bokeh')
# hv.archive.auto()

%matplotlib inline

In [2]:
prefix = 's3://olgabot-maca/facs/sourmash_compare_no_track_abundance/'
txt = 'aws_s3_ls_tabula_muris_compare_no_track_abundance.txt'

In [3]:
! aws s3 ls $prefix > $txt

In [4]:
aws_s3_ls = pd.read_table(txt, delim_whitespace=True, header=None, 
                          names=['date', 'time', 'bytes', 'basename'])
print(aws_s3_ls.shape)
aws_s3_ls.head()

(63, 4)


Unnamed: 0,date,time,bytes,basename
0,2018-09-18,01:32:22,19873413,trim=false_scaled=1000_dna_ksize=15.csv
1,2018-09-17,22:08:11,20376123,trim=false_scaled=1000_dna_ksize=42.csv
2,2018-09-17,21:12:04,20432677,trim=false_scaled=1000_dna_ksize=57.csv
3,2018-09-18,02:57:36,19595448,trim=false_scaled=1100_dna_ksize=15.csv
4,2018-09-18,00:19:51,11427393,trim=false_scaled=1200_dna_ksize=60.csv


In [5]:
from tqdm import tqdm

In [6]:

def extract_cell_metadata(columns):
    cell_metadata = pd.Series(columns).str.split('|', expand=True)
#     cell_metadata.index = columns
    cell_metadata.columns = ['cell_id', 'tissue', 
                             'subtissue', 'cell_ontology_class', 
                             'free_annotation']
    cell_metadata = cell_metadata.set_index('cell_id')
    cell_metadata.index.name = 'cell_id'
    cell_metadata = cell_metadata.applymap(lambda x: x.split(':')[-1])
#     print('\tcell_metadata.shape', cell_metadata.shape)
#     cell_metadata = cell_metadata.sort_index()
#     cell_metadata.head()
    return cell_metadata



dfs = {}

metadatas = []

def to_key_value_pair(attribute):
    if len(attribute) > 1:
        try:
            return attribute[0], int(attribute[1])
        except ValueError:
            return attribute[0], attribute[1] 
    else:
        return 'comparison_sequence', attribute[0]



def extract_experiment_metadata(basename):
    key = basename.split('.csv')[0]
    split = key.split('_')
    attributes = [x.split('=') for x in split]
    attributes = dict(to_key_value_pair(x) for x in attributes)
    return key, attributes


experiment_metadatas = {}

cell_metadata = pd.DataFrame()

max_cells = 1000
tidy_dfs = []

for basename in tqdm(aws_s3_ls['basename']):
    csv = f'{prefix}{basename}'
    df = pd.read_csv(csv)
#     aprint(f'{basename}\tdf.shape: {df.shape}')
    
    key, attributes = extract_experiment_metadata(basename)
    experiment_metadatas[key] = attributes

    metadata = extract_cell_metadata(df.columns)
    
    mask = np.triu(np.ones(df.shape)).astype(np.bool)

    df.index = metadata.index.copy()
    df.columns = metadata.index.copy()
    
    df = df.where(mask)
    df.index.name = 'cell_id'
    df.columns.name = 'other_cell'
    tidy = df.stack().reset_index()
    tidy['experiment'] = key
#     tidy = tidy.rename(columns={'level_0': 'cell_id', 'level_1': 'other_cell'})
        
    tidy_dfs.append(tidy)

    # Add new cells to metadata
    if len(df.index.difference(cell_metadata.index)) > 0:
        cell_metadata = cell_metadata.append(metadata)
        cell_metadata = cell_metadata.loc[~cell_metadata.index.duplicated()]
        print(f'\tcell_metadata.shape, {cell_metadata.shape}')



  2%|▏         | 1/63 [00:01<01:46,  1.72s/it]

	cell_metadata.shape, (997, 4)


  6%|▋         | 4/63 [00:06<01:35,  1.61s/it]

	cell_metadata.shape, (1000, 4)


100%|██████████| 63/63 [01:56<00:00,  1.85s/it]


In [17]:
compare_results = pd.concat(tidy_dfs, ignore_index=True)
compare_results = compare_results.rename(columns={0: 'similarity'})
print(compare_results.shape)
compare_results.head()

(28620778, 4)


Unnamed: 0,cell_id,other_cell,similarity,experiment
0,K16-MAA001861-3_39_F-1-1,K16-MAA001861-3_39_F-1-1,1.0,trim=false_scaled=1000_dna_ksize=15
1,K16-MAA001861-3_39_F-1-1,G3-D042103-3_11_M-1,0.048418,trim=false_scaled=1000_dna_ksize=15
2,K16-MAA001861-3_39_F-1-1,B11-MAA000934-3_9_M-1-1,0.046684,trim=false_scaled=1000_dna_ksize=15
3,K16-MAA001861-3_39_F-1-1,O9-MAA001632-3_56_F-1-1,0.056352,trim=false_scaled=1000_dna_ksize=15
4,K16-MAA001861-3_39_F-1-1,H12-B002436-3_39_F-1-1,0.068042,trim=false_scaled=1000_dna_ksize=15


In [18]:
compare_results.tail()

Unnamed: 0,cell_id,other_cell,similarity,experiment
28620773,J9-MAA001892-3_38_F-1-1,O2-MAA001454-3_38_F-1-1,0.053038,trim=true_scaled=900_dna_ksize=21
28620774,J9-MAA001892-3_38_F-1-1,F6-MAA000871-3_11_M-1-1,0.05392,trim=true_scaled=900_dna_ksize=21
28620775,O2-MAA001454-3_38_F-1-1,O2-MAA001454-3_38_F-1-1,1.0,trim=true_scaled=900_dna_ksize=21
28620776,O2-MAA001454-3_38_F-1-1,F6-MAA000871-3_11_M-1-1,0.052045,trim=true_scaled=900_dna_ksize=21
28620777,F6-MAA000871-3_11_M-1-1,F6-MAA000871-3_11_M-1-1,1.0,trim=true_scaled=900_dna_ksize=21


In [19]:
def combine_cell_ontology_free_annotation(row):
    if pd.notnull(row['free_annotation']):
        return '{cell_ontology_class} ({free_annotation})'.format(**row)
    else:
        return row['cell_ontology_class']

In [20]:
# cell_metadata = pd.concat(metadatas)
# print(cell_metadata.shape)
# cell_metadata = cell_metadata.loc[~cell_metadata.index.duplicated()]
cell_metadata = cell_metadata.sort_index()
cell_metadata['cell_ontology_free_annotation'] = cell_metadata.apply(combine_cell_ontology_free_annotation, axis=1)
print(cell_metadata.shape)
cell_metadata.head()

(1000, 5)


Unnamed: 0_level_0,tissue,subtissue,cell_ontology_class,free_annotation,cell_ontology_free_annotation
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1-B002427-3_39_F-1-1,Heart,LA,myofibroblast_cell,,myofibroblast_cell (NA)
A1-D041914-3_8_M-1-1,Bladder,,bladder_cell,Bladder_mesenchymal_cell,bladder_cell (Bladder_mesenchymal_cell)
A1-MAA000496-3_10_M-1-1,Trachea,,mesenchymal_cell,,mesenchymal_cell (NA)
A1-MAA000549-3_8_M-1-1,Skin,Anagen,epidermal_cell,Intermediate_IFE,epidermal_cell (Intermediate_IFE)
A1-MAA001869-3_38_F-1-1,Large_Intestine,Proximal,epithelial_cell_of_large_intestine,Lgr5-_amplifying_undifferentiated_cell,epithelial_cell_of_large_intestine (Lgr5-_ampl...


In [21]:
experiment_metadata = pd.DataFrame(experiment_metadatas).T
experiment_metadata['trim'] = experiment_metadata['trim'].map(lambda x: x == 'true')
experiment_metadata['ksize'] = experiment_metadata['ksize'].astype(int)
experiment_metadata['scaled'] = experiment_metadata['scaled'].astype(int)
print(experiment_metadata.shape)
experiment_metadata.head()

(63, 4)


Unnamed: 0,comparison_sequence,ksize,scaled,trim
trim=false_scaled=1000_dna_ksize=15,dna,15,1000,False
trim=false_scaled=1000_dna_ksize=42,dna,42,1000,False
trim=false_scaled=1000_dna_ksize=57,dna,57,1000,False
trim=false_scaled=1100_dna_ksize=15,dna,15,1100,False
trim=false_scaled=1200_dna_ksize=60,dna,60,1200,False


In [22]:
experiment_metadata.scaled.unique()

array([1000, 1100, 1200, 1500, 1600, 1800,  200, 3000,  400, 4500,  500,
        600,  700,  800,  900, 1300, 1700, 1900])

In [23]:
csv_prefix = 's3://olgabot-maca/facs/sourmash_compare_no_track_abundance_combined/'

In [24]:
# with fs.open(f'{csv_prefix}experiment-metadata.csv', 'w') as f:
#     experiment_metadata.to_csv(f)
# # %time experiment_metadata.to_csv(f'{csv_prefix}experiment-metadata.csv')

In [25]:
# pd.read_csv(f'{csv_prefix}experiment-metadata.csv').head()

In [26]:
%%file s3_utils.py
import s3fs

def write_s3(df, filename, fmt='csv', **kwargs):
    fs = s3fs.S3FileSystem(anon=False)
    if fmt == 'csv':
        # csv is a text format
        with fs.open(filename, 'w') as f:
            return df.to_csv(f, **kwargs)
    elif fmt == 'parquet':
        # Parquet is a binary format and needs the "b" flag
        with fs.open(filename, 'wb') as f:
            return df.to_parquet(f, **kwargs)
        

Overwriting s3_utils.py


In [34]:
import s3fs

fs = s3fs.S3FileSystem(anon=False)

with fs.open(f'{csv_prefix}similarity.parquet', 'wb') as f:
    %time compare_results.to_parquet(f)

CPU times: user 25.4 s, sys: 11.2 s, total: 36.6 s
Wall time: 32.3 s


In [28]:
from s3_utils import write_s3

%time write_s3(cell_metadata, f'{csv_prefix}cell-metadata.csv', 'csv')
%time write_s3(experiment_metadata, f'{csv_prefix}experiment-metadata.csv', 'csv')


CPU times: user 12 ms, sys: 8 ms, total: 20 ms
Wall time: 260 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 61.3 ms


In [31]:
%time write_s3(compare_results, f'{csv_prefix}similarity.csv', 'csv')


CPU times: user 2min 20s, sys: 1.72 s, total: 2min 21s
Wall time: 5min 4s


In [33]:
! aws s3 ls --human-readable $csv_prefix

2018-09-23 16:03:11  334.5 MiB -similarity.parquet
2018-09-23 15:51:07  102.3 KiB cell-metadata.csv
2018-09-23 15:51:08    3.2 KiB experiment-metadata.csv
2018-09-23 15:51:36    2.9 GiB similarity.csv


In [None]:
# write_s3(compare_results, f'{csv_prefix}-similarity.parquet', 'parquet')

In [None]:
# with fs.open(f'{csv_prefix}cell-metadata.csv', 'w') as f:
#     cell_metadata.to_csv(f)

In [None]:
# with fs.open(f'{csv_prefix}similarity.csv', 'w') as f:
#     %time compare_results.to_csv(f, index=False)

In [1]:
# ls -lha $csv_prefix*

-rwxrwxr-x 1 olga olga 4.3M Jul 16 09:03 [0m[01;32m000_tabula-muris_reflow_fastqs.ipynb[0m*
-rwxrwxr-x 1 olga olga 170K Aug  7 13:01 [01;32m004_ashley_cancer_fastq.ipynb[0m*
-rwxrwxr-x 1 olga olga 116K Jul 16 09:03 [01;32m005_why_did_some_reflow_runs_fail.ipynb[0m*
-rw-rw-r-- 1 olga olga 138K Jul 27 12:27 006_tabula-muris_reflow_fastqs_v5.ipynb
-rw-rw-r-- 1 olga olga  67K Jul 27 12:27 006_tabula-muris_reflow_fastqs_v6.ipynb
-rwxrwxr-x 1 olga olga  43K Sep 19 10:59 [01;32m007_reflow_runbatch_compare.ipynb[0m*
-rw-rw-r-- 1 olga olga  17K Sep 19 09:25 008_ashley_cancer_compute_v2.ipynb
-rw-rw-r-- 1 olga olga  15K Sep 20 12:21 009_ashley_cancer_compare.ipynb
-rw-rw-r-- 1 olga olga 1.3M Aug  2 14:49 010_sourmash_compare_analysis.ipynb
-rw-rw-r-- 1 olga olga 303K Aug 14 12:15 011_kmers_in_single_bladder_cell_vs_gencode_transcripts.ipynb
-rw-rw-r-- 1 olga olga 6.0M Aug 14 12:15 012_sourmash_compare_csv_scale1200_k15.ipynb
-rw-rw-r-- 1 olga olga 1.6M Sep 21 15:08 013_read_a