In [65]:
import pandas as pd
import seaborn as sns
import numpy as np
from plotnine import *
import boto3
import os

In [66]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install s3fs

Collecting s3fs
[?25l  Downloading https://files.pythonhosted.org/packages/68/8a/a2430bda6106aaaee6d53fa9be914ff3023d3f9d547b959a47641addad33/s3fs-0.2.1.tar.gz (46kB)
[K    100% |████████████████████████████████| 51kB 1.2MB/s ta 0:00:01
Building wheels for collected packages: s3fs
  Building wheel for s3fs (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/phoenix.logan/Library/Caches/pip/wheels/f5/2d/de/6e81a0885c7c5c4731bfc8f1d27abf80414d3633d6d6b103ef
Successfully built s3fs
Installing collected packages: s3fs
Successfully installed s3fs-0.2.1
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [52]:
s3 = boto3.resource('s3')
mosquito_bucket = s3.Bucket('czbiohub-mosquito')
cms001_filtered_bucket = mosquito_bucket.objects.filter(Prefix='sequences/CMS001_fastq.gz/')
cms002_filtered_bucket = mosquito_bucket.objects.filter(Prefix='sequences/CMS002_fastq.gz/')
signatures_bucket = s3.Bucket('phoenixlogan-data').objects.filter(Prefix='nf-kmer-similarity/sourmash/sketches/')

In [53]:
# get original fastq files from s3 bucket
cms001_sequences = [os.path.basename(i.key) for i in cms001_filtered_bucket if os.path.basename(i.key) != '']
cms002_sequences = [os.path.basename(i.key) for i in cms002_filtered_bucket if os.path.basename(i.key) != '']
computed_signatures = [os.path.basename(i.key) for i in signatures_bucket if os.path.basename(i.key) != '']

In [35]:
# load in species data
species_id = pd.read_csv("~/code/skeeters/data/sample_genus_and_species.csv", header=0, index_col = 0)
species_id.head()

Unnamed: 0,genus,species,corrected genus,corrected species
CMS_001_RNA_A_S1,Culex,erythrothorax,Culex,erythrothorax
CMS_002_10a_Rb_S119_L004,Culex,quinquefasciatus,Culex,quinquefasciatus
CMS_002_13a_Rb_S120_L004,Culex,quinquefasciatus,Culex,quinquefasciatus
CMS_002_16a_Rb_S121_L004,Aedes,aegypti,Aedes,aegypti
CMS_002_17a_Rb_S122_L004,Aedes,aegypti,Aedes,aegypti


In [58]:
suffix = '_R1_001.fastq.gz'
cms002_sequences = [x.split()[-1][:-len(suffix)] for x in cms002_sequences[1:]]
print(cms002_sequences[:3])

suffix = '_R1_001.fastq.gz'
cms001_sequences = [x.split()[-1][:-len(suffix)] for x in cms001_sequences[1:]]
print(cms001_sequences[:3])

# get prefixes from sketch names
computed_signatures = [x.split("_molecule")[0] for x in computed_signatures]
print(computed_signatures[:3])

['CMS_002_10a_Rb_S119_L004', 'CMS_002_13a_Rb_S120_L004', 'CMS_002_13a_Rb_S120_L004']
['CMS_001_RNA_A_S1', 'CMS_002_RNA_A_S1', 'CMS_002_RNA_A_S1']
['CMS_001_RNA_A_S1', 'CMS_001_RNA_A_S1', 'CMS_001_RNA_A_S1']


In [63]:
needed_signatures = (set(cms001_sequences) | set(cms002_sequences)) - set(computed_signatures)
#needed_signatures

In [68]:
k21sim = pd.read_csv("s3://czbiohub-mosquito/references/sourmash/comparison/k21abun.csv")
k31sim = pd.read_csv("s3://czbiohub-mosquito/references/sourmash/comparison/k31abun.csv")
k51sim = pd.read_csv("s3://czbiohub-mosquito/references/sourmash/comparison/k51abun.csv")

k21sim_noabun = pd.read_csv("s3://czbiohub-mosquito/references/sourmash/comparison/k21.csv")
k31sim_noabun = pd.read_csv("s3://czbiohub-mosquito/references/sourmash/comparison/k31.csv")
k51sim_noabun = pd.read_csv("s3://czbiohub-mosquito/references/sourmash/comparison/k51.csv")

In [71]:
k21sim.head()

Unnamed: 0,tarsalis,erythrothorax,aegypti,erythrothorax.1,albopictus,quinquefasciatus,erythrothorax.2,inornata,particeps,albopictus.1,...,erythrothorax.3,tarsalis.1,tarsalis.2,tarsalis.3,albopictus.2,incidens,particeps.1,albopictus.3,tarsalis.4,erythrothorax.4
CMS_002_41a_Rb_S176_L004,1.0,0.85003,0.567976,0.918905,0.540307,0.723136,0.889797,0.460967,0.435114,0.524022,...,0.832002,0.907998,0.895414,0.925792,0.522949,0.437978,0.280821,0.545159,0.951666,0.915453
CMS_013_RNA_A_S5,0.85003,1.0,0.62043,0.889246,0.587391,0.725459,0.916714,0.492395,0.453794,0.576902,...,0.93049,0.871768,0.882787,0.895898,0.578107,0.45709,0.264986,0.594984,0.869177,0.860814
CMS_002_25c_Rb_S142_L004,0.567976,0.62043,1.0,0.588017,0.63464,0.607007,0.597577,0.488284,0.447317,0.622865,...,0.613528,0.587302,0.582833,0.597829,0.625468,0.452074,0.269575,0.644679,0.581928,0.603221
CMS_002_28d_Rb_S157_L004,0.918905,0.889246,0.588017,1.0,0.555626,0.708836,0.941458,0.473218,0.44267,0.5413,...,0.861606,0.898085,0.898392,0.93102,0.542175,0.446003,0.273058,0.56164,0.927187,0.911123
CMS_002_45d_Rb_S186_L004,0.540307,0.587391,0.63464,0.555626,1.0,0.579809,0.562201,0.464721,0.425481,0.935243,...,0.584527,0.553673,0.553,0.566994,0.924523,0.429373,0.257697,0.911959,0.551046,0.573016


In [69]:
for sim in [k21sim, k31sim, k51sim, k21sim_noabun, k31sim_noabun, k51sim_noabun]:
    sim.index = sim.columns
    sim.columns = species_id.loc[sim.columns.tolist()]['corrected species'].values

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


In [70]:
# Drop waters

keep = k21sim.columns.notnull()
k21sim = (k21sim[keep].transpose()[keep]).transpose()
keep = k31sim.columns.notnull()
k31sim = (k31sim[keep].transpose()[keep]).transpose()
keep = k51sim.columns.notnull()
k51sim = (k51sim[keep].transpose()[keep]).transpose()
keep = k21sim_noabun.columns.notnull()
k21sim_noabun = (k21sim_noabun[keep].transpose()[keep]).transpose()
keep = k31sim_noabun.columns.notnull()
k31sim_noabun = (k31sim_noabun[keep].transpose()[keep]).transpose()
keep = k51sim_noabun.columns.notnull()
k51sim_noabun = (k51sim_noabun[keep].transpose()[keep]).transpose()

In [None]:
def get_consensus(sim):
    neighbors = sim.columns[np.argsort(-sim.values, axis = 1)]
    consensus = pd.DataFrame({'label': neighbors[:,0].values, 'label1': neighbors[:,1].values, 'label2': neighbors[:,2].values})
    consensus.index = sim.index
    consensus['consensus1'] = (consensus['label'] == consensus['label1'])
    consensus['consensus2'] = (consensus['label'] == consensus['label1']) & (consensus['label'] == consensus['label2'])

    consensus['dist1'] = sim.values[np.arange(len(sim)),np.argsort(-sim.values, axis = 1)[:,1]]
    consensus['dist2'] = sim.values[np.arange(len(sim)),np.argsort(-sim.values, axis = 1)[:,2]]
    
    print(len(consensus), consensus['consensus1'].sum(), consensus['consensus2'].sum(), )
    return consensus