In [None]:
import os
import re
import io
import sys
import glob
import enum
import json
import dask
import xlrd
import base64
import time
import imageio
import requests
import datetime
import psycopg2
import numpy as np
import pandas as pd
import skimage

import dask.diagnostics
import sqlalchemy as db
import sqlalchemy.orm
import sqlalchemy.ext.declarative
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('../..')
%aimport opencell.imaging.managers
%aimport opencell.imaging.processors
%aimport opencell.file_utils

from opencell import constants, file_utils
from opencell.cli import database_cli
from opencell.cli import fov_cli
from opencell.database import models
from opencell.database import operations
from opencell.database import utils as db_utils
from opencell.imaging import utils as im_utils
from opencell.imaging import images, managers, processors, viz

In [None]:
# Build the database URL from a single credentials file. Only one file is loaded:
# assigning `url` twice in a row would discard the first value (and needlessly
# require both files to exist).
# To target the dev database, use '../../db-credentials-dev.json' instead
# (a credentials JSON file for the AWS database can be used in the same way).
url = db_utils.url_from_credentials('../../db-credentials-cap.json')

# NOTE(review): depending on what `url_from_credentials` returns, displaying the URL
# may expose the database password in the saved notebook output - confirm before sharing.
url

In [None]:
# create the engine for the database URL constructed above
engine = db.create_engine(url)
# session factory bound to the engine, wrapped in a scoped_session;
# the cells below issue queries via `Session.query(...)`
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)
session = Session()

In [None]:
# roll back the session's current transaction (e.g., after a statement has failed)
session.rollback()

In [None]:
# query for the number of FOVs, and of annotated FOVs, for each of the selected cell lines
# (outer joins, so that cell lines without FOVs or annotations are still counted)
fov_counts = (
    Session.query(
        models.CellLine.id,
        db.func.count(models.MicroscopyFOV.id).label('num_fovs'),
        db.func.count(models.MicroscopyFOVAnnotation.id).label('num_annotated_fovs'),
    )
    .outerjoin(models.CellLine.fovs)
    .outerjoin(models.MicroscopyFOV.annotation)
    .filter(models.CellLine.id.in_([701, 702]))
    .group_by(models.CellLine.id)
)

# the same counts, restricted to the FOVs from the new acquisition script
# (pml_ids from PML0196 to PML0998)
da_pmls = ['PML%04d' % pml_num for pml_num in range(196, 999)]
fov_counts_da = fov_counts.filter(models.MicroscopyFOV.pml_id == db.any_([da_pmls]))

# execute both queries
fov_counts = pd.DataFrame(data=fov_counts.all())
fov_counts_da = pd.DataFrame(data=fov_counts_da.all())

# append '_da' to every column of the restricted counts so the two sets can sit side by side
fov_counts_da = fov_counts_da.rename(columns=lambda column: '%s_da' % column)

# left merge: cell lines without any FOVs from the new script are retained
fov_counts = fov_counts.merge(fov_counts_da, left_on='id', right_on='id_da', how='left')

In [None]:
# round-trip the first row of the counts through JSON to view it as a plain dict
json.loads(fov_counts.iloc[0].to_json())

### All FOV scores

In [None]:
# the 'score' entry of every 'fov-features' result, one row per result
d = pd.read_sql(
    "select id, data->'score' as score from microscopy_fov_result where kind = 'fov-features'",
    con=engine,
)

In [None]:
# histogram of the FOV scores, drawn on the current axes
ax = plt.gca()
ax.hist(d.score, bins=100)
ax.set_xlabel('FOV score')
ax.set_ylabel('Count')

### All cell line annotations

In [None]:
# one row per (cell line, annotation category): the JSON array of categories
# of each annotation is unnested into individual rows
result = pd.read_sql(
    '''
    select * from(
        select cell_line_id, json_array_elements_text(categories::json) as cat 
        from cell_line_annotation
    ) tmp 
    ''',
    Session.get_bind()
)
# the total number of (cell line, category) rows
len(result.cell_line_id.tolist())

In [None]:
# the distinct annotation categories
set(result.cat)

In [None]:
# cell lines with at least one annotation category that does not end in a grade suffix
# ('_1', '_2' or '_3'); the character class is '[123]' because commas inside a character
# class are matched literally (so '[1,2,3]' would also accept a trailing '_,')
has_grade_suffix = result.cat.apply(lambda cat: bool(re.match(r'.*_[123]$', cat)))
result.loc[~has_grade_suffix].cell_line_id.unique()

### Append FACS grade and sequencing results to Manu's annotations CSV

This is on 2020-02-26 for Hera.

In [None]:
# load the cell line annotations CSV (note: this is an absolute local path)
d_raw = pd.read_csv('/Users/keith.cheveralls/2020-02-21-17-05-12_cell-line-annotations_ML.csv')

In [None]:
# names of the targets flagged as 'no_gfp' in the annotations
d_raw.loc[d_raw.no_gfp == 1, 'target_name'].tolist()

In [None]:
# load the FACS grades from the JSON file
with open('/Users/keith.cheveralls/projects/opencell-vis/src/demo/data/facs_grades.json') as handle:
    facs_grades = json.loads(handle.read())

In [None]:
# inspect the loaded FACS grades
facs_grades

In [None]:
# Start from a fresh copy of the annotations loaded from the CSV, so that this cell
# does not depend on whatever `d` happened to hold beforehand and can be re-run safely.
# NOTE(review): assumes the annotations CSV has 'plate_id', 'well_id' and 'cell_line_id'
# columns (the latter is used by the sequencing cell below) - confirm.
d = d_raw.copy()

# the FACS grades are keyed by '<plate_id>-<well_id>'; rows without a grade get None
d['facs_grade'] = [
    facs_grades.get('%s-%s' % (plate_id, well_id))
    for plate_id, well_id in zip(d.plate_id, d.well_id)
]

In [None]:
# HDR values from the sequencing dataset of each cell line (left as None when unavailable)
d['hdr_all'] = None
d['hdr_modified'] = None

for ind, row in d.iterrows():
    # the first sequencing dataset found for this cell line, if any
    sequencing = (
        Session.query(models.SequencingDataset)
        .filter(models.SequencingDataset.cell_line_id == row.cell_line_id)
        .first()
    )

    # print the ids of the cell lines without sequencing results; this is checked
    # explicitly (instead of catching every exception) so that genuine errors,
    # such as a failed query, are raised rather than silently reported as missing data
    if sequencing is None or sequencing.scalars is None:
        print(row.cell_line_id)
        continue

    d.at[ind, 'hdr_all'] = sequencing.scalars.get('hdr_all')
    d.at[ind, 'hdr_modified'] = sequencing.scalars.get('hdr_modified')

In [None]:
# write the annotations, with the FACS grade and HDR columns appended, to the cache directory
# (note: this is an absolute local path)
d.to_csv('/Users/keith.cheveralls/projects/opencell/cache/2020-02-26_annotations-w-facs-seq.csv')

### FOV flags for Hiro

This is on 2020-04-30. This is a list of flags for each FOV. The flags are: 
- the 'no_gfp' and 'publication_ready' target annotations (stored in the `target_is_gfp_negative` and `target_is_publication_quality` columns)
- whether the FOV has a manually annotated ROI
- whether there are nuclei in the FOV

In [None]:
def all_results_fast(kind):
    '''
    Load every FOV together with its processing result of the given kind.

    The FOV table is left-joined to the results, so FOVs without a result
    of this kind are retained, with null `kind` and `data` columns.

    kind : the kind of result to load (e.g., 'fov-features')

    Returns a dataframe with all of the `microscopy_fov` columns
    plus the `kind` and `data` columns of the result.
    '''
    # `kind` is passed as a bound parameter, rather than interpolated into the SQL string,
    # so that a value containing quotes cannot break (or alter) the query
    query = db.text('''
        select fov.*, res.kind as kind, res.data as data from microscopy_fov fov
        left join (select * from microscopy_fov_result where kind = :kind) res
        on fov.id = res.fov_id;''')
    return pd.read_sql(query, engine, params={'kind': kind})

In [None]:
# all FOVs, with their 'fov-features' result where there is one
df = all_results_fast('fov-features')
df.shape

In [None]:
# inspect the dataframe of FOVs and results
df

In [None]:
# the first FOV returned by the query, and an FOVProcessor constructed from it
fov = Session.query(models.MicroscopyFOV).first()
p = processors.FOVProcessor.from_database(fov)

In [None]:
# the annotation categories of the cell line to which this FOV belongs
fov.cell_line.annotation.categories

In [None]:
# the score from each FOV's 'fov-features' result; FOVs without a result have no `data`
# (they are retained by the left join in `all_results_fast`) and get a score of None
df['score'] = [data.get('score') if isinstance(data, dict) else None for data in df.data]

In [None]:
# initialize the columns that are populated for each FOV by the loop below;
# the names must match those written in the loop, so that no column is created
# implicitly and no unused column ends up in the exported CSVs
for col in [
    'filepath',
    'fov_has_nuclei',
    'fov_is_annotated',
    'target_name',
    'target_is_gfp_negative',
    'target_is_annotated',
    'target_is_publication_quality',
]:
    df[col] = None

In [None]:
for ind, row in df.iterrows():
    # report progress every 1000 rows
    if ind % 1000 == 0:
        print(ind)

    fov = Session.query(models.MicroscopyFOV).filter(models.MicroscopyFOV.id == row.id).one()
    p = processors.FOVProcessor.from_database(fov)

    # the TIFF filepath relative to the clean/ directory
    df.at[ind, 'filepath'] = p.dst_filepath(kind='clean', ext='tif')

    # an FOV is considered to have nuclei if it has a score
    df.at[ind, 'fov_has_nuclei'] = not pd.isna(row.score)

    # whether an annotation (a manually annotated ROI) exists for the FOV
    df.at[ind, 'fov_is_annotated'] = bool(fov.annotation)

    # the name of the target of the FOV's cell line
    df.at[ind, 'target_name'] = fov.cell_line.crispr_design.target_name

    # flags derived from the annotation of the cell line;
    # the two category flags are left unset when the cell line is not annotated
    cell_line_annotation = fov.cell_line.annotation
    df.at[ind, 'target_is_annotated'] = cell_line_annotation is not None
    if cell_line_annotation is not None:
        categories = cell_line_annotation.categories
        df.at[ind, 'target_is_gfp_negative'] = 'no_gfp' in categories
        df.at[ind, 'target_is_publication_quality'] = 'publication_ready' in categories

In [None]:
# the flag that determines whether the FOV should be included in the training data
df['final_fov_flag'] = (df.fov_has_nuclei & df.target_is_publication_quality)

In [None]:
# drop the columns that are not needed in the exported flags,
# and make explicit that the 'id' column is the FOV id
dff = (
    df.drop(columns=['pml_id', 'imaging_round_id', 'site_num', 'raw_filename', 'kind', 'data'])
    .rename(columns={'id': 'fov_id'})
)

In [None]:
# order the rows by cell line
dff = dff.sort_values(by='cell_line_id')

In [None]:
# write the flags for all FOVs to the ml_group volume (with a header row, without the index)
dff.to_csv('/Volumes/ml_group/opencell-microscopy/2020-04-30_fov-flags.csv', header=True, index=False)

In [None]:
# write a second copy of the same CSV to the current working directory
dff.to_csv('2020-04-30_fov-flags.csv', header=True, index=False)

In [None]:
# CSV of only the FOVs to include in the training data
# CSV of only the FOVs to include in the training data (those with `final_fov_flag` set),
# limited to the columns that identify the FOV and its file
dff.loc[
    dff.final_fov_flag, ['cell_line_id', 'target_name', 'fov_id', 'filepath']
].to_csv('/Volumes/ml_group/opencell-microscopy/2020-04-30_good-fovs.csv', header=True, index=False)

In [None]:
# the number of FOVs with nuclei whose target is publication quality,
# and the number of FOVs with nuclei that are themselves annotated
(
    (dff.fov_has_nuclei & dff.target_is_publication_quality).sum(),
    (dff.fov_has_nuclei & dff.fov_is_annotated).sum(),
)

### Append localization annotations to the 'good-FOVs' CSV (generated above)
This is on 2020-05-27.

In [None]:
# reload the two CSVs dated 2020-04-30: the flags for all FOVs, and the 'good' FOVs only
fov_flags = pd.read_csv('/Volumes/ml_group/opencell-microscopy/2020-04-30_fov-flags.csv')
good_fovs = pd.read_csv('/Volumes/ml_group/opencell-microscopy/2020-04-30_good-fovs.csv')

In [None]:
# preview the first rows of the 'good' FOVs
good_fovs.head()

In [None]:
# all rows of the cell line annotation table
raw_ants = pd.read_sql('''select * from cell_line_annotation''', engine)

In [None]:
# localization categories
# the annotation categories that are treated as localization categories;
# only categories in this list are turned into label columns in the next cell
localization_categories = [
    'cytoplasmic',
    'nuclear',
    'vesicles',
    'membrane',
    'chromatin',
    'textured',
    'er',
    'small_aggregates',
    'nuclear_punctae',
    'nucleus_cytoplasm_variation',
    'golgi',
    'diffuse',
    'nucleolus_gc',
    'cytoskeleton',
    'cell_contact',
    'centrosome',
    'nuclear_membrane',
    'nucleolus_fc_dfc',
    'big_aggregates',
    'nucleolus',
    'nucleolar_ring',
    'mitochondria',
    'cilia',
]

In [None]:
# add one 'label_<n>' column per localization category of each cell line,
# with the categories of each cell line in alphabetical order
final_ants = raw_ants.copy()
for ind, row in raw_ants.iterrows():
    # a missing (null) list of categories is treated as no categories
    cats = sorted(set(localization_categories).intersection(row.categories or []))
    for cat_ind, cat in enumerate(cats):
        column = 'label_%s' % cat_ind
        # create the label column the first time it is needed; the check must be made
        # on `final_ants`, which is the dataframe the labels are written to
        if column not in final_ants.columns:
            final_ants[column] = None
        final_ants.at[ind, column] = cat

In [None]:
# check which label columns were created
final_ants.columns

In [None]:
# retain only the cell line id and the six label columns (label_0 to label_5)
final_ants = final_ants[
    ['cell_line_id'] + ['label_%d' % label_ind for label_ind in range(6)]
]

final_ants.head()

In [None]:
# attach the label columns to the 'good' FOVs by cell line; the inner merge retains
# only the FOVs whose cell line has a row in the annotations
good_fovs_w_ants = good_fovs.merge(final_ants, how='inner', on='cell_line_id')

# write the labeled FOVs without the dataframe index
good_fovs_w_ants.to_csv(
    '/Volumes/ml_group/opencell-microscopy/2020-05-27_good-fovs-with-labels.csv', index=False
)

In [None]:
# targets without any localization annotations
targets_wo_locz = good_fovs_w_ants.loc[good_fovs_w_ants.label_0.isna()].target_name.unique()
targets_wo_locz.sort()
targets_wo_locz

In [None]:
# list of targets that have more than one cell line
ntargets = (
    fov_flags.groupby(['target_name', 'cell_line_id'])
    .count()
    .reset_index()
    .groupby('target_name')
    .count()
    .cell_line_id
)

repeated_targets = list(ntargets.loc[ntargets > 1].index)
np.array(sorted(repeated_targets))

In [None]:
# targets without any localization annotations that are unique
np.array(sorted(list(set(targets_wo_locz).difference(repeated_targets))))

### uniprot metadata for crispr designs

Finding crispr designs with non-unique uniprot gene name synonyms.

In [None]:
# every crispr design (plate design, well and target name) joined to its uniprot metadata;
# the inner join omits the designs without a matching uniprot metadata row
d = pd.read_sql(
    '''
    select cd.plate_design_id, cd.well_id, cd.target_name, md.* 
    from crispr_design cd inner join uniprot_metadata md on cd.uniprot_id = md.uniprot_id
    ''',
    engine
)
d.shape

In [None]:
# preview the first rows
d.head()

In [None]:
# split the space-separated gene names into a list (empty if there are none),
# then explode the dataframe to one row per gene name
# NOTE: `d` is reassigned here, so re-running this cell explodes the rows a second time
d['gene_name'] = d.gene_names.apply(lambda s: s.split(' ') if s else [])
d = d.explode('gene_name')

In [None]:
# the number of distinct uniprot_ids associated with each gene name
nunique = d.groupby('gene_name').uniprot_id.nunique()

In [None]:
# crispr designs with non-unique gene name synonyms
dd = (
    d.loc[d.gene_name.isin(nunique[nunique > 1].index)]
    .groupby(['plate_design_id', 'well_id'])
    .first()
    .reset_index()
    .sort_values(by='gene_name')
    .rename(columns={'gene_name': 'nonunique_gene_name'})
    [['plate_design_id', 'well_id', 'target_name', 'gene_names', 'nonunique_gene_name', 'uniprot_id', 'ensg_id']]
)
dd.shape

In [None]:
# write the table to the current working directory (the dataframe index is included)
dd.to_csv('crispr-designs-with-nonunique-gene-name-synonyms.csv')

In [None]:
# inspect the crispr designs with non-unique gene name synonyms
dd

### Uniprot metadata for protein groups

In [None]:
# the uniprot metadata (uniprot_id, gene names, ensg_id) associated with each
# mass spec protein group, one row per (protein group, uniprot_id) association
d = pd.read_sql(
    '''
    select pg.id, md.uniprot_id, md.gene_names, md.ensg_id
    from mass_spec_protein_group pg 
    inner join protein_group_uniprot_metadata_association ass on pg.id = ass.protein_group_id
    inner join uniprot_metadata md on md.uniprot_id = ass.uniprot_id
    ''',
    engine
)
d.shape

In [None]:
# preview the first rows
d.head()

In [None]:
# all uniprot gene names
d['gene_name'] = d.gene_names.apply(lambda s: s.split(' ') if s else [])
d = d.explode('gene_name')

In [None]:
# the 'primary' uniprot gene name (the first one in the list)
d['gene_name'] = d.gene_names.apply(lambda s: s.split(' ')[0] if s else None)

In [None]:
# the shape of the dataframe at this point
d.shape

In [None]:
# number of 'primary' uniprot gene names for each protein group
nunique_genes = d.groupby('id').nunique().gene_name

# number of protein groups with more than one 'primary' gene name
(nunique_genes > 1).sum(), (nunique_genes > 2).sum()

In [None]:
# number of ensg_ids for each protein group
nunique_ensgs = d.groupby('id').nunique().ensg_id

(nunique_ensgs > 1).sum()

In [None]:
# the rows whose gene name starts with 'POLR2J', among the protein groups
# with more than two 'primary' gene names; `na=False` treats a missing gene name
# as a non-match (calling `startswith` on None would raise)
d.loc[
    d.id.isin(nunique_genes.loc[nunique_genes > 2].index)
    & d.gene_name.str.startswith('POLR2J', na=False)
]

### MS heatmap clusters

In [None]:
# the cluster id and protein group id of every row of the cluster heatmap table,
# obtained by joining each heatmap row to its mass spec hit
d = pd.read_sql(
    '''
    select cluster_id, protein_group_id from mass_spec_cluster_heatmap heatmap
    inner join mass_spec_hit hit on hit.id = heatmap.hit_id;
    ''',
    engine
)
# the shape of the table, and the number of distinct protein groups and clusters
d.shape, d.protein_group_id.unique().shape, d.cluster_id.unique().shape

In [None]:
# cluster_id - pg_id map
d.groupby(['cluster_id', 'protein_group_id']).first().reset_index()

In [None]:
# number of protein groups in each cluster
d.groupby(['cluster_id']).protein_group_id.nunique().reset_index().sort_values(by='protein_group_id')

In [None]:
# number of protein groups that appear in more than one cluster
nclusters = d.groupby(['protein_group_id']).cluster_id.nunique().reset_index().sort_values(by='cluster_id')
(nclusters.cluster_id > 1).sum()

In [None]:
# compare the cluster_id of the first row selected for cluster 1 to the literal 1
# NOTE(review): presumably a check of how cluster_id values compare to ints - confirm
d.loc[d.cluster_id == 1].iloc[0].cluster_id == 1

In [None]:
# the cluster and subcluster ids of the protein groups for a single analysis type,
# omitting the rows without a subcluster
clusters = pd.read_sql(
    '''
    select protein_group_id, cluster_id, subcluster_id 
    from mass_spec_cluster_heatmap heatmap
    inner join mass_spec_hit hit on hit.id = heatmap.hit_id
    where analysis_type = 'primary:mcl_i2.0_haircut:keepcore_subcluster:newman_eigen_corecomplex:newman_eigen'
    and subcluster_id is not null
    order by protein_group_id
    ''', 
    engine
)
# retain only the first row for each protein group
clusters = clusters.groupby(['protein_group_id']).first().reset_index()
clusters.shape

In [None]:
# the protein groups in subcluster 0 of cluster 1
clusters.loc[(clusters.cluster_id == 1) & (clusters.subcluster_id == 0)]

In [None]:
# the number of rows (one per protein group) in each cluster,
# and the ids of the clusters that contain more than one protein group
cluster_sizes = clusters.groupby('cluster_id').count().reset_index()
cluster_sizes.loc[cluster_sizes.protein_group_id > 1].cluster_id.values

In [None]:
# a combined '<cluster_id>-<subcluster_id>' identifier for each row
clusters['ids'] = [
    '%s-%s' % id_pair for id_pair in zip(clusters.cluster_id, clusters.subcluster_id)
]

In [None]:
# number of protein groups in more than one cluster-subcluster combination
n = clusters.groupby('protein_group_id').nunique().sort_values(by='ids')
(n.cluster_id > 1).sum()

In [None]:
# number of clusters with more than one subcluster
# (`n` holds the number of distinct values of each column for each cluster)
n = clusters.groupby('cluster_id').nunique()
(n.subcluster_id > 1).sum()

In [None]:
# number of subcluster ids that occur in more than one cluster
# (`n` holds the number of distinct values of each column for each subcluster id)
n = clusters.groupby('subcluster_id').nunique()
(n.cluster_id > 1).sum()

In [None]:
# look up the cluster and subcluster of a single protein group by its id
clusters.loc[
    clusters.protein_group_id == '214906eb06e98662ad3e290bc9d63d4e183b1e4e691677956c5f1b79221c7f6c'
]