In [None]:
import vpolo

In [None]:
# Knee plot input: per-barcode read counts from the alevin output directory
import pandas as pd
cb_freq = pd.read_csv(
    "/input_dir/corona_analysis/alignment_out/kidney_scRNA_control/salmon_out_jain_s1/alevin/raw_cb_frequency.txt",
    sep="\t",
    header=None,  # the file carries no header row; column names are supplied below
    names=["cell-barcode", "read_freq"],
    index_col=0,  # first column (the barcode) becomes the index
)
                      
                      
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np


In [None]:
def generKneePlot(cb_freq,
                  save2disk=False,
                  prefix_name="scRNA"):
    """Build an interactive knee plot of reads per cell barcode.

    Barcodes with 100 reads or fewer are dropped, the remaining ones are
    ranked from most to least abundant, and log10(read count) is drawn
    against that rank.

    Parameters
    ----------
    cb_freq : pandas.DataFrame
        Indexed by cell barcode, with a numeric ``read_freq`` column.
    save2disk : bool, optional
        When true, the light-themed figure is written to
        ``<prefix_name>_knee_cb.html`` before it is restyled for live viewing.
    prefix_name : str, optional
        Path/filename prefix of the HTML output.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure restyled with a dark, transparent theme at 600 px width.

    Raises
    ------
    KeyError
        If ``cb_freq`` has no ``read_freq`` column.
    """
    # Toss those cell barcodes with 100 reads or fewer
    cb_freq = cb_freq[cb_freq["read_freq"] > 100]
    # A knee plot is rank vs. abundance, so rank the barcodes explicitly.
    # The stable sort leaves an already-ranked input (and ties) untouched.
    cb_freq = cb_freq.sort_values("read_freq", ascending=False, kind="mergesort")
    read_counts = cb_freq["read_freq"]

    ## 2D scatter for knee cell-barcode plot ##
    # The axes live directly on the layout: `scene` only configures 3D
    # subplots and is ignored by go.Scatter. `showbackground` is a 3D-only
    # axis option and is therefore not used here.
    # Tick labels are kept so the knee position can be read off the axes.
    xaxis = dict(
        showline=False,
        zeroline=False,
        showgrid=False,
        title=dict(text="Unique Cell-barcodes"))
    # y is already log10-transformed below, so the axis itself stays linear;
    # a log axis on top would log-transform the counts twice.
    yaxis = dict(
        showline=False,
        zeroline=False,
        showgrid=False,
        title=dict(text="log10 number of reads"))
    layout = go.Layout(
        xaxis=xaxis,
        yaxis=yaxis,
        title=dict(text='Knee plot of Cell barcodes',
                   y=0.9,
                   x=0.5,
                   font=dict(size=30)),
        legend=dict(x=.8, y=1),
        showlegend=False,
        width=1200,
        height=800
    )

    # One hover label per plotted barcode ("<barcode>:<reads>"). Iterating
    # index and values together avoids integer-key lookups on a
    # barcode-indexed Series and covers every point, including the last one.
    hover_text = ["%s:%s" % (barcode, reads)
                  for barcode, reads in zip(read_counts.index, read_counts)]

    knee_trace = go.Scatter(
        x=list(range(len(read_counts))),
        y=np.log10(read_counts),
        name='Knee plot of Cell barcodes',
        mode="lines+markers",
        marker=go.scatter.Marker(color="red",
                                 opacity=1,
                                 size=5,
                                 line=dict(width=.4,
                                           color='grey')),
        text=hover_text,
        hoverinfo="text"
    )

    #Render output
    fig = go.Figure(data=[knee_trace], layout=layout)
    fig.update_layout(template="plotly_white+xgridoff+ygridoff")
    if save2disk:
        # The saved HTML keeps the light theme and the full 1200x800 size
        pio.write_html(fig, file=prefix_name + '_knee_cb.html', auto_open=False)

    #Set up background for live viewing
    fig.update_layout(template="plotly_dark+xgridoff+ygridoff",
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      width=600, margin=dict(l=0, r=0, b=0, t=20))
    return fig

In [None]:
# Build the knee plot and also write it to disk under the default "scRNA" prefix
knee_out = generKneePlot(cb_freq=cb_freq, save2disk=True)


In [None]:
import parser

In [None]:
kidney_s1_control = parser.read_quants_bin("/input_dir/corona_analysis/alignment_out/kidney_scRNA_control/salmon_out/")


In [None]:
# Download the digital gene expression (DGE) table published with the paper for GEO sample GSM4008621.
# NOTE(review): wget saves the gzipped file into the current working directory, whereas the
# following cell reads an uncompressed .txt from .../paper_out/ - presumably the file was
# moved and gunzipped by hand; confirm.
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4008nnn/GSM4008621/suppl/GSM4008621_Adult-Kidney4-1_dge.txt.gz

In [None]:

# Load the tab-separated expression table distributed with the paper;
# the first row supplies the column names and the first column the row index
kidney_s1_control_paper = pd.read_csv(
    "/input_dir/corona_analysis/alignment_out/kidney_scRNA_control/paper_out/GSM4008621_Adult-Kidney4-1_dge.txt",
    sep="\t",
    header=0,
    index_col=0,
)



In [None]:
import plotly.express as px

# Violin plot of a single gene's values taken from the quantification table
df = kidney_s1_control.loc[:, "ENSG00000198938.2"]
fig = px.violin(data_frame=df, y="ENSG00000198938.2")
fig.show()


In [None]:
# Five columns with the highest mean value, listed in ascending order of that mean
kidney_s1_control.mean(axis=0).sort_values().iloc[-5:]


In [None]:
import numpy as np
#Simple function to return rgb rainbow values for given categories
def rainbow_cols(cats, palette=None):
    """Map every entry of ``cats`` to a colour, one colour per distinct category.

    Distinct categories are taken in ``np.unique`` (sorted) order and assigned
    palette colours in turn; once the palette is exhausted it starts over, so
    any number of categories is supported regardless of the palette length.

    Parameters
    ----------
    cats : sequence
        Category label of each point (e.g. cluster ids).
    palette : sequence of colour strings, optional
        Colours to cycle through. Defaults to plotly's sequential ``Rainbow``
        scale (``px.colors.sequential.Rainbow``).

    Returns
    -------
    list
        One colour per element of ``cats``, in the same order.

    Raises
    ------
    ValueError
        If ``palette`` is empty.
    """
    if palette is None:
        palette = px.colors.sequential.Rainbow
    palette = list(palette)
    if not palette:
        raise ValueError("palette must contain at least one colour")

    uniq_cats = np.unique(cats)
    # Wrap around with a modulo instead of assuming a fixed palette length
    colour_of = {cat: palette[pos % len(palette)]
                 for pos, cat in enumerate(uniq_cats)}
    return [colour_of[cat] for cat in cats]


In [None]:
import plotly.graph_objects as go
import plotly.io as pio


def generateUMAPplot(scRNA_inframe,
                     save2disk=False,
                     prefix_name="",
                     hgl_gene="none"):
    """Draw a 3D UMAP scatter with points coloured by cluster label.

    Parameters
    ----------
    scRNA_inframe : pandas.DataFrame (or any column mapping)
        Must provide the columns ``UMAP_1``, ``UMAP_2``, ``UMAP_3`` and
        ``UMAP_hdbscan_labels``.
    save2disk : bool, optional
        When true, the light-themed figure is written to
        ``<prefix_name>_UMAP_genes.html`` before it is restyled for live viewing.
    prefix_name : str, optional
        Path/filename prefix of the HTML output.
    hgl_gene : str, optional
        Accepted for a planned "colour by gene" feature; currently unused.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure restyled with a dark, transparent theme at 600 px width.
    """
    # TODO: add top-5 gene labels per cluster and colouring by `hgl_gene`
    ## 3D Scatter for UMAP ##
    axis = dict(
        showline=False,
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        showbackground=False,
        ticks='',
        title="UMAP dim")
    layout = go.Layout(
        scene=dict(
            xaxis=axis,
            yaxis=axis,
            zaxis=axis),
        title=dict(text='UMAP of gene <br> clustered w/ HDBSCAN',
                   y=0.9,
                   x=0.5,
                   font=dict(size=30)),
        bargap=0.2,
        bargroupgap=0.1,
        legend=dict(x=.8, y=1),
        showlegend=False,
        width=1400,
        height=900
    )

    hdb_labels = scRNA_inframe["UMAP_hdbscan_labels"]
    hdb_labels_cols = rainbow_cols(hdb_labels)
    # hoverinfo="text" displays only what is passed as `text`; without it the
    # points show nothing on hover, so expose each point's cluster label.
    hover_text = ["cluster %s" % label for label in hdb_labels]

    UMAP_trace = go.Scatter3d(
        x=scRNA_inframe["UMAP_1"],
        y=scRNA_inframe["UMAP_2"],
        z=scRNA_inframe["UMAP_3"],
        name='UMAP of scRNA-seq genes',
        mode="markers",
        marker=go.scatter3d.Marker(color=hdb_labels_cols,
                                   opacity=1,
                                   size=5,
                                   line=dict(width=.4,
                                             color='grey')),
        text=hover_text,
        hoverinfo="text"
    )

    #Render output
    fig = go.Figure(data=[UMAP_trace], layout=layout)
    fig.update_layout(template="plotly_white+xgridoff+ygridoff")
    if save2disk:
        # The saved HTML keeps the light theme and the full 1400x900 size
        pio.write_html(fig, file=prefix_name + '_UMAP_genes.html', auto_open=False)

    #Set up background for live viewing
    fig.update_layout(template="plotly_dark+xgridoff+ygridoff",
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      width=600, margin=dict(l=0, r=0, b=0, t=20))
    return fig

In [None]:
def generatePercVar(var_out,
                    save2disk=False,
                    prefix_name=""):
    """Line plot of the cumulative variance explained per principal component.

    Parameters
    ----------
    var_out : pandas.DataFrame
        One row per principal component, with a ``Percentage_variance``
        column holding the cumulative explained variance.
    save2disk : bool, optional
        When true, the light-themed figure is written to
        ``<prefix_name>_PCA_variance.html`` before it is restyled for live viewing.
    prefix_name : str, optional
        Path/filename prefix of the HTML output.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure restyled with a dark, transparent theme at 600 px width.
    """
    perc_explained = var_out["Percentage_variance"]
    ## Cumulative percent variance explained by PCA line plot ##
    layout = go.Layout(
        title=dict(text='Explained variance by <br> different principal components',
                   y=0.9,
                   x=0.5,
                   font=dict(size=30)),
        # The x axis carries the PC labels, not a variance
        xaxis=dict(
            title='Principal Component'
        ),
        yaxis=dict(
            title='Explained variance in percent'
        ),
        bargap=0.2,
        bargroupgap=0.1,
        legend=dict(x=.8, y=1)
    )

    # One label per value: "PC 1" .. "PC n" (inclusive upper bound, so the
    # last component is not left without an x position)
    pc_labels = ['PC %s' % i for i in range(1, len(perc_explained) + 1)]
    pca_trace = dict(
        type='scatter',
        x=pc_labels,
        y=perc_explained,
        name='Cumulative'
    )

    fig = go.Figure(data=[pca_trace], layout=layout)
    fig.update_layout(template="plotly_white+ygridoff")
    if save2disk:
        pio.write_html(fig, file=prefix_name + '_PCA_variance.html', auto_open=False)

    #Set up background for live viewing
    fig.update_layout(template="plotly_dark+xgridoff+ygridoff",
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      width=600)
    return fig


In [None]:
def generatePropVar(var_out,
                    save2disk=False,
                    prefix_name=""):
    """Scree plot: proportion of variance explained by each principal component.

    Parameters
    ----------
    var_out : pandas.DataFrame
        One row per principal component, with a ``Proportion_variance``
        column holding that component's share of the variance.
    save2disk : bool, optional
        When true, the light-themed figure is written to
        ``<prefix_name>_PCA_scree.html`` before it is restyled for live viewing.
    prefix_name : str, optional
        Path/filename prefix of the HTML output.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure restyled with a dark, transparent theme at 1200 px width.
    """
    prop_var = var_out["Proportion_variance"]

    layout = go.Layout(
        title=dict(text='Scree Plot of scRNA-seq PCA',
                   y=0.95,
                   x=0.5,
                   font=dict(size=30)),
        xaxis=dict(
            title='Principal Component'
        ),
        yaxis=dict(
            title='Proportion of Variance Explained'
        ),
        bargap=0.2,
        bargroupgap=0.1,
        legend=dict(x=.8, y=1)
    )

    # One label per value: "PC 1" .. "PC n" (inclusive upper bound, so the
    # last component is not left without an x position)
    pc_labels = ['PC %s' % i for i in range(1, len(prop_var) + 1)]
    pca_trace = dict(
        type='scatter',
        x=pc_labels,
        y=prop_var,
        name='Per PC variance'
    )

    fig = go.Figure(data=[pca_trace], layout=layout)
    fig.update_layout(template="plotly_white+ygridoff")
    if save2disk:
        pio.write_html(fig, file=prefix_name + '_PCA_scree.html', auto_open=False)

    #Set up background for live viewing
    fig.update_layout(template="plotly_dark+xgridoff+ygridoff",
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      width=1200)
    return fig

In [None]:
import re
import pandas as pd
import os
import sys
import numpy as np
import umap
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler
import hdbscan


In [None]:

#in_mat_exp = kidney_s1_control_paper.transpose().to_numpy()
in_mat_exp = kidney_s1_control.to_numpy()

#Run PCA using scikit
# PCA cannot extract more components than the smaller matrix dimension,
# so cap the requested 200 to keep small inputs from raising.
num_pcs = min(200, *in_mat_exp.shape)
X_std = StandardScaler().fit_transform(in_mat_exp)
# Fixed seed: scikit-learn may pick a randomized SVD solver for large inputs
sklearn_pca = sklearnPCA(n_components=num_pcs, random_state=42)
Y_sklearn = sklearn_pca.fit_transform(X_std)
prop_var = sklearn_pca.explained_variance_ratio_
# explained_variance_ratio_ is a 0-1 fraction; scale the running total to a
# true percentage so it matches the "Percentage_variance" column name and the
# "in percent" axis it is plotted on.
perc_explained = 100 * np.cumsum(prop_var)
var_out = pd.DataFrame({"Percentage_variance":perc_explained, "Proportion_variance":prop_var})
#Save PCs to txt file for later viewing
#PCs_out = pd.DataFrame(sklearn_pca.components_[0:num_pcs,:], columns=kidney_s1_control.columns,index = ["PC-" + str(i) for i in range(1,num_pcs+1)])
#PCs_out.to_csv(prefix_name+"_PCs.txt", sep='\t', header=True, index=False)



In [None]:
# Scree plot; the HTML copy is written next to the given prefix
generatePropVar(var_out=var_out, prefix_name="/data/Euplotid/Scree_controlKidney", save2disk=True)

In [None]:
# Cumulative-variance plot; the HTML copy is written next to the given prefix
generatePercVar(var_out=var_out, prefix_name="/data/Euplotid/CumVar_controlKidney", save2disk=True)

In [None]:
# Embed the PCA scores in three UMAP dimensions (fixed seed for repeatability)
umap_embedding = umap.UMAP(
    n_components=3,
    n_neighbors=3,
    min_dist=0.05,
    metric='minkowski',
    random_state=42,
).fit_transform(Y_sklearn[:, :num_pcs])

# Cluster the embedded points with HDBSCAN; one label per embedded row
hdb_labels = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
).fit_predict(umap_embedding)

# Gather the first two PCs, the three UMAP coordinates and the cluster labels
# in a single frame (rows follow the order of Y_sklearn)
PC_embeds = pd.DataFrame({
    "PC_1": Y_sklearn[:, 0],
    "PC_2": Y_sklearn[:, 1],
    "UMAP_1": umap_embedding[:, 0],
    "UMAP_2": umap_embedding[:, 1],
    "UMAP_3": umap_embedding[:, 2],
    "UMAP_hdbscan_labels": hdb_labels,
})
#kidney_s1_control = pd.concat([ kidney_s1_control, PC_embeds], axis=1)
#Output calculated files
#var_out.to_csv(prefix_name+"_PCA_exp.txt", sep="\t", header=True, index=False)


In [None]:
# Render the 3D UMAP scatter and also write it to disk as HTML
generateUMAPplot(scRNA_inframe=PC_embeds, prefix_name="/data/Euplotid/UMAP_controlKidney", save2disk=True)