In [1]:
# Provenance: print the wall-clock time at which the notebook was run.
!date

Mon Jan 23 11:09:42 PST 2023


In [2]:
# Provenance: print the path of the active conda environment.
!echo $CONDA_PREFIX

/c4/home/derek/miniconda3/envs/squint_2


In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import sys
import os

In [5]:
# Barcode list for sample GW16_2. The file has no header row, so columns are
# labelled by position; column 0 is what later cells use to select cells.
barcodes = pd.read_csv(
    '/c4/home/derek/data1/derek/data_scSLR/prenatal_brain/GW16_2_barcodes.csv',
    header=None,
)

In [6]:
# Directory holding the collision-table CSVs; every entry in it is read by the
# aggregation loop. The trailing slash matters because paths are built by
# string concatenation.
input_dir = '/c4/home/derek/data1/HDD-2/derek/SLR_temp/GW16_2/split/collision_tables/'

In [7]:
# Directory the aggregated collision table is written to (trailing slash required,
# since the output path is built by string concatenation).
output_dir = '/c4/home/derek/data1/derek/data_scSLR/prenatal_brain/'

In [None]:
%%time
collision_table = pd.DataFrame()

for file in os.listdir(input_dir):
    df = pd.read_csv(input_dir+file, index_col=0)
    
    #remove non gene aligned reads 
    df = df[df['gene_ID'] != '-']
    
    ##remove polyG beads
    df = df[df['bead_ID'] != 1]
    
    #include only reads aligning to cells
    df = df[df.cell.isin(barcodes[0])]
    
    #get per-gene values
    df = df.groupby('gene_ID')['collision'].value_counts(normalize=True).unstack().fillna(0)
    
    #remove index name
    df = df.rename_axis(None, axis=1)
    
    collision_table = pd.concat([collision_table,df])
    
    del df 
    print(file+" processed")

    

chr22.barcoded.sort.bam_collision_table.csv processed
chr18.barcoded.sort.bam_collision_table.csv processed
chr5.barcoded.sort.bam_collision_table.csv processed
chr15.barcoded.sort.bam_collision_table.csv processed


In [None]:
# Persist the aggregated per-gene table as CSV (the file name has no extension).
collision_table_path = os.path.join(output_dir, 'collision_table')
collision_table.to_csv(collision_table_path)

In [None]:
import scanpy as sc

In [None]:
# Directory containing matrix.mtx, barcodes.tsv and features.tsv for GW16_2
# (the path suggests STARsolo GeneFull raw output — confirm). Trailing slash required.
sc_input_dir = '/c4/home/derek/data1/derek/data_scSLR/prenatal_brain/STAR_outs/GW16_2/StarOut/Solo.out/GeneFull/raw/'

In [None]:
# Row/column labels for the count matrix: first column of each headerless file.
barcode_labels = pd.read_csv(sc_input_dir+'barcodes.tsv',header=None)[0]
feature_ids = pd.read_csv(sc_input_dir+'features.tsv',header=None,sep='\t')[0]

# The matrix is transposed after reading so that observations are labelled with
# barcodes and variables with the feature identifiers.
adata = sc.read_mtx(sc_input_dir + 'matrix.mtx').T
adata.obs_names = barcode_labels
adata.var_names = feature_ids

In [None]:
# Keep only the observations whose barcode appears in column 0 of `barcodes`.
# Slicing an AnnData yields a view of the parent object; .copy() materialises the
# subset so that the in-place normalisation applied to it afterwards operates on
# an independent object instead of forcing an implicit view-to-copy conversion.
adata = adata[adata.obs_names.isin(barcodes[0])].copy()

In [None]:
# Show the AnnData summary after restricting to the listed barcodes.
adata

In [None]:
# Normalise total counts per observation to target_sum=1e4, then apply log1p.
# NOTE(review): neither result is assigned, so both calls modify `adata` in place;
# re-running this cell without rebuilding `adata` would apply them a second time.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


In [None]:
# Per-gene mean/dispersion statistics. inplace=False makes scanpy return the
# statistics instead of writing them into adata.var.
df_HVG = sc.pp.highly_variable_genes(
    adata,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
    inplace=False,
)


In [None]:
# Preview the first rows of the highly-variable-gene statistics.
df_HVG.head()

In [None]:
# Mean vs. dispersion for every gene, drawn as small borderless points.
sns.scatterplot(
    data=df_HVG,
    x='means',
    y='dispersions',
    s=1,
    linewidth=0,
)

In [None]:
# Lookup from the first column of features.tsv to its second column. The first
# column is what adata.var_names is populated from, and the mapped values end up
# in the 'gene_name' column. The file is parsed once instead of once per column.
features = pd.read_csv(sc_input_dir+'features.tsv',header=None,sep='\t')
ID_dict = dict(zip(features[0], features[1]))

In [None]:
# Label the statistics rows with the variable names of `adata`, which the
# statistics were computed from. This replaces the index of df_HVG in place.
df_HVG.index = adata.var_names 

In [None]:
# Translate each index label through ID_dict; labels without an entry map to NaN.
df_HVG['gene_name'] = df_HVG.index.map(ID_dict)

In [None]:
# Preview again now that the index and the gene_name column have been set.
df_HVG.head()

In [None]:
# Restrict the statistics to genes that also have a row in collision_table.
in_collision_table = df_HVG.index.isin(collision_table.index)
df_HVG_ = df_HVG[in_collision_table]

In [None]:
# Inspect the statistics restricted to genes present in collision_table.
df_HVG_

In [None]:
# Reorder (and subset) collision_table rows to follow df_HVG_. Every label in
# df_HVG_.index was selected for being present in collision_table.index.
# NOTE(review): reindex raises if collision_table.index contains duplicate labels.
collision_table = collision_table.reindex(df_HVG_.index)

In [None]:
# Inspect collision_table after aligning its rows to df_HVG_.
collision_table

In [None]:
# Clear the name of the column axis. The loop that builds collision_table applies
# the same rename to each per-file piece, so this is presumably a no-op safeguard.
collision_table = collision_table.rename_axis(None, axis=1)

In [None]:
# Place the gene statistics and the collision columns side by side; with axis=1
# the rows are aligned on the index labels.
df_collisions = pd.concat([df_HVG_,collision_table],axis=1)

In [None]:
# Inspect the combined table.
df_collisions

In [None]:
# Rename all eight columns by position. This raises if the frame does not have
# exactly eight columns.
# NOTE(review): assumes the last two columns are the collision==False and
# collision==True fractions, in that order, and that the first five match the
# column order scanpy returned — TODO confirm.
df_collisions.columns = ['means','dispersions','mean_bin','dispersions_norm', 'highly_variable','gene_name','False_rate','True_rate']

In [None]:
# Order genes by ascending True_rate.
df_collisions = df_collisions.sort_values(by='True_rate')

In [None]:
# Gene-length table: tab-separated, first column used as the index. Its 'mean'
# and 'merged' columns are the ones copied into df_collisions further down.
lengths_transcripts = pd.read_csv(
    '/c4/home/derek/data1/HDD-2/derek/reference/human_hp3_reference/genes.gtf.genelength',
    sep='\t',
    index_col=0,
)

In [None]:
# Inspect the gene-length table as loaded.
lengths_transcripts

In [None]:
# Drop the index name that came from the file's first-column header (in place).
lengths_transcripts.index.name = None 

In [None]:
# Inspect again after clearing the index name.
lengths_transcripts

In [None]:
# Keep only the length rows for genes that appear in df_collisions.
has_collision_data = lengths_transcripts.index.isin(df_collisions.index)
lengths_transcripts = lengths_transcripts[has_collision_data]

In [None]:
# Inspect the length table after restricting it to genes in df_collisions.
lengths_transcripts

In [None]:
# Keep only genes that have an entry in the length table.
# .copy() detaches the filtered frame from its parent: new columns are assigned
# to df_collisions afterwards, and assigning into a boolean-filtered slice makes
# pandas emit SettingWithCopyWarning.
has_length = df_collisions.index.isin(lengths_transcripts.index)
df_collisions = df_collisions[has_length].copy()

In [None]:
# Inspect df_collisions after restricting it to genes with a length entry.
df_collisions

In [None]:
# Put the length rows in the same order as df_collisions.
# NOTE(review): reindex raises if lengths_transcripts.index has duplicate labels.
lengths_transcripts = lengths_transcripts.reindex(df_collisions.index)

In [None]:
# Copy the two length columns across; assignment aligns on the index labels.
for source_col, target_col in (('mean', 'length_mean'), ('merged', 'length_merged')):
    df_collisions[target_col] = lengths_transcripts[source_col]

In [None]:
# Copy of True_rate capped at 0.1 (values above 0.1 become 0.1); used as the hue
# of the second scatter plot.
df_collisions['True_clipped'] = df_collisions['True_rate'].clip(upper=0.1)

In [None]:
# Order rows by index label, replacing the earlier ordering by True_rate.
df_collisions = df_collisions.sort_index()

In [None]:
# Inspect the final table used for plotting.
df_collisions

In [None]:
# Mean vs. dispersion per gene, coloured by True_rate, with a colorbar in place
# of seaborn's discrete hue legend.
plt.rcParams['figure.figsize'] = 5,4
fig, axes = plt.subplots(1,1)

# One colour scale, spanning the observed True_rate range, shared by the points
# and the colorbar. Passing it as hue_norm makes that agreement explicit rather
# than relying on seaborn's default normalisation matching this one.
norm = plt.Normalize(df_collisions['True_rate'].min(), df_collisions['True_rate'].max())

sns.scatterplot(data=df_collisions,
                x='means',
                y='dispersions',
                s=15,
                linewidth=0,
                hue='True_rate',
                hue_norm=norm,
                palette='Spectral_r',
                alpha=1,
                legend=False,   # the colorbar below replaces the legend
                ax=axes,
               )

# Stand-alone mappable carrying the colormap and norm for the colorbar.
sm = plt.cm.ScalarMappable(cmap="Spectral_r",norm=norm)
sm.set_array([])

# ax= is required here: the mappable is not attached to any Axes, so Matplotlib
# cannot infer which Axes to take the colorbar's space from.
# NOTE(review): True_rate is a fraction in [0, 1]; the label says "percent" — confirm wording.
fig.colorbar(sm, ax=axes, shrink=.25, aspect=6,
             label='mean percent collisions')

axes.set_xlabel('mean expression \n (normalized)')
axes.set_ylabel('Dispersion')

#fig.savefig('collision_rates.pdf')

In [None]:
# Genes whose True_rate exceeds 0.5.
df_collisions.loc[df_collisions['True_rate'] > .5]

In [None]:
# Mean vs. dispersion per gene, coloured by True_clipped (True_rate capped at
# 0.1), with a colorbar in place of seaborn's discrete hue legend.
plt.rcParams['figure.figsize'] = 5,4
fig, axes = plt.subplots(1,1)

# One colour scale, spanning the observed True_clipped range, shared by the
# points and the colorbar. Passing it as hue_norm makes that agreement explicit
# rather than relying on seaborn's default normalisation matching this one.
norm = plt.Normalize(df_collisions['True_clipped'].min(), df_collisions['True_clipped'].max())

sns.scatterplot(data=df_collisions,
                x='means',
                y='dispersions',
                s=10,
                linewidth=0,
                hue='True_clipped',
                hue_norm=norm,
                palette='viridis',
                alpha=1,
                legend=False,   # the colorbar below replaces the legend
                ax=axes,
               )

# Stand-alone mappable carrying the colormap and norm for the colorbar.
sm = plt.cm.ScalarMappable(cmap="viridis",norm=norm)
sm.set_array([])

# ax= is required here: the mappable is not attached to any Axes, so Matplotlib
# cannot infer which Axes to take the colorbar's space from.
# NOTE(review): the plotted values are fractions; the label says "percent" — confirm wording.
fig.colorbar(sm, ax=axes, shrink=.25, aspect=4,
             label='mean percent collisions \n 0.1 clipped')

axes.set_xlabel('mean expression \n (normalized)')
axes.set_ylabel('Dispersion')


In [None]:
# Provenance: list the packages and versions installed in the active environment.
!conda list