In [2]:
import pandas as pd
from Bio.Seq import Seq
import numpy as np
from sgRNAalign_util import *

### Prepare sgRNA alignment file

In [2]:
# Convert the non-targeting sgRNA list from CSV to FASTA so it can be used as
# aligner input. `convert_csv_to_fasta` is presumably provided by the
# `sgRNAalign_util` star-import -- TODO confirm its expected CSV columns.
convert_csv_to_fasta('genome/GWCD4CRISPRi_nontargeting_sgRNA_list.csv', 'genome/GWCD4CRISPRi_nontargeting_sgRNA_list.fa')

Reading from 'genome/GWCD4CRISPRi_nontargeting_sgRNA_list.csv'...
Successfully converted 992 records.
FASTA file saved as 'genome/GWCD4CRISPRi_nontargeting_sgRNA_list.fa'


bowtie2 -x "/Users/rzhu/Gladstone Dropbox/Ronghui Zhu/GRNPerturbSeq/2_files/hg38_genome/hg38" \
        -f genome/GWCD4CRISPRi_nontargeting_sgRNA_list.fa \
        -S results/GWCD4CRISPRi_nontargeting_sgRNA_alignment.sam \
        --end-to-end \
        -D 20 \
        -R 3 \
        -L 5 \
        -N 0 \
        -k 100 \
        -i S,1,0.50 \
        --score-min "C,0,0" \
        --no-1mm-upfront \
        --no-unal

992 reads; of these:
  992 (100.00%) were unpaired; of these:
    978 (98.59%) aligned 0 times
    12 (1.21%) aligned exactly 1 time
    2 (0.20%) aligned >1 times
1.41% overall alignment rate

### Extract alignment info

In [3]:
# Load the alignment SAM file into a DataFrame. Later cells read its
# 'chromosome', 'pos' and 'strand' columns row by row using integer labels
# 0..n-1. `sam_to_dataframe` is presumably provided by the `sgRNAalign_util`
# star-import -- TODO confirm whether 'pos' is kept 1-based (as in SAM) or
# converted to 0-based.
alignment_df = sam_to_dataframe('results/GWCD4CRISPRi_nontargeting_sgRNA_alignment.sam')

In [None]:
# Check whether each alignment is flanked by an NGG PAM in the reference genome.
# Every alignment starts out flagged as having a PAM; the loop below clears the
# flag where the three reference bases next to the alignment do not end in GG.
alignment_df['PAM'] = True

import os

from pyfaidx import Fasta

# Load the reference genome.
# Fail with an exception rather than exit(): in a notebook exit() shuts the
# kernel down, and an explicit existence check does not depend on which
# exception type pyfaidx raises for a missing file.
genome_fasta_path = '../../../2_files/hg38_genome/hg38.fa'
if not os.path.exists(genome_fasta_path):
    raise FileNotFoundError("Error: hg38.fa not found. Please download the reference genome.")
genome = Fasta(genome_fasta_path)

# Flag aligned sgRNAs that have no PAM.
# NOTE(review): the offsets below assume `pos` is the 0-based leftmost reference
# coordinate of a 19-nt alignment -- TODO confirm against sam_to_dataframe.
for i in range(len(alignment_df)):
    # Define the genomic coordinates
    chromosome = alignment_df.loc[i, 'chromosome']
    pos = alignment_df.loc[i, 'pos']
    if alignment_df.loc[i, 'strand'] == '+':
        # Forward strand: the three bases immediately after the alignment.
        start_position = pos + 19
        end_position = start_position + 3
        sequence = genome[chromosome][start_position:end_position]
    else:
        # Reverse strand: the three bases immediately before the alignment,
        # read on the opposite strand.
        start_position = pos - 3
        end_position = pos
        sequence = genome[chromosome][start_position:end_position]
        sequence = sequence.reverse.complement

    # NGG: the first base is free, the last two must be G. The reference may be
    # soft-masked (lower case), so compare case-insensitively.
    if sequence.seq[1:].upper() != 'GG':
        alignment_df.loc[i, 'PAM'] = False

In [30]:
# Keep only the alignments still flagged as having a PAM, as an independent
# frame re-labelled 0..n-1 so later cells can address rows by position.
ntc_sgRNA_df = alignment_df.loc[alignment_df['PAM']].copy().reset_index(drop=True)

## Check whether any gene TSS lies nearby (within 2 kb or 30 kb)

In [31]:
# Load the gene annotation table used to look up genes near each sgRNA.
genes_df_subset = pd.read_parquet('genome/genes_df_subset_for_sgRNA_annotation.parquet')

# Give both tables a 'chromosome_norm' column with the 'chr' text stripped, so
# chromosomes compare equal regardless of naming style.
for frame in (ntc_sgRNA_df, genes_df_subset):
    frame['chromosome_norm'] = frame['chromosome'].str.replace('chr', '')

In [32]:
# Display the PAM-filtered alignments, now including 'chromosome_norm'.
ntc_sgRNA_df

Unnamed: 0,sgRNA,chromosome,pos,seq,strand,PAM,chromosome_norm
0,NTC-424,chr10,109193861,CAGGATCAGGGTGTATGGC,+,True,10


In [36]:
dist1 = 2000   # bp window reported in the 'nearest_within2kb_*' columns
dist2 = 30000  # bp window reported in the 'nearest_within30kb_*' columns


def _nearest_gene_within(genes_with_distance, max_dist):
    """Return (gene_id, gene_name, distance) of the closest gene within max_dist.

    Parameters
    ----------
    genes_with_distance : pandas.DataFrame
        Gene rows with 'gene_id', 'gene_name' and a 'distance' column.
    max_dist : int or float
        Largest distance (inclusive) at which a gene still counts as nearby.

    Returns
    -------
    tuple
        (gene_id, gene_name, distance) of the row with the smallest distance,
        or (nan, nan, nan) when no row is within max_dist.
    """
    nearby_genes = genes_with_distance[genes_with_distance['distance'] <= max_dist]
    if nearby_genes.empty:
        return np.nan, np.nan, np.nan
    nearest_idx = nearby_genes['distance'].idxmin()
    return (
        nearby_genes.loc[nearest_idx, 'gene_id'],
        nearby_genes.loc[nearest_idx, 'gene_name'],
        nearby_genes.loc[nearest_idx, 'distance'],
    )


annotation_columns = [
    'nearest_within2kb_gene_id',
    'nearest_within2kb_gene_name',
    'nearest_within2kb_gene_dist',
    'nearest_within30kb_gene_id',
    'nearest_within30kb_gene_name',
    'nearest_within30kb_gene_dist',
]

annotation_rows = []
for _, row in ntc_sgRNA_df.iterrows():
    # Find nearest genes within certain distance on the same chromosome.
    sgrna_chrom = row['chromosome_norm']
    sgrna_pos = row['pos']

    same_chrom_genes = genes_df_subset[genes_df_subset['chromosome_norm'] == sgrna_chrom].copy()

    # Distance from the sgRNA to a gene = distance to its closest TSS.
    # A gene with no TSS entries gets NaN and therefore never passes a window.
    same_chrom_genes['distance'] = same_chrom_genes['tss'].apply(
        lambda tss_list: min((abs(sgrna_pos - tss) for tss in tss_list), default=np.nan)
    )

    annotation_rows.append(
        _nearest_gene_within(same_chrom_genes, dist1)
        + _nearest_gene_within(same_chrom_genes, dist2)
    )

# Build all annotation columns at once (aligned on the sgRNA index) instead of
# writing cell by cell, which would first create float NaN columns and then
# try to store strings in them.
annotations = pd.DataFrame(annotation_rows, columns=annotation_columns, index=ntc_sgRNA_df.index)
for column in annotation_columns:
    ntc_sgRNA_df[column] = annotations[column]

In [37]:
# Display the table with the nearest-gene annotation columns added.
ntc_sgRNA_df

Unnamed: 0,sgRNA,chromosome,pos,seq,strand,PAM,chromosome_norm,nearest_within2kb_gene_id,nearest_within2kb_gene_name,nearest_within2kb_gene_dist,nearest_within30kb_gene_id,nearest_within30kb_gene_name,nearest_within30kb_gene_dist
0,NTC-424,chr10,109193861,CAGGATCAGGGTGTATGGC,+,True,10,,,,,,
