In [5]:
import pandas as pd
from Bio.Seq import Seq
import numpy as np
from sgRNAalign_util import *

### Prepare sgRNA alignment file

In [7]:
# Convert the targeting-sgRNA CSV into a FASTA file (helper comes from sgRNAalign_util);
# the FASTA is the read input (-f) of the bowtie2 alignment documented below.
convert_csv_to_fasta('genome/GWCD4CRISPRi_targeting_sgRNA_list.csv', 'genome/GWCD4CRISPRi_targeting_sgRNA_list.fa')

Reading from 'genome/GWCD4CRISPRi_targeting_sgRNA_list.csv'...
Successfully converted 25512 records.
FASTA file saved as 'genome/GWCD4CRISPRi_targeting_sgRNA_list.fa'


bowtie2 -x "/Users/rzhu/Gladstone Dropbox/Ronghui Zhu/GRNPerturbSeq/2_files/hg38_genome/hg38" \
        -f genome/GWCD4CRISPRi_targeting_sgRNA_list.fa \
        -S results/GWCD4CRISPRi_targeting_sgRNA_alignment.sam \
        --end-to-end \
        -D 20 \
        -R 3 \
        -L 5 \
        -N 0 \
        -k 100 \
        -i S,1,0.50 \
        --score-min "C,0,0" \
        --no-1mm-upfront \
        --no-unal \
        --un results/unaligned_targeting_sgrnas.fa

25512 reads; of these:
  25512 (100.00%) were unpaired; of these:
    7 (0.03%) aligned 0 times
    23993 (94.05%) aligned exactly 1 time
    1512 (5.93%) aligned >1 times
99.97% overall alignment rate

### Extract alignment info

In [2]:
# Parse the bowtie2 SAM output into a DataFrame (helper from sgRNAalign_util).
# Later cells read the columns 'sgRNA', 'chromosome', 'pos', 'strand' and 'seq' from it.
# NOTE(review): the PAM lookup further down treats 'pos' as a 0-based start — confirm
# that sam_to_dataframe converts the 1-based SAM POS field accordingly.
alignment_df = sam_to_dataframe('results/GWCD4CRISPRi_targeting_sgRNA_alignment.sam')

In [3]:
# Count how many alignment records each sgRNA has, then split the table on that count.
sgrna_counts = alignment_df['sgRNA'].value_counts()
unique_sgrna_ids = sgrna_counts.index[(sgrna_counts == 1).to_numpy()]
multi_sgrna_ids = sgrna_counts.index[(sgrna_counts > 1).to_numpy()]

# sgRNAs with exactly one alignment: PAM is assumed present until checked, not flagged
unique_sgrna_df = (
    alignment_df.loc[alignment_df['sgRNA'].isin(unique_sgrna_ids)]
    .assign(note='unique alignment', PAM=True, flag=False)
    .reset_index(drop=True)
)

# sgRNAs with more than one alignment (no 'flag' column yet; it is added later)
multi_sgrna_df = (
    alignment_df.loc[alignment_df['sgRNA'].isin(multi_sgrna_ids)]
    .assign(note='multi alignment', PAM=True)
    .reset_index(drop=True)
)

In [66]:
# Number of distinct sgRNAs with more than one alignment
multi_sgrna_df['sgRNA'].nunique(dropna=False)

1512

In [67]:
# Number of distinct sgRNAs with exactly one alignment
unique_sgrna_df['sgRNA'].nunique(dropna=False)

23993

### First, filter by whether a PAM sequence (NGG) follows the protospacer

In [4]:
from pyfaidx import Fasta

# Load the reference genome.
# Raise instead of calling exit(): exit() tears down the notebook kernel, and pyfaidx
# signals a missing file with its own IOError subclass, which `except FileNotFoundError`
# does not catch — NOTE(review): confirm against the installed pyfaidx version.
try:
    genome = Fasta('genome/hg38.fa')
except OSError as err:
    raise FileNotFoundError(
        "Error: hg38.fa not found. Please download the reference genome."
    ) from err


def has_ngg_pam(chromosome, pos, strand):
    """Return True if the 3 bases following the protospacer read N-G-G.

    Parameters
    ----------
    chromosome : str
        Key of the sequence in `genome`.
    pos : int
        Alignment start as stored in the alignment table.
        NOTE(review): the slices below assume a 0-based start of a 19-nt match — confirm.
    strand : str
        '+' looks 19 bases downstream of `pos`; any other value is treated as the
        reverse strand and reads the 3 bases upstream of `pos`, reverse-complemented.

    The comparison is case-insensitive so soft-masked (lower-case) bases still count.
    """
    if strand == '+':
        start_position = pos + 19
        pam = genome[chromosome][start_position:start_position + 3]
    else:
        pam = genome[chromosome][pos - 3:pos].reverse.complement
    return pam.seq[1:].upper() == 'GG'


def pam_mask(df):
    """Boolean Series (aligned to df.index) marking rows whose alignment has an NGG PAM."""
    hits = [
        has_ngg_pam(chromosome, pos, strand)
        for chromosome, pos, strand in zip(df['chromosome'], df['pos'], df['strand'])
    ]
    return pd.Series(hits, index=df.index, dtype=bool)


# Flag uniquely aligned sgRNA that has no PAM
unique_lacks_pam = ~pam_mask(unique_sgrna_df)
unique_sgrna_df.loc[unique_lacks_pam, 'PAM'] = False
unique_sgrna_df.loc[unique_lacks_pam, 'flag'] = True

# For multi aligned sgRNA, only include those with PAM.
# Boolean filtering keeps the original row labels, exactly like dropping rows one by one.
temp_df = multi_sgrna_df[pam_mask(multi_sgrna_df)].copy()

In [5]:
# sgRNAs whose alignments were all removed by the PAM filter (none of their hits has a PAM)
set(multi_sgrna_df['sgRNA'].unique()).difference(temp_df['sgRNA'].unique())

{'NBPF14-2'}

In [6]:
# Re-count alignments per sgRNA now that alignments without a PAM are gone
sgrna_counts = temp_df['sgRNA'].value_counts()

# Get more uniquely aligned sgRNA: exactly one PAM-bearing alignment left
unique_sgrna_ids = sgrna_counts[sgrna_counts == 1].index
unique_sgrna_df_from_multi_after_PAMfilter = temp_df[temp_df['sgRNA'].isin(unique_sgrna_ids)].copy()
unique_sgrna_df_from_multi_after_PAMfilter['note'] = 'unique alignment'
unique_sgrna_df_from_multi_after_PAMfilter['flag'] = False

# Get multi-aligned sgRNA after PAM filtering
multi_sgrna_ids = sgrna_counts[sgrna_counts > 1].index
multi_sgrna_df_after_PAMfilter = temp_df[temp_df['sgRNA'].isin(multi_sgrna_ids)].copy()

# sgRNAs that lost every alignment to the PAM filter are kept as multi alignments with
# PAM=False. They are derived from the data rather than hard-coded; on the run recorded
# in this notebook the set is exactly {'NBPF14-2'}.
no_pam_sgrna_ids = set(multi_sgrna_df['sgRNA'].unique()) - set(temp_df['sgRNA'].unique())
no_pam_rows = multi_sgrna_df['sgRNA'].isin(no_pam_sgrna_ids)
multi_sgrna_df.loc[no_pam_rows, 'PAM'] = False
multi_sgrna_df_after_PAMfilter = pd.concat(
    [multi_sgrna_df_after_PAMfilter, multi_sgrna_df[no_pam_rows]]
).reset_index(drop=True)
len(multi_sgrna_df_after_PAMfilter.sgRNA.unique())

1018

### For the remaining multi alignments, filter by whether a gene is nearby (within 30kb) or the alignment lies within a gene body

In [7]:
# Gene annotation table used for all proximity checks below. Later cells read the
# columns 'gene_id', 'gene_name', 'chromosome', 'strand', 'tss', 'cds', 'gene_start'
# and 'gene_end' from it.
# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
genes_df_subset = pd.read_pickle('genome/genes_df_subset_for_sgRNA_annotation.pkl')

In [8]:
# Check nearby gene: evaluate the helper once per alignment row
multi_sgrna_df_after_PAMfilter['gene_nearby'] = multi_sgrna_df_after_PAMfilter.apply(
    lambda alignment: is_gene_nearby(alignment, genes_df_subset),
    axis=1,
)

# Keep only the alignments for which the helper reported a nearby gene
has_gene_nearby = multi_sgrna_df_after_PAMfilter['gene_nearby']
multi_sgrna_df_after_genePAMfilter = (
    multi_sgrna_df_after_PAMfilter.loc[has_gene_nearby].reset_index(drop=True)
)

In [9]:
# sgRNAs that lost every alignment to the nearby-gene filter (expected to be empty)
set(multi_sgrna_df_after_PAMfilter['sgRNA'].unique()).difference(
    multi_sgrna_df_after_genePAMfilter['sgRNA'].unique()
)

set()

In [10]:
# Re-count alignments per sgRNA after the nearby-gene filter
sgrna_counts = multi_sgrna_df_after_genePAMfilter['sgRNA'].value_counts()

# Get more uniquely aligned sgRNA after nearby gene filtering
unique_sgrna_ids = sgrna_counts[sgrna_counts == 1].index
unique_sgrna_df_from_multi_after_genePAMfilter = multi_sgrna_df_after_genePAMfilter[multi_sgrna_df_after_genePAMfilter['sgRNA'].isin(unique_sgrna_ids)].copy()
unique_sgrna_df_from_multi_after_genePAMfilter['note'] = 'unique alignment'
# Same rule as for the directly unique alignments: a unique alignment without a PAM is
# flagged. Rows re-added with PAM=False after the PAM filter can reach this point.
unique_sgrna_df_from_multi_after_genePAMfilter['flag'] = ~unique_sgrna_df_from_multi_after_genePAMfilter['PAM'].astype(bool)

# Get multi-aligned sgRNA after nearby gene filtering
multi_sgrna_ids = sgrna_counts[sgrna_counts > 1].index
multi_sgrna_df_filtered = multi_sgrna_df_after_genePAMfilter[multi_sgrna_df_after_genePAMfilter['sgRNA'].isin(multi_sgrna_ids)].copy()
multi_sgrna_df_filtered = multi_sgrna_df_filtered.reset_index(drop=True)
# Boolean True, not the string 'True': every other cell stores booleans in 'flag',
# and a string here would leave the final column with mixed types.
multi_sgrna_df_filtered['flag'] = True

In [11]:
# Stack the three sources of uniquely aligned sgRNAs, discard the helper column that
# only the gene-filtered frame carries, and renumber the rows.
unique_sgrna_df_all = (
    pd.concat([
        unique_sgrna_df,
        unique_sgrna_df_from_multi_after_PAMfilter,
        unique_sgrna_df_from_multi_after_genePAMfilter,
    ])
    .drop(columns='gene_nearby')
    .reset_index(drop=True)
)

In [12]:
# Number of distinct sgRNAs that are still multi-aligned after both filters
multi_sgrna_df_filtered['sgRNA'].nunique(dropna=False)

238

In [13]:
# Number of distinct sgRNAs treated as uniquely aligned
unique_sgrna_df_all['sgRNA'].nunique(dropna=False)

25267

### Annotate uniquely aligned sgRNA

In [None]:
# Add sgRNA information: attach the library annotation to every alignment row
sgRNA = pd.read_pickle('genome/GWCD4CRISPRi_sgRNA_list_updated.pkl')

# Alignment side; the aligned sequence is renamed so it cannot collide with the
# library sequence column created below
df1 = unique_sgrna_df_all[
    ['sgRNA', 'chromosome', 'pos', 'seq', 'strand', 'PAM', 'note', 'flag']
].rename(columns={'seq': 'seq_last19bp'})

# Library side, with the designed-target columns given explicit names
df2 = sgRNA[
    ['name', 'sgRNA', 'target_gene_name_from_sgRNA', 'gene_id', 'corrected_target_gene_name']
].rename(columns={
    'sgRNA': 'seq',
    'gene_id': 'designed_target_gene_id',
    'corrected_target_gene_name': 'designed_target_gene_name',
})

# Left join keeps every alignment row; the join key from the library is then redundant
merged_df = df1.merge(df2, how='left', left_on='sgRNA', right_on='name').drop(columns='name')
unique_sgrna_df_all = merged_df.copy()

#### Check whether near target genes

In [15]:
# Check whether target gene is within certain distance (2kb)
# Temporary chromosome key without the 'chr' text so both tables use the same naming
for frame in (unique_sgrna_df_all, genes_df_subset):
    frame['chromosome_norm'] = frame['chromosome'].str.replace('chr', '')

# Bring in the designed target gene's annotation when it lies on the aligned chromosome
gene_annotation_cols = ['gene_id', 'gene_name', 'chromosome_norm', 'tss', 'cds', 'gene_start', 'gene_end']
merged_df = unique_sgrna_df_all.merge(
    genes_df_subset[gene_annotation_cols],
    how='left',
    left_on=['designed_target_gene_id', 'chromosome_norm'],
    right_on=['gene_id', 'chromosome_norm'],
)

primary_target_cols = ['target_gene_id', 'target_gene_name', 'distance_to_closest_target_tss']
merged_df[primary_target_cols] = merged_df.apply(find_closest_target_info, axis=1)

# Rows where the helper confirmed the designed gene as the target
match_condition = merged_df['designed_target_gene_id'].eq(merged_df['target_gene_id'])
merged_df.loc[match_condition, 'note'] = 'unique alignment, aligned to target gene'

#### Check other scenarios

In [16]:
# If not check if other genes are within certain distance (2kb)
nearby_gene_cols = ['nearby_gene_within_2kb', 'nearby_gene_within_30kb']
for nearby_col, max_distance in zip(nearby_gene_cols, (2000, 30000)):
    merged_df[nearby_col] = merged_df.apply(find_nearby_genes, args=(genes_df_subset, max_distance), axis=1)

nearest_gene_cols = [
    f'nearest_within2kb_{gene_kind}_{field}'
    for gene_kind in ('gene', 'nontarget_gene')
    for field in ('id', 'name', 'dist')
]
merged_df[nearest_gene_cols] = merged_df.apply(find_nearest_genes, args=(genes_df_subset,2000), axis=1)

# All masks below only look at columns that the note/flag updates never touch,
# so they can be computed up front and applied afterwards in the original order.
final_no_target_condition = merged_df['target_gene_id'].isna()

# Primary target not found, but at least one other gene within 2000bp
condition = final_no_target_condition & (merged_df['nearby_gene_within_2kb'].apply(len) != 0)

# Primary target not found, but alignment is near a cds
cds_hit_condition = final_no_target_condition & merged_df.apply(
    lambda row: is_near_cds_start(row['pos'], row['cds']), axis=1
)

# Primary target not found, no cds hit, but inside the target gene body
gene_body_condition = (
    final_no_target_condition
    & ~cds_hit_condition
    & merged_df['pos'].between(merged_df['gene_start'], merged_df['gene_end'])
)

# Later rules overwrite earlier ones, so the order of this list matters
for rule_mask, rule_note in [
    (condition, 'unique alignment, target gene mismatch'),
    (cds_hit_condition, 'unique alignment, aligned to a cds but far from tss'),
    (gene_body_condition, 'unique alignment, within target gene body but far from any tss or cds'),
]:
    merged_df.loc[rule_mask, 'note'] = rule_note
    merged_df.loc[rule_mask, 'flag'] = True

# Final flagging: every row without a target is flagged; rows still carrying the
# generic note get the explicit "no target" note
merged_df.loc[final_no_target_condition, 'flag'] = True
note_is_generic = merged_df['note'] == 'unique alignment'
merged_df.loc[final_no_target_condition & note_is_generic, 'note'] = 'unique alignment, no target gene found'

# Clean up temporary columns from original dataframes
del unique_sgrna_df_all['chromosome_norm']
del genes_df_subset['chromosome_norm']
final_df = merged_df[list(unique_sgrna_df_all.columns) + primary_target_cols + nearby_gene_cols + nearest_gene_cols]
unique_sgrna_df_all = final_df.copy()

#### Annotate putative bidirectional promoter

In [112]:
unique_sgrna_df_all['putative_bidirectional_promoter'] = False

# If a sgRNA sits on a putative bidirectional promoter, there should be >=2 nearby genes
more_than_2nearby = unique_sgrna_df_all['nearby_gene_within_2kb'].apply(len) > 1


def gene_orientation(sgrna_pos, gene_list, genes_by_id=None):
    """Split nearby genes into left-pointing and right-pointing candidates.

    Parameters
    ----------
    sgrna_pos : int
        Alignment position of the sgRNA.
    gene_list : list
        Gene ids to classify; each must be a label of the gene table's index.
    genes_by_id : DataFrame, optional
        Gene table indexed by gene id with 'strand', 'gene_start' and 'gene_end'
        columns. Defaults to the global `genes_df_subset`, which must then already
        be indexed by 'gene_id' (this is how the later cells call the function).

    Returns
    -------
    (left, right) : tuple of lists
        left  – genes on the '-' strand whose gene_start is below sgrna_pos
        right – genes on the '+' strand whose gene_end is above sgrna_pos
        Genes matching neither rule are omitted.
    """
    if genes_by_id is None:
        genes_by_id = genes_df_subset
    left = []
    right = []
    for gene in gene_list:
        strand = genes_by_id.loc[gene, 'strand']
        if strand == '-' and genes_by_id.loc[gene, 'gene_start'] < sgrna_pos:
            left.append(gene)
        elif strand == '+' and genes_by_id.loc[gene, 'gene_end'] > sgrna_pos:
            right.append(gene)
    return left, right


# Use an indexed copy for the lookups so genes_df_subset itself is never left in a
# half-modified state if a lookup fails.
genes_by_id = genes_df_subset.set_index('gene_id')

# If there is a gene on the left pointing to left, and a gene on the right pointing to right
# sgRNA might be on a bidirectional promoter.
# Collect the row labels first and write them back in one .loc assignment, instead of
# editing a filtered copy and assigning the whole slice back.
candidates = unique_sgrna_df_all.loc[more_than_2nearby]
bidirectional_labels = []
for label, pos, nearby_genes in zip(candidates.index, candidates['pos'], candidates['nearby_gene_within_2kb']):
    left_gene, right_gene = gene_orientation(pos, nearby_genes, genes_by_id)
    if left_gene and right_gene:
        bidirectional_labels.append(label)

if bidirectional_labels:
    unique_sgrna_df_all.loc[bidirectional_labels, 'putative_bidirectional_promoter'] = True

In [22]:
# Persist the annotated table of uniquely aligned sgRNAs
pd.to_pickle(unique_sgrna_df_all, 'results/unique_sgrna_df_final.pkl')

### Annotate multiple aligned sgRNA

In [17]:
# Add sgRNA information (uses the `sgRNA` library table loaded in an earlier cell)
alignment_cols = ['sgRNA', 'chromosome', 'pos', 'seq', 'strand', 'PAM', 'note', 'flag']
library_cols = ['name', 'sgRNA', 'target_gene_name_from_sgRNA', 'gene_id', 'corrected_target_gene_name']
library_renames = {
    'sgRNA': 'seq',
    'gene_id': 'designed_target_gene_id',
    'corrected_target_gene_name': 'designed_target_gene_name',
}

df1 = multi_sgrna_df_filtered[alignment_cols].rename(columns={'seq': 'seq_last19bp'})
df2 = sgRNA[library_cols].rename(columns=library_renames)

# Left join: every alignment row is kept even if the library has no matching name
merged_df = pd.merge(df1, df2, how='left', left_on='sgRNA', right_on='name')
merged_df = merged_df.drop(columns='name')
multi_sgrna_df_filtered = merged_df.copy()

#### Check whether near target gene

In [18]:
# Check whether target gene is within certain distance (2kb)
# Temporary join key: chromosome name with the 'chr' text removed, on both tables
multi_sgrna_df_filtered['chromosome_norm'] = multi_sgrna_df_filtered['chromosome'].str.replace('chr', '')
genes_df_subset['chromosome_norm'] = genes_df_subset['chromosome'].str.replace('chr', '')

target_annotation = genes_df_subset[
    ['gene_id', 'gene_name', 'chromosome_norm', 'tss', 'cds', 'gene_start', 'gene_end']
]
merged_df = multi_sgrna_df_filtered.merge(
    target_annotation,
    how='left',
    left_on=['designed_target_gene_id', 'chromosome_norm'],
    right_on=['gene_id', 'chromosome_norm'],
)

primary_target_cols = ['target_gene_id', 'target_gene_name', 'distance_to_closest_target_tss']
merged_df[primary_target_cols] = merged_df.apply(find_closest_target_info, axis=1)

# Alignments for which the helper confirmed the designed gene as target
match_condition = merged_df['designed_target_gene_id'].eq(merged_df['target_gene_id'])
merged_df.loc[match_condition, 'note'] = 'multi alignment, aligned to target gene'

#### Check other scenarios

In [None]:
# If not check if other genes are within certain distance (2kb)
nearby_gene_cols = ['nearby_gene_within_2kb', 'nearby_gene_within_30kb']
merged_df['nearby_gene_within_2kb'] = merged_df.apply(find_nearby_genes, args=(genes_df_subset,2000), axis=1)
merged_df['nearby_gene_within_30kb'] = merged_df.apply(find_nearby_genes, args=(genes_df_subset,30000), axis=1)

nearest_gene_cols = ['nearest_within2kb_gene_id',
                     'nearest_within2kb_gene_name',
                     'nearest_within2kb_gene_dist',
                     'nearest_within2kb_nontarget_gene_id',
                     'nearest_within2kb_nontarget_gene_name',
                     'nearest_within2kb_nontarget_gene_dist']
merged_df[nearest_gene_cols] = merged_df.apply(find_nearest_genes, args=(genes_df_subset,2000), axis=1)

# Rows for which no primary target was found; every rule below is restricted to them
final_no_target_condition = merged_df['target_gene_id'].isna()

# ... but some other gene lies within 2000bp
has_gene_within_2kb = merged_df['nearby_gene_within_2kb'].apply(len) != 0
condition = final_no_target_condition & has_gene_within_2kb

# ... and the alignment is near a cds
near_cds = merged_df.apply(lambda row: is_near_cds_start(row['pos'], row['cds']), axis=1)
cds_hit_condition = final_no_target_condition & near_cds

# ... and, without a cds hit, the alignment falls inside the target gene body
inside_gene_body = (merged_df['pos'] >= merged_df['gene_start']) & (merged_df['pos'] <= merged_df['gene_end'])
gene_body_condition = final_no_target_condition & ~cds_hit_condition & inside_gene_body

# Apply the notes in this order: a later rule overwrites an earlier one.
# Only the gene-body rule sets the flag here; the final step flags all no-target rows.
note_rules = (
    (condition, 'multi alignment, target gene mismatch', False),
    (cds_hit_condition, 'multi alignment, aligned to a cds but far from tss', False),
    (gene_body_condition, 'multi alignment, within target gene body but far from any tss or cds', True),
)
for rule_mask, rule_note, set_flag in note_rules:
    merged_df.loc[rule_mask, 'note'] = rule_note
    if set_flag:
        merged_df.loc[rule_mask, 'flag'] = True

# Final flagging
merged_df.loc[final_no_target_condition, 'flag'] = True
note_is_generic = merged_df['note'] == 'multi alignment'
merged_df.loc[final_no_target_condition & note_is_generic, 'note'] = 'multi alignment, no target gene found'

# Clean up temporary columns from original dataframes
del multi_sgrna_df_filtered['chromosome_norm']
del genes_df_subset['chromosome_norm']
final_df = merged_df[list(multi_sgrna_df_filtered.columns) + primary_target_cols + nearby_gene_cols + nearest_gene_cols]
multi_sgrna_df_filtered = final_df.copy()

#### Annotate bidirectional promoter

In [None]:
multi_sgrna_df_filtered['putative_bidirectional_promoter'] = False

# If a sgRNA sits on a putative bidirectional promoter, there should be >=2 nearby genes
more_than_2nearby = multi_sgrna_df_filtered['nearby_gene_within_2kb'].apply(len) > 1
candidates = multi_sgrna_df_filtered.loc[more_than_2nearby]

# Prepare genes_df_subset: gene_orientation looks genes up by id in the global table
genes_df_subset.set_index('gene_id', inplace=True)
try:
    # If there is a gene on the left pointing to left, and a gene on the right pointing to right
    # sgRNA might be on a bidirectional promoter
    bidirectional_labels = []
    for label, pos, nearby_genes in zip(candidates.index, candidates['pos'], candidates['nearby_gene_within_2kb']):
        left_gene, right_gene = gene_orientation(pos, nearby_genes)
        if left_gene and right_gene:
            bidirectional_labels.append(label)
finally:
    # Revert genes_df_subset even if a lookup failed, so later cells still find 'gene_id'
    genes_df_subset.reset_index(names=['gene_id'], inplace=True)

# One label-based write instead of editing a filtered copy and assigning the slice back
if bidirectional_labels:
    multi_sgrna_df_filtered.loc[bidirectional_labels, 'putative_bidirectional_promoter'] = True

#### Resolve multi alignment

In [126]:
# Resolve chrX and chrY multi alignment
# An sgRNA that aligns at the same position and strand on chrX and chrY is collapsed
# into a single 'chrX/Y' row built from the chrX row.
key_cols = ['sgRNA', 'pos', 'strand']
duplicated_sgrna = multi_sgrna_df_filtered[multi_sgrna_df_filtered.duplicated('sgRNA', keep=False)]
grouped = duplicated_sgrna.groupby(key_cols)

resolved_rows = []
resolved_labels = set()  # row labels of the chrX/chrY rows that were merged

for _, group in grouped:
    # The group must have exactly 2 rows, one for chrX and one for chrY
    is_chrX = 'chrX' in group['chromosome'].values
    is_chrY = 'chrY' in group['chromosome'].values
    if not (len(group) == 2 and is_chrX and is_chrY):
        continue

    row_x = group[group['chromosome'] == 'chrX'].iloc[0]
    row_y = group[group['chromosome'] == 'chrY'].iloc[0]
    new_row = row_x.copy()
    new_row['chromosome'] = 'chrX/Y'

    if pd.isnull(row_x['target_gene_id']) and pd.isnull(row_y['target_gene_id']):
        new_row['note'] = 'multi alignment, no target gene found on chrX/Y'
    else:
        # NaN-aware maximum: the builtin max() returns NaN whenever the chrX distance is
        # NaN, which would discard a valid chrY distance.
        target_dists = pd.to_numeric(
            pd.Series([row_x['distance_to_closest_target_tss'], row_y['distance_to_closest_target_tss']]),
            errors='coerce',
        )
        new_row['distance_to_closest_target_tss'] = target_dists.max()
        new_row['target_gene_id'] = row_x['designed_target_gene_id']
        # The row is copied from chrX; if the target was only found on chrY, take its name
        if pd.isnull(new_row['target_gene_name']):
            new_row['target_gene_name'] = row_y['target_gene_name']
        new_row['note'] = 'multi alignment, target gene on chrX/Y'
        new_row['flag'] = False

    # Union of nearby genes from both rows; sorted so the output is reproducible
    for nearby_col in ('nearby_gene_within_2kb', 'nearby_gene_within_30kb'):
        new_row[nearby_col] = sorted(set(row_x[nearby_col]) | set(row_y[nearby_col]), key=str)

    # For the nearest gene and the nearest non-target gene, keep the values of whichever
    # row has the smaller distance (missing distances are ignored). Each prefix is checked
    # against its own distance column.
    for prefix in ('nearest_within2kb_gene', 'nearest_within2kb_nontarget_gene'):
        dists = pd.to_numeric(group[f'{prefix}_dist'], errors='coerce')
        if dists.notna().any():
            closest_label = dists.idxmin()
            for field in ('id', 'name', 'dist'):
                new_row[f'{prefix}_{field}'] = group.loc[closest_label, f'{prefix}_{field}']

    new_row['putative_bidirectional_promoter'] = (row_x['putative_bidirectional_promoter']) | (row_y['putative_bidirectional_promoter'])

    resolved_rows.append(new_row.to_frame().T)

    # Remember the original chrX and chrY rows so they are excluded from the unresolved set
    resolved_labels.update([row_x.name, row_y.name])

unresolved_indices = [label for label in multi_sgrna_df_filtered.index if label not in resolved_labels]
unresolved_df = multi_sgrna_df_filtered.loc[unresolved_indices].copy()
if resolved_rows:
    resolved_df = pd.concat(resolved_rows, ignore_index=True)
else:
    # pd.concat([]) raises; fall back to an empty frame with the same columns
    resolved_df = multi_sgrna_df_filtered.iloc[0:0].copy()

In [128]:
# Resolve other multi alignments
# Every remaining multi-aligned sgRNA is collapsed to one representative ("primary") row;
# the chromosome/position of its other alignments are stored in two list-valued columns.
final_resolved_rows = []
# Explicit object dtype so list values can be written into single cells
unresolved_df['other_alignment_chromosome'] = pd.Series(index=unresolved_df.index, dtype=object)
unresolved_df['other_alignment_pos'] = pd.Series(index=unresolved_df.index, dtype=object)


def _choose_primary_alignment(group):
    """Pick the representative alignment of one sgRNA.

    Returns (row label, note); note is None when the row's existing note is kept.
    Priority:
      1. first row noted as aligned to the target gene
      2. first row with a target gene id (safety net; previously this case was not
         handled and silently reused the primary row of the preceding sgRNA)
      3. row with the closest gene within 2kb -> 'target gene mismatch'
      4. first row noted as near a cds
      5. first row noted as inside the target gene body
      6. first row -> 'no target gene found'
    """
    target_note = 'multi alignment, aligned to target gene'
    mismatch_note = 'multi alignment, target gene mismatch'
    cds_note = 'multi alignment, aligned to a cds but far from tss'
    gene_body_note = 'multi alignment, within target gene body but far from any tss or cds'
    no_target_note = 'multi alignment, no target gene found'

    on_target = (group['note'] == target_note).to_numpy()
    if on_target.any():
        return group.index[on_target][0], None

    has_target = group['target_gene_id'].notna().to_numpy()
    if has_target.any():
        return group.index[has_target][0], None

    has_nearby = group['nearby_gene_within_2kb'].apply(
        lambda x: isinstance(x, list) and len(x) != 0
    ).to_numpy(dtype=bool)
    if has_nearby.any():
        dists = pd.to_numeric(group['nearest_within2kb_gene_dist'], errors='coerce')
        if dists.notna().any():
            return dists.idxmin(), mismatch_note
        # No usable distance: fall back to the first row that lists a nearby gene
        return group.index[has_nearby][0], mismatch_note

    for fallback_note in (cds_note, gene_body_note):
        noted = (group['note'] == fallback_note).to_numpy()
        if noted.any():
            return group.index[noted][0], fallback_note

    return group.index[0], no_target_note


for _, group in unresolved_df.groupby('sgRNA', group_keys=False):
    primary_label, primary_note = _choose_primary_alignment(group)
    primary_row = group.loc[[primary_label]].copy()
    other_rows = group.drop(primary_label)
    if primary_note is not None:
        primary_row['note'] = primary_note

    # Add new columns with information about the other alignments
    primary_row.at[primary_label, 'other_alignment_chromosome'] = list(other_rows['chromosome'])
    primary_row.at[primary_label, 'other_alignment_pos'] = list(other_rows['pos'])

    # Union of nearby genes over all alignments; sorted so the output is reproducible
    for nearby_col in ('nearby_gene_within_2kb', 'nearby_gene_within_30kb'):
        primary_row.at[primary_label, nearby_col] = sorted(
            set(group[nearby_col].explode().dropna().tolist()), key=str
        )

    # Nearest gene / nearest non-target gene: take the values from whichever alignment
    # has the smallest distance. Each kind is evaluated independently of the other.
    for prefix in ('nearest_within2kb_gene', 'nearest_within2kb_nontarget_gene'):
        dists = pd.to_numeric(group[f'{prefix}_dist'], errors='coerce')
        if dists.notna().any():
            closest_label = dists.idxmin()
            for field in ('id', 'name', 'dist'):
                primary_row[f'{prefix}_{field}'] = group.loc[closest_label, f'{prefix}_{field}']

    primary_row['putative_bidirectional_promoter'] = group['putative_bidirectional_promoter'].any()

    final_resolved_rows.append(primary_row.iloc[0])

final_resolved_df = pd.DataFrame(final_resolved_rows)
multi_sgrna_df_final = pd.concat([resolved_df, final_resolved_df])
multi_sgrna_df_final = multi_sgrna_df_final.reset_index(drop=True)

In [131]:
# Persist the resolved multi-alignment table
pd.to_pickle(multi_sgrna_df_final, 'results/multi_sgrna_df_final.pkl')

### Spot checking 7 unaligned sequences, all aligned to regions that are different in hg17 and hg18

In [33]:
# Manually curated alignments for the 7 sgRNAs that bowtie2 reported as unaligned.
# One record per sgRNA: (sgRNA, seq, chromosome, pos, strand, note, PAM, flag)
unaligned_records = [
    ('FDXR-1',   'AGGTTGCTGTTCCCAGCCA', 'chr17', 74872943,  '-',
     'hg17 alignment, CtoT at pos10', True, True),
    ('MUC12-1',  'CCTCCTGACGAGGGAAGAC', 'chr7',  100969575, '+',
     'hg17 alignment, TtoC at pos9, AtoG at pos13, TtoC at pos19', True, True),
    ('TYW1B-1',  'GCGTCGTGCAGATACTAGT', 'chr7',  72744713,  '-',
     'hg17 alignment, AtoG at pos6', True, True),
    ('NBPF14-1', 'AATATTTATCAAACAGAGG', 'chr1',  120451417, '+',
     'hg17 alignment, AtoG at pos19, multi alignment', True, True),
    ('MUC12-2',  'TCTTGGTCCCTCCTGACGA', 'chr7',  100969567, '+',
     'hg17 alignment, TtoC at pos17', False, True),
    ('SRGAP2-2', 'CACAGGTCCCGAGCCAACG', 'chr1',  206463364, '+',
     'hg17 alignment, AtoG at pos11', True, True),
    ('NBPF11-2', 'GCCTGCAGTCCTAAACGCT', 'chr1',  148152266, '-',
     'hg17 alignment, GtoT at pos19', True, True),
]
unaligned_sgrna_df = pd.DataFrame(
    unaligned_records,
    columns=['sgRNA', 'seq', 'chromosome', 'pos', 'strand', 'note', 'PAM', 'flag'],
)

In [34]:
# Add sgRNA information (uses the `sgRNA` library table loaded in an earlier cell)
# Alignment columns, with the aligned sequence renamed to free the name 'seq'
df1 = (
    unaligned_sgrna_df
    .loc[:, ['sgRNA', 'chromosome', 'pos', 'seq', 'strand', 'PAM', 'note', 'flag']]
    .rename(columns={'seq': 'seq_last19bp'})
)
# Library columns, renamed to the designed-target naming used downstream
df2 = (
    sgRNA
    .loc[:, ['name', 'sgRNA', 'target_gene_name_from_sgRNA', 'gene_id', 'corrected_target_gene_name']]
    .rename(columns={
        'sgRNA': 'seq',
        'gene_id': 'designed_target_gene_id',
        'corrected_target_gene_name': 'designed_target_gene_name',
    })
)
# 'left' merge keeps all records from the first DataFrame
merged_df = pd.merge(df1, df2, left_on='sgRNA', right_on='name', how='left')
merged_df = merged_df.drop(columns='name')
unaligned_sgrna_df = merged_df.copy()

In [35]:
# Check whether target gene is within certain distance (2kb)
# Temporary join key on both tables: chromosome name without the 'chr' text
unaligned_sgrna_df['chromosome_norm'] = unaligned_sgrna_df['chromosome'].str.replace('chr', '')
genes_df_subset['chromosome_norm'] = genes_df_subset['chromosome'].str.replace('chr', '')

designed_target_annotation = genes_df_subset[
    ['gene_id', 'gene_name', 'chromosome_norm', 'tss', 'cds', 'gene_start', 'gene_end']
]
merged_df = unaligned_sgrna_df.merge(
    designed_target_annotation,
    how='left',
    left_on=['designed_target_gene_id', 'chromosome_norm'],
    right_on=['gene_id', 'chromosome_norm'],
)

primary_target_cols = ['target_gene_id', 'target_gene_name', 'distance_to_closest_target_tss']
merged_df[primary_target_cols] = merged_df.apply(find_closest_target_info, axis=1)

# Append to (rather than replace) the manually written note for confirmed targets
match_condition = merged_df['designed_target_gene_id'].eq(merged_df['target_gene_id'])
merged_df.loc[match_condition, 'note'] += ', aligned to target gene'

In [36]:
# If not check if other genes are within certain distance (2kb)
nearby_gene_cols = ['nearby_gene_within_2kb', 'nearby_gene_within_30kb']
merged_df[nearby_gene_cols[0]] = merged_df.apply(find_nearby_genes, args=(genes_df_subset,2000), axis=1)
merged_df[nearby_gene_cols[1]] = merged_df.apply(find_nearby_genes, args=(genes_df_subset,30000), axis=1)

nearest_gene_cols = ['nearest_within2kb_gene_id',
                     'nearest_within2kb_gene_name',
                     'nearest_within2kb_gene_dist',
                     'nearest_within2kb_nontarget_gene_id',
                     'nearest_within2kb_nontarget_gene_name',
                     'nearest_within2kb_nontarget_gene_dist']
merged_df[nearest_gene_cols] = merged_df.apply(find_nearest_genes, args=(genes_df_subset,2000), axis=1)

# Identify rows where the primary target was not found, but the closest nearby gene is within 2000bp.
condition = (merged_df['target_gene_id'].isna()) & (merged_df['nearby_gene_within_2kb'].apply(len)!=0)

# Set the flag and note for these re-assigned rows.
# The finding is appended to the manually curated note, like every other rule in this
# cell. These sgRNAs were not aligned by bowtie2, so replacing the note with
# 'unique alignment, ...' would be wrong and would discard the recorded mismatches.
merged_df.loc[condition, 'note'] = merged_df.loc[condition, 'note'] + ', target gene mismatch'
merged_df.loc[condition, 'flag'] = True

# Check if near a cds
cds_hit_condition = (
    merged_df['target_gene_id'].isna() &
    merged_df.apply(lambda row: is_near_cds_start(row['pos'], row['cds']), axis=1)
)

merged_df.loc[cds_hit_condition, 'note'] = merged_df.loc[cds_hit_condition, 'note'] + ', aligned to a cds but far from tss'
merged_df.loc[cds_hit_condition, 'flag'] = True

# check whether within target gene body
gene_body_condition = (
    merged_df['target_gene_id'].isna() &
    ~cds_hit_condition &
    (merged_df['pos'] >= merged_df['gene_start']) &
    (merged_df['pos'] <= merged_df['gene_end'])
)
merged_df.loc[gene_body_condition, 'note'] = merged_df.loc[gene_body_condition, 'note'] + ', within target gene body but far from any tss or cds'
merged_df.loc[gene_body_condition, 'flag'] = True

# Clean up temporary columns from original dataframes
unaligned_sgrna_df.drop(columns=['chromosome_norm'], inplace=True)
genes_df_subset.drop(columns=['chromosome_norm'], inplace=True)
final_df = merged_df[list(unaligned_sgrna_df.columns) + primary_target_cols + nearby_gene_cols + nearest_gene_cols]
unaligned_sgrna_df = final_df.copy()

In [132]:
unaligned_sgrna_df['putative_bidirectional_promoter'] = False

# If a sgRNA sits on a putative bidirectional promoter, there should be >=2 nearby genes
more_than_2nearby = unaligned_sgrna_df['nearby_gene_within_2kb'].apply(len) > 1
candidates = unaligned_sgrna_df.loc[more_than_2nearby]

# Prepare genes_df_subset: gene_orientation looks genes up by id in the global table
genes_df_subset.set_index('gene_id', inplace=True)
try:
    # If there is a gene on the left pointing to left, and a gene on the right pointing to right
    # sgRNA might be on a bidirectional promoter
    bidirectional_labels = []
    for label, pos, nearby_genes in zip(candidates.index, candidates['pos'], candidates['nearby_gene_within_2kb']):
        left_gene, right_gene = gene_orientation(pos, nearby_genes)
        if left_gene and right_gene:
            bidirectional_labels.append(label)
finally:
    # Revert genes_df_subset even if a lookup failed, so later cells still find 'gene_id'
    genes_df_subset.reset_index(names=['gene_id'], inplace=True)

# One label-based write instead of editing a filtered copy and assigning the slice back
if bidirectional_labels:
    unaligned_sgrna_df.loc[bidirectional_labels, 'putative_bidirectional_promoter'] = True

In [37]:
# Columns for the other alignment sites; explicit object dtype so that a list can be
# stored in a single cell (all rows start out as missing).
unaligned_sgrna_df['other_alignment_chromosome'] = pd.Series(index=unaligned_sgrna_df.index, dtype=object)
unaligned_sgrna_df['other_alignment_pos'] = pd.Series(index=unaligned_sgrna_df.index, dtype=object)

# NBPF14-1 is the one manually curated sgRNA noted as multi-aligned. Look its row up by
# name instead of relying on it sitting at row label 3.
for row_label in unaligned_sgrna_df.index[unaligned_sgrna_df['sgRNA'] == 'NBPF14-1']:
    unaligned_sgrna_df.at[row_label, 'note'] = unaligned_sgrna_df.at[row_label, 'note'] + ', no target gene found'
    unaligned_sgrna_df.at[row_label, 'other_alignment_chromosome'] = ['chr1', 'chr1']
    unaligned_sgrna_df.at[row_label, 'other_alignment_pos'] = [120807845, 120814130]

In [135]:
# Persist the manually curated table of the sgRNAs bowtie2 could not align
pd.to_pickle(unaligned_sgrna_df, 'results/unaligned_sgrna_df_final.pkl')

In [136]:
# Stack the unique, multi-aligned and manually curated tables into one final table
# with a fresh 0..n-1 index, then write it out as pickle and CSV.
sgrna_df_final = pd.concat(
    [unique_sgrna_df_all, multi_sgrna_df_final, unaligned_sgrna_df],
    ignore_index=True,
)
pd.to_pickle(sgrna_df_final, 'results/sgrna_df_final.pkl')
sgrna_df_final.to_csv('results/sgrna_df_final.csv')