In [31]:
import gffutils
import pandas as pd
import os
from sgRNAalign_util import *

# Path to your GTF file (e.g., from GENCODE or Ensembl)
# NOTE(review): these are machine-specific absolute paths; point them at your own copy.
gtf_file = "/Users/rzhu/Gladstone Dropbox/Ronghui Zhu/GRNPerturbSeq/2_files/hg38_genome/gencode.v48.annotation.gtf.gz"
db_file = "/Users/rzhu/Gladstone Dropbox/Ronghui Zhu/GRNPerturbSeq/2_files/hg38_genome/gencode.v48.db"


def _first_attr(feature, key, default=''):
    """Return the first value of GTF attribute `key` on `feature`.

    The .attributes mapping holds the key-value pairs from the 9th column of
    the GTF, each stored as a list of values. `default` is returned when the
    attribute is absent (e.g., non-coding transcripts have no protein_id).
    """
    return feature.attributes.get(key, [default])[0]


if not os.path.exists(db_file):
    print(f"Database not found. Creating a new one from {gtf_file}...")
    # The force=True flag will overwrite an existing database if needed.
    # The disable_infer_* flags are recommended for well-formed GTFs like GENCODE.
    gffutils.create_db(gtf_file,
                      dbfn=db_file,
                      force=True,
                      keep_order=True,
                      disable_infer_genes=True,
                      disable_infer_transcripts=True)
    print("Database created successfully.")

print(f"Connecting to database: {db_file}")
db = gffutils.FeatureDB(db_file, keep_order=True)

# All three tables are written below this relative directory; make sure it exists.
os.makedirs("genome", exist_ok=True)

# --- Transcript features -------------------------------------------------
print("Extracting transcript features...")
records = []
for transcript in db.features_of_type('transcript', order_by='start'):
    # One flat dictionary per transcript; missing descriptive attributes become 'N/A'.
    record = {
        'chrom': transcript.chrom,
        'start': transcript.start,
        'end': transcript.end,
        'strand': transcript.strand,
        'transcript_id': _first_attr(transcript, 'transcript_id', 'N/A'),
        'gene_id': _first_attr(transcript, 'gene_id', 'N/A'),
        'gene_name': _first_attr(transcript, 'gene_name', 'N/A'),
        'transcript_type': _first_attr(transcript, 'transcript_type', 'N/A'),
        # A transcript may not have a protein_id (e.g., non-coding transcripts),
        # so the default here is the empty string.
        'protein_id': _first_attr(transcript, 'protein_id', ''),
    }
    records.append(record)

print(f"Extracted {len(records)} transcripts.")
df_transcripts = pd.DataFrame(records)
df_transcripts.to_csv("genome/transcript_annotations_hg38.csv", index=False)

# --- Gene features -------------------------------------------------------
records = []
for gene in db.features_of_type('gene', order_by='start'):
    record = {
        'gene_id': _first_attr(gene, 'gene_id', ''),
        'gene_name': _first_attr(gene, 'gene_name', ''),
        'chrom': gene.chrom,
        'start': gene.start,
        'end': gene.end,
        'strand': gene.strand,
        'gene_type': _first_attr(gene, 'gene_type', ''),
    }
    records.append(record)

# Convert to DataFrame
df_gene = pd.DataFrame(records)
df_gene.to_csv("genome/gene_annotations_hg38.csv", index=False)

# --- CDS features --------------------------------------------------------
# CDS rows are collected in their own list. Appending to `records` here would
# mix the gene rows gathered above into the CDS table and, downstream, into
# every per-gene CDS start/end aggregation.
cds_records = []
for cds in db.features_of_type('CDS', order_by='start'):
    record = {
        'chrom': cds.chrom,
        'start': cds.start,
        'end': cds.end,
        'strand': cds.strand,
        'transcript_id': _first_attr(cds, 'transcript_id', 'N/A'),
        'gene_id': _first_attr(cds, 'gene_id', 'N/A'),
        'gene_name': _first_attr(cds, 'gene_name', 'N/A'),
        'protein_id': _first_attr(cds, 'protein_id', ''),
    }
    cds_records.append(record)

print(f"Extracted {len(cds_records)} CDS entries.")
df_cds = pd.DataFrame(cds_records)
df_cds.to_csv("genome/cds_annotations_hg38.csv", index=False)

Connecting to database: genome/gencode.v48.db
Extracting transcript features...
Extracted 385669 transcripts.


In [5]:
# Load the hg38 annotation tables written by the extraction cell.
df_transcripts = pd.read_csv('genome/transcript_annotations_hg38.csv')
df_gene = pd.read_csv('genome/gene_annotations_hg38.csv')
df_cds = pd.read_csv('genome/cds_annotations_hg38.csv')

# Keep only the part of each gene_id before the first '.' (drops the version suffix)
# so the three tables share the same key.
for annotation_table in (df_transcripts, df_gene, df_cds):
    annotation_table['gene_id'] = annotation_table['gene_id'].str.split('.').str[0]

# Genes annotated as protein-coding.
is_pc_gene_row = df_gene['gene_type'] == 'protein_coding'
protein_coding_gene_ids = set(df_gene.loc[is_pc_gene_row, 'gene_id'])

# For protein-coding genes keep only transcripts typed 'protein_coding' or
# 'protein_coding_CDS_not_defined'; for all other genes keep every transcript.
belongs_to_pc_gene = df_transcripts['gene_id'].isin(protein_coding_gene_ids)
has_pc_transcript_type = df_transcripts['transcript_type'].isin(
    ['protein_coding', 'protein_coding_CDS_not_defined']
)
pc_transcripts = df_transcripts[belongs_to_pc_gene & has_pc_transcript_type]
non_pc_transcripts = df_transcripts[~belongs_to_pc_gene]

# Final transcript set used for aggregation: filtered protein-coding rows first,
# then the untouched non-protein-coding rows.
filtered_transcripts = pd.concat([pc_transcripts, non_pc_transcripts])

# Per gene: every transcript start/end (duplicates kept) ...
ts_aggregated = (
    filtered_transcripts
    .groupby('gene_id')
    .agg(ts_starts=('start', list), ts_ends=('end', list))
    .reset_index()
)

# ... and the distinct CDS segment starts/ends.
cds_aggregated = (
    df_cds
    .groupby('gene_id')
    .agg(cds_starts=('start', 'unique'), cds_ends=('end', 'unique'))
    .reset_index()
)

# Attach both aggregates to the gene table; genes without transcripts/CDS keep NaN.
merged_df = (
    df_gene
    .merge(ts_aggregated, on='gene_id', how='left')
    .merge(cds_aggregated, on='gene_id', how='left')
)

# Row-wise TSS and CDS summaries. calculate_tss / calculate_cds are not defined in
# this notebook view — presumably they come from the sgRNAalign_util star import.
# 'tss' is assigned first so that it is already present when calculate_cds runs.
merged_df['tss'] = merged_df.apply(calculate_tss, axis=1)
merged_df['cds'] = merged_df.apply(calculate_cds, axis=1)

# Final gene table: selected columns, with coordinate columns renamed for clarity.
genes_df = merged_df[
    ['gene_id', 'gene_name', 'chrom', 'start', 'end', 'strand', 'tss', 'gene_type', 'cds']
].rename(columns={'chrom': 'chromosome', 'start': 'gene_start', 'end': 'gene_end'})

In [6]:
# Map every sgRNA in the library to a gene_id via the gene name embedded in its name.
sgRNA = pd.read_csv('results/resuGWCD4CRISPRi_sgRNA_list.csv')
# The target name is the sgRNA name minus its last two characters
# (presumably a per-guide suffix — confirm against the library file).
sgRNA['target_gene_name_from_sgRNA'] = sgRNA['name'].str[:-2]

genes_info_to_merge = genes_df[['gene_name', 'gene_id']]
merged_df = sgRNA.merge(
    genes_info_to_merge,
    how='left',
    left_on='target_gene_name_from_sgRNA',
    right_on='gene_name',
)
final_df = merged_df.drop(columns=['gene_name'])

# Rows that appear more than once after the merge, i.e. sgRNAs whose target name
# matched several gene entries; sorted so each group can be inspected side by side.
duplicate_keys = ['name', 'target_gene_name_from_sgRNA']
duplicates = final_df[final_df.duplicated(subset=duplicate_keys, keep=False)]
sorted_duplicates = duplicates.sort_values(by=duplicate_keys)

# Row labels of the merged table chosen for removal after spot checking.
# They index the merge output directly, so they are only meaningful for the exact
# inputs used here — re-derive them if the sgRNA list or the annotation changes.
duplicate_row_labels = [
    11092, 23824, 862, 13631, 8949, 21689, 697, 13466, 1401, 14164, 8148, 20887,
    9235, 21972, 103, 12877, 8632, 21372, 10229, 22965, 8102, 20841, 2998, 15749,
    10377, 23112, 10510, 23245, 10517, 251, 13023, 9124, 21865, 12604, 25336,
    6324, 19070, 11799, 24529, 8366, 21105, 5454, 18203, 5645, 18393,
]
final_df = final_df.drop(index=duplicate_row_labels).reset_index(drop=True)
# Assignment aligns on the index; after the reset this is expected to line up
# row-for-row with sgRNA (assumes the drop left one row per sgRNA — TODO confirm).
sgRNA['gene_id'] = final_df['gene_id']

# Fill in gene id for deprecated gene names
oldgenename_geneid_association = pd.read_csv('oldgenename_geneid_association.csv')
gene_id_map = oldgenename_geneid_association.set_index('old_name')['gene_id']
gene_id_from_old_name = sgRNA['target_gene_name_from_sgRNA'].map(gene_id_map)
sgRNA['gene_id'] = sgRNA['gene_id'].fillna(gene_id_from_old_name)

# Write corrected gene name
gene_name_map = df_gene.set_index('gene_id')['gene_name']
sgRNA['corrected_target_gene_name'] = sgRNA['gene_id'].map(gene_name_map)
# For these entries the gene_id column holds the symbol itself, so the lookup
# above cannot resolve them; reuse the symbol as the corrected name.
for symbol_used_as_id in ('MTRNR2L1', 'MTRNR2L4', 'MTRNR2L8', 'OCLM'):
    sgRNA.loc[sgRNA['gene_id'] == symbol_used_as_id, 'corrected_target_gene_name'] = symbol_used_as_id
sgRNA.to_pickle('GWCD4CRISPRi_sgRNA_list_updated.pkl')

In [38]:
# Find non-protein-coding genes in sgRNA library
protein_coding_ids = set(df_gene.loc[df_gene['gene_type'] == 'protein_coding', 'gene_id'])
# sgRNA['gene_id'] comes from a left merge, so unmatched guides carry NaN (a float).
# Drop those and keep only string IDs before calling .startswith, otherwise the
# comprehension raises AttributeError on the first missing value.
non_protein_coding_genes = {
    gene_id
    for gene_id in set(sgRNA['gene_id'].dropna()) - protein_coding_ids
    if isinstance(gene_id, str) and gene_id.startswith('ENSG')
}

# Compile all protein-coding genes + a few non-protein-coding genes targeted by sgRNA library
# Gene biotypes that are always kept.
kept_gene_types = [
    'protein_coding',
    'TR_V_gene', 'TR_D_gene', 'TR_J_gene', 'TR_C_gene',
    'IG_C_gene', 'IG_J_gene', 'IG_D_gene', 'IG_V_gene',
]
# Individual genes that are kept regardless of biotype.
extra_gene_ids = ['ENSG00000242288', 'ENSG00000272752']

genes_df_subset = genes_df[
    genes_df['gene_type'].isin(kept_gene_types)
    | genes_df['gene_id'].isin(extra_gene_ids)
    | genes_df['gene_id'].isin(non_protein_coding_genes)
].copy()
genes_df_subset = genes_df_subset.reset_index(drop=True)

genes_df_subset.to_pickle('genome/genes_df_subset_for_sgRNA_annotation.pkl')