# Annotating genomics data

### Imports

In [None]:
from sparc_multiomics.utils import get_gene_name
import pandas as pd
from tqdm import tqdm

### Data loading

In [2]:
# Load the processed genomics table and keep a handle on its column index,
# which the annotation loop iterates over.
genomics_data = pd.read_parquet("genomics_processed.parquet")
genomics_data_columns = genomics_data.columns

INFO:aiobotocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


### Annotating the data

Iterate through the feature columns and annotate each one with its gene name(s), looked up by chromosome and base-pair position.

In [4]:
# Map each original feature column to "<gene name(s)>_<original column>".
# Columns that cannot be parsed, resolve to no gene name, or whose lookup
# fails are left out of the mapping.
annotated_genomics_columns = {}
# Columns whose lookup raised; kept so failures can be inspected afterwards
# instead of being indistinguishable from "no gene found".
failed_annotation_columns = []
for current_column in tqdm(genomics_data_columns):
    # Feature columns look like "chr1_bp794332". Anything that does not split
    # into exactly two "_"-separated parts with an integer after the
    # 2-character position prefix (e.g. "sample_id") is skipped explicitly.
    try:
        current_chromosome, current_position = current_column.split("_")
        bp_position = int(current_position[2:])
    except (AttributeError, ValueError):
        continue

    try:
        possible_names = get_gene_name(
            chromosome=current_chromosome[3:],  # drop the 3-character "chr" prefix
            bp_position=bp_position,
            release_number=110,  # NOTE(review): presumably the annotation database release; confirm
        )
        # Empty strings carry no annotation, so they are discarded.
        possible_names = [name for name in possible_names if name != ""]
        if possible_names:
            # Several names are joined with "-"; a single name is used as is
            # (joining a one-element list returns that element unchanged).
            annotated_genomics_columns[current_column] = (
                "-".join(possible_names) + "_" + current_column
            )
    except Exception:
        # Best-effort annotation: one failing lookup must not abort the whole
        # run. Catching `Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop this long loop.
        failed_annotation_columns.append(current_column)

if failed_annotation_columns:
    print(
        "Gene name lookup failed for {} columns (see failed_annotation_columns)".format(
            len(failed_annotation_columns)
        )
    )

100%|██████████| 305112/305112 [02:46<00:00, 1835.77it/s]


In [5]:
# Report how many feature columns received a gene annotation.
print(f"Successfully annotated {len(annotated_genomics_columns)} columns")
# The denominator drops one column from the total, presumably the
# non-feature "sample_id" column.
print(
    f"This is {len(annotated_genomics_columns) / (len(genomics_data_columns) - 1) * 100}"
    "% of the total number of columns"
)

Successfully annotated 166916 columns
This is 54.70664774459131% of the total number of columns


In [6]:
# Keep the sample identifier plus every column that received an annotation,
# rename the annotated columns to their gene-prefixed names, and persist the
# result.
usable_columns = ["sample_id", *annotated_genomics_columns]
annotated_genomics_data = genomics_data[usable_columns].rename(
    columns=annotated_genomics_columns
)
annotated_genomics_data.to_parquet("genomics_annotated.parquet")

In [7]:
# Preview the first five rows of the annotated table (head() defaults to 5).
annotated_genomics_data.head()

chr_bp_pos,sample_id,LINC01409_chr1_bp794332,LINC01128_chr1_bp838555,LINC01128_chr1_bp840753,LINC01128_chr1_bp846808,LINC01128_chr1_bp854250,LINC02593_chr1_bp917640,LINC02593_chr1_bp918573,LINC02593_chr1_bp919419,LINC02593_chr1_bp919501,...,MT-ND4L_chrMT_bp10550,MT-ND4_chrMT_bp11251,MT-ND4_chrMT_bp11467,MT-ND4_chrMT_bp11914,MT-TL2_chrMT_bp12308,MT-ND5_chrMT_bp12705,MT-CYB_chrMT_bp15043,MT-CYB_chrMT_bp15452,MT-TT_chrMT_bp15924,MT-TT_chrMT_bp15928
0,203990550002_R01C01,0,1,1,1,1,0,1,3,1,...,0,0,3,0,3,0,0,0,0,0
1,203990550002_R01C02,0,0,0,0,0,0,3,3,3,...,0,0,3,0,3,0,0,0,0,0
2,203990550002_R02C01,0,2,1,1,3,0,0,1,3,...,0,0,0,0,0,3,0,0,0,0
3,203990550002_R02C02,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,203990550002_R03C02,0,1,1,0,0,0,3,3,3,...,0,0,0,0,0,0,0,0,0,0
