# Processing genomics VCF output from vcftools with cyvcf2

### Imports

In [None]:
from cyvcf2 import VCF
import pandas as pd
from tqdm import tqdm

### Data loading

In [4]:
# Open the VCF; iterating `opened_file` in later cells streams its variant records.
opened_file = VCF("out.recode.vcf")
# Sample IDs in column order, taken from the reader itself rather than by
# hand-splitting the raw "#CHROM ..." header line (the old `[-2]` lookup only
# worked when the header text ended with a trailing newline).
sample_id_list = list(opened_file.samples)

### Data processing

Build a genotype table with one row per variant and one column per sample, labelling each variant with a readable `chr<CHROM>_bp<POS>` identifier.

In [5]:
# Build a variants x samples table of per-sample genotype codes.
#
# Each row is one VCF record, indexed by a "chr<CHROM>_bp<POS>" label; each
# column is one sample from `sample_id_list`.
# NOTE(review): with cyvcf2's default settings `gt_types` is presumably coded
# 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT — confirm against the cyvcf2 docs.
# NOTE(review): iterating `opened_file` presumably consumes the reader, so
# re-run the data-loading cell before re-running this one — confirm.
variant_ids = []
genotype_rows = []
# No `total=` is passed: the previously hard-coded record count (306282) did
# not match the number of records actually read, so the bar never completed.
for variant in tqdm(opened_file):
    variant_ids.append(f"chr{variant.CHROM}_bp{variant.POS}")
    genotype_rows.append(variant.gt_types.tolist())

# Construct the frame with its index in one step instead of mutating it in
# place afterwards, so the cell never exposes a half-built `edited_df`.
edited_df = pd.DataFrame(
    genotype_rows,
    index=pd.Index(variant_ids, name="chr_bp_pos"),
    columns=sample_id_list,
)

100%|█████████▉| 305111/306282 [03:56<00:00, 1290.34it/s]


In [9]:
# Preview the first rows of the variant-by-sample genotype table.
edited_df.head()

Unnamed: 0_level_0,203990550002_R01C01,203990550002_R01C02,203990550002_R02C01,203990550002_R02C02,203990550002_R03C02,203990550002_R08C02,203990550002_R10C01,203990550002_R10C02,203990550002_R11C01,203990550002_R11C02,...,205144340157_R07C01,205144340157_R07C02,205144340157_R08C01,205144340157_R08C02,205144340157_R09C01,205144340157_R09C02,205144340157_R10C01,205144340157_R10C02,205144340157_R11C01,205144340157_R11C02
chr_bp_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_bp58814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_bp727841,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
chr1_bp752721,3,1,0,1,0,3,3,3,1,1,...,3,3,1,3,0,3,3,1,3,1
chr1_bp759036,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_bp794332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Flip the table to samples x variants: one row per sample ID, one column per
# chr_bp_pos label.
transposed_df = edited_df.transpose()

In [11]:
# Preview the first rows after transposing (rows are now sample IDs).
transposed_df.head()

chr_bp_pos,chr1_bp58814,chr1_bp727841,chr1_bp752721,chr1_bp759036,chr1_bp794332,chr1_bp838555,chr1_bp840753,chr1_bp846808,chr1_bp854250,chr1_bp861808,...,chrMT_bp10550,chrMT_bp11251,chrMT_bp11467,chrMT_bp11914,chrMT_bp12308,chrMT_bp12705,chrMT_bp15043,chrMT_bp15452,chrMT_bp15924,chrMT_bp15928
203990550002_R01C01,0,0,3,0,0,1,1,1,1,3,...,0,0,3,0,3,0,0,0,0,0
203990550002_R01C02,0,0,1,1,0,0,0,0,0,3,...,0,0,3,0,3,0,0,0,0,0
203990550002_R02C01,0,0,0,0,0,2,1,1,3,0,...,0,0,0,0,0,3,0,0,0,0
203990550002_R02C02,0,0,1,0,0,1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
203990550002_R03C02,0,1,0,0,0,1,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# (n_samples, n_variants): rows are samples and columns are variants after the transpose.
transposed_df.shape

(2863, 305111)

In [13]:
# Persist the sample-by-variant table to a Parquet file in the working directory.
# NOTE(review): the saved output of this cell shows an s3:// path, which this
# local write would not produce — presumably left over from an earlier version
# of the cell; confirm and re-run.
transposed_df.to_parquet("genomics_processed.parquet")

{'paths': ['s3://enveda-data-dx/genomics/CCF_SPARC_gx_array_transposed.parquet'],
 'partitions_values': {}}