In [1]:
import os
import sys

In [2]:
import pandas as pd
import polars as pl
import seaborn as sns
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen
from matplotlib import pyplot as plt

In [3]:
!ls data

copy_number_somatic_mutation.BRCA-EU.tsv
counts
reference_genomes
sample.BRCA-EU.tsv
simple_somatic_mutation.open.BRCA-EU.parquet
simple_somatic_mutation.open.BRCA-EU.tsv
simple_somatic_mutation_renamed.parquet
simple_somatic_mutation_sbs.parquet
simple_somatic_mutation_sbs.tsv
specimen.BRCA-EU.tsv
structural_somatic_mutation.BRCA-EU.tsv


In [15]:
# pl.scan_parquet(
#     "data/simple_somatic_mutation.open.BRCA-EU.parquet"
# ).rename(
#     {"icgc_sample_id": "analyzed_sample_id"}
# ).sink_parquet("data/simple_somatic_mutation_renamed.parquet")

### This is how a smaller SBS-only file was created; don't rerun

In [None]:
pl.scan_csv(
    "data/counts/simple_somatic_mutation.open.BRCA-EU.tsv", 
    separator="\t", 
    infer_schema_length=10_000, 
    low_memory=True
).filter(
    pl.col("mutation_type") == "single base substitution"
).collect(
    streaming=True
).write_csv(
    "data/simple_somatic_mutation.open.BRCA-EU.sbs_only.tsv",
    separator="\t"
)

### Keep only necessary cols

In [13]:
needed_cols = ['icgc_mutation_id',
  'icgc_donor_id',
  'icgc_sample_id',
  'matched_icgc_sample_id',
  'submitted_sample_id',
  'submitted_matched_sample_id',
  'chromosome',
  'chromosome_start',
  'chromosome_end',
  'chromosome_strand',
  'assembly_version',
  'mutation_type',
  'reference_genome_allele',
  'mutated_from_allele',
  'mutated_to_allele',
  'total_read_count',
  'mutant_allele_read_count',
  'consequence_type',
  'aa_mutation',
  'cds_mutation',
  'gene_affected',
  'transcript_affected',
  'gene_build_version',
  'seq_coverage',
  'raw_data_repository',
  'raw_data_accession',
]

In [18]:
pl.scan_csv(
    "./sigprof/input/simple_somatic_mutation_sbs.tsv", separator="\t", infer_schema_length=10_000
).rename(
    {"icgc_sample_id": "analyzed_sample_id"}
).sink_csv(
    "./sigprof/input/simple_somatic_mutation_sbs_renamed.tsv", separator="\t"
)

### Try to extract count matrices using out-of-the-box functions from SigProfiler

In [6]:
df = pl.read_csv("data/counts/simple_somatic_mutation.open.BRCA-EU.tsv", separator="\t", infer_schema_length=10_000, low_memory=True)
df.head()

icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele,quality_score,probability,total_read_count,mutant_allele_read_count,verification_status,verification_platform,biological_validation_status,biological_validation_platform,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version,platform,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str
"""MU55281056""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,,,"""tested and ver…","""capillary sequ…","""tested and val…","""capillary sequ…","""intergenic_reg…",,,,,75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""CaVEMan http:/…",,,"""EGA""","""EGAS0000100119…",
"""MU55281056""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,57.0,6.0,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU50780316""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""3""",19778034,19778034,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,84.0,5.0,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64.0,10.0,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000053208…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64.0,10.0,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",


In [27]:
df.filter(pl.col("mutated_to_allele") == "-")

icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele,quality_score,probability,total_read_count,mutant_allele_read_count,verification_status,verification_platform,biological_validation_status,biological_validation_platform,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version,platform,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str
"""MU63946620""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",7283964,7283964,1,"""GRCh37""","""deletion of <=…","""C""","""C""","""-""",,,,,"""not tested""",,"""not tested""",,"""intron_variant…",,,"""ENSG0000017173…","""ENST0000030363…",75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""EGAS0000100119…",
"""MU63946620""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",7283964,7283964,1,"""GRCh37""","""deletion of <=…","""C""","""C""","""-""",,,,,"""not tested""",,"""not tested""",,"""downstream_gen…",,,"""ENSG0000020705…","""ENST0000038432…",75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""EGAS0000100119…",
"""MU63946620""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",7283964,7283964,1,"""GRCh37""","""deletion of <=…","""C""","""C""","""-""",,,,,"""not tested""",,"""not tested""",,"""intron_variant…",,,"""ENSG0000017173…","""ENST0000043941…",75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""EGAS0000100119…",
"""MU63946620""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",7283964,7283964,1,"""GRCh37""","""deletion of <=…","""C""","""C""","""-""",,,54,10,"""not tested""",,,,"""intron_variant…",,,"""ENSG0000017173…","""ENST0000030363…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63946620""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",7283964,7283964,1,"""GRCh37""","""deletion of <=…","""C""","""C""","""-""",,,54,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000020705…","""ENST0000038432…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63946620""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",7283964,7283964,1,"""GRCh37""","""deletion of <=…","""C""","""C""","""-""",,,54,10,"""not tested""",,,,"""intron_variant…",,,"""ENSG0000017173…","""ENST0000043941…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63946631""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",17633912,17633914,1,"""GRCh37""","""deletion of <=…","""TAT""","""TAT""","""-""",,,,,"""not tested""",,"""not tested""",,"""intergenic_reg…",,,,,75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""EGAS0000100119…",
"""MU63946631""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",17633912,17633914,1,"""GRCh37""","""deletion of <=…","""TAT""","""TAT""","""-""",,,,,"""not tested""",,"""not tested""",,"""upstream_gene_…",,,"""ENSG0000015933…","""ENST0000037545…",75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""EGAS0000100119…",
"""MU63946631""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",17633912,17633914,1,"""GRCh37""","""deletion of <=…","""TAT""","""TAT""","""-""",,,,,"""not tested""",,"""not tested""",,"""upstream_gene_…",,,"""ENSG0000015933…","""ENST0000037544…",75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""EGAS0000100119…",
"""MU63946631""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",17633912,17633914,1,"""GRCh37""","""deletion of <=…","""TAT""","""TAT""","""-""",,,60,8,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",


In [20]:
(df["mutated_from_allele"] == df["reference_genome_allele"]).sum(), df.shape

(13831513, (13831513, 42))

In [25]:
sigprof_matrices = matGen.SigProfilerMatrixGeneratorFunc(
    project="sigprof", 
    reference_genome="GRCh37",
    path_to_input_files="./sigprof/"
)

The given input files do not appear to be in the correct ICGC format.
Starting matrix generation for SNVs and DINUCs...Completed! Elapsed time: 1.66 seconds.
Matrices generated for 1 samples with 0 errors. Total of 7299 SNVs, 8 DINUCs, and 0 INDELs were successfully analyzed.


In [26]:
m = sigprof_matrices["96"]

In [27]:
m

Unnamed: 0_level_0,DO218489
MutationType,Unnamed: 1_level_1
A[C>A]A,49
A[C>A]C,156
A[C>A]G,5
A[C>A]T,76
A[C>G]A,42
...,...
T[T>C]T,119
T[T>G]A,66
T[T>G]C,70
T[T>G]G,34


### Try to correctly use Alexandrov's tool for each patient subframe

In [4]:
df = pl.read_csv(
    "data/counts/simple_somatic_mutation.open.BRCA-EU.tsv",
    separator="\t",
    infer_schema_length=10_000
)

In [5]:
df.shape

(13831513, 42)

In [6]:
df["icgc_donor_id"].n_unique()

569

In [4]:
for patient, subdf in df.partition_by("icgc_donor_id", as_dict=True):
    subdf.write_csv(f"tmp/simple_somatic_mutation.open.BRCA-EU-{patient}.tsv", separator="\t")

AttributeError: 'LazyFrame' object has no attribute 'partition_by'

In [5]:
?matGen.SigProfilerMatrixGeneratorFunc

[0;31mSignature:[0m
[0mmatGen[0m[0;34m.[0m[0mSigProfilerMatrixGeneratorFunc[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mproject[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreference_genome[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath_to_input_files[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexome[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbed_file[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchrom_based[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mplot[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtsb_stat[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mseqInfo[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcushion[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgs[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvolume[0m[0;34m=[0m[0;32mNone[0m[0

In [14]:
# mutation_df = pl.read_csv("data/simple_somatic_mutation.open.BRCA-EU.tsv", separator="\t", low_memory=True, infer_schema_length=10000)
# sbs_mutation_df = pl.read_parquet("data/simple_somatic_mutation_sbs.parquet", low_memory=True)
sbs_mutation_df = pl.read_csv(
    "data/simple_somatic_mutation_sbs.tsv", 
    separator="\t", low_memory=True, infer_schema_length=10_000
).select(needed_cols)
sbs_mutation_df.head()
# sample_df = pd.read_csv("data/sample.BRCA-EU.tsv", sep="\t")

icgc_mutation_id,icgc_donor_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele,total_read_count,mutant_allele_read_count,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version,seq_coverage,raw_data_repository,raw_data_accession
str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,i64,i64,str,str,str,str,str,i64,str,str,str
"""MU55281056""","""DO218489""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,"""intergenic_reg…",,,,,75,,"""EGA""","""EGAS0000100119…"
"""MU55281056""","""DO218489""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",57.0,6.0,"""intergenic_reg…",,,,,75,,,"""FI36651:FI3665…"
"""MU50780316""","""DO218489""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""3""",19778034,19778034,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",84.0,5.0,"""intergenic_reg…",,,,,75,,,"""FI36651:FI3665…"
"""MU63478103""","""DO218489""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",64.0,10.0,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000053208…",75,,,"""FI36651:FI3665…"
"""MU63478103""","""DO218489""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",64.0,10.0,"""intergenic_reg…",,,,,75,,,"""FI36651:FI3665…"


In [6]:
sbs_mutation_df.shape

(7299, 42)

In [7]:
(sbs_mutation_df["reference_genome_allele"] != sbs_mutation_df["mutated_from_allele"]).sum()

0

In [9]:
sbs_mutation_df["assembly_version"].unique()

assembly_version
str
"""GRCh37"""


In [24]:
sbs_mutation_df.filter(pl.col("biological_validation_platform").is_null())

icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele,quality_score,probability,total_read_count,mutant_allele_read_count,verification_status,verification_platform,biological_validation_status,biological_validation_platform,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version,platform,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str
"""MU55281056""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,57,6,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU50780316""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""3""",19778034,19778034,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,84,5,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000053208…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000033611…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000036649…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""upstream_gene_…",,,"""ENSG0000017753…","""ENST0000031874…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000039182…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000034806…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64,10,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000039182…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",


In [None]:
needed_cols = ['icgc_mutation_id',
  'icgc_donor_id',
  'icgc_sample_id',
  'matched_icgc_sample_id',
  'submitted_sample_id',
  'submitted_matched_sample_id',
  'chromosome',
  'chromosome_start',
  'chromosome_end',
  'chromosome_strand',
  'assembly_version',
  'mutation_type',
  'reference_genome_allele',
  'mutated_from_allele',
  'mutated_to_allele',
  'total_read_count',
  'mutant_allele_read_count',
  'consequence_type',
  'aa_mutation',
  'cds_mutation',
  'gene_affected',
  'transcript_affected',
  'gene_build_version',
  'seq_coverage',
  'raw_data_repository',
  'raw_data_accession',
]

In [5]:
mutation_df.head()

icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele,quality_score,probability,total_read_count,mutant_allele_read_count,verification_status,verification_platform,biological_validation_status,biological_validation_platform,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version,platform,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str
"""MU55281056""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,,,"""tested and ver…","""capillary sequ…","""tested and val…","""capillary sequ…","""intergenic_reg…",,,,,75,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""CaVEMan http:/…",,,"""EGA""","""EGAS0000100119…",
"""MU55281056""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""15""",24985486,24985486,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,57.0,6.0,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU50780316""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""3""",19778034,19778034,1,"""GRCh37""","""single base su…","""G""","""G""","""A""",,,84.0,5.0,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64.0,10.0,"""not tested""",,,,"""downstream_gen…",,,"""ENSG0000016271…","""ENST0000053208…",75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",
"""MU63478103""","""DO218489""","""BRCA-EU""","""SP117710""","""SA543682""","""SA545080""","""PD8623a""","""PD8623b""","""1""",247616560,247616560,1,"""GRCh37""","""single base su…","""C""","""C""","""T""",,,64.0,10.0,"""not tested""",,,,"""intergenic_reg…",,,,,75,"""Illumina HiSeq…",,"""WGS""",,,"""PCAWG Consensu…",,,,"""FI36651:FI3665…",


In [7]:
mutation_df.describe()

describe,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele,quality_score,probability,total_read_count,mutant_allele_read_count,verification_status,verification_platform,biological_validation_status,biological_validation_platform,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version,platform,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str
"""count""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""",13831513.0,13831513.0,13831513.0,"""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""",13831513.0,13831513.0,"""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""",13831513.0,"""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513""","""13831513"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,0.0,0.0,"""0""","""0""","""0""","""0""","""0""","""13831513""","""13831513""",12489985.0,12489973.0,"""0""","""2588738""","""1344247""","""2588738""","""24""","""13691374""","""13698356""","""2059666""","""2060455""",24.0,"""0""","""13831513""","""0""","""13831513""","""1344247""","""0""","""13831513""","""13831513""","""1344247""","""0""","""13831513"""
"""mean""",,,,,,,,,,80636000.0,80636000.0,1.0,,,,,,,,54.073377,12.167813,,,,,,,,,,75.0,,,,,,,,,,,
"""std""",,,,,,,,,,57104000.0,57104000.0,0.0,,,,,,,,23.576048,10.564501,,,,,,,,,,0.0,,,,,,,,,,,
"""min""","""MU1000744""","""DO217786""","""BRCA-EU""","""SP116946""","""SA542425""","""SA544319""","""PD10010a""","""PD10010b""","""1""",421.0,421.0,1.0,"""GRCh37""","""deletion of <=…","""-""","""-""","""-""",,,3.0,1.0,"""not tested""","""capillary sequ…","""not tested""","""capillary sequ…","""3_prime_UTR_va…","""*101*""","""100000C>G""","""ENSG0000000000…","""ENST0000000023…",75.0,"""Illumina GA se…",,"""WGS""",,"""BWA v0.5.9 htt…","""CaVEMan http:/…",,,"""EGA""","""EGAS0000100119…",
"""25%""",,,,,,,,,,34501260.0,34501260.0,1.0,,,,,,,,41.0,6.0,,,,,,,,,,75.0,,,,,,,,,,,
"""50%""",,,,,,,,,,70179293.0,70179293.0,1.0,,,,,,,,50.0,9.0,,,,,,,,,,75.0,,,,,,,,,,,
"""75%""",,,,,,,,,,117709201.0,117709201.0,1.0,,,,,,,,62.0,14.0,,,,,,,,,,75.0,,,,,,,,,,,
"""max""","""MU9993472""","""DO225398""","""BRCA-EU""","""SP135175""","""SA570821""","""SA570822""","""PD9847a""","""PD9847b""","""Y""",249239538.0,249239538.0,1.0,"""GRCh37""","""single base su…","""TTTTTTTTTTTTTT…","""TTTTTTTTTTTTTT…","""TTTTTTTTCA""",,,917.0,561.0,"""tested and ver…","""capillary sequ…","""tested and val…","""capillary sequ…","""upstream_gene_…","""YVMGGVAM72""","""9T>C""","""ENSG0000027349…","""ENST0000061028…",75.0,"""Illumina HiSeq…",,"""WGS""",,"""BWA v0.5.9 htt…","""Pindel version…",,,"""EGA""","""FI9995:FI9994""",


In [8]:
mutation_df.columns

['icgc_mutation_id',
 'icgc_donor_id',
 'project_code',
 'icgc_specimen_id',
 'icgc_sample_id',
 'matched_icgc_sample_id',
 'submitted_sample_id',
 'submitted_matched_sample_id',
 'chromosome',
 'chromosome_start',
 'chromosome_end',
 'chromosome_strand',
 'assembly_version',
 'mutation_type',
 'reference_genome_allele',
 'mutated_from_allele',
 'mutated_to_allele',
 'quality_score',
 'probability',
 'total_read_count',
 'mutant_allele_read_count',
 'verification_status',
 'verification_platform',
 'biological_validation_status',
 'biological_validation_platform',
 'consequence_type',
 'aa_mutation',
 'cds_mutation',
 'gene_affected',
 'transcript_affected',
 'gene_build_version',
 'platform',
 'experimental_protocol',
 'sequencing_strategy',
 'base_calling_algorithm',
 'alignment_algorithm',
 'variation_calling_algorithm',
 'other_analysis_algorithm',
 'seq_coverage',
 'raw_data_repository',
 'raw_data_accession',
 'initial_data_release_date']

In [9]:
mutation_df["icgc_donor_id"].n_unique()

569

In [10]:
[x for x in mutation_df["mutation_type"].unique()]

['single base substitution',
 'insertion of <=200bp',
 'deletion of <=200bp',
 'multiple base substitution (>=2bp and <=200bp)']

In [6]:
mutation_df.write_parquet("data/simple_somatic_mutation.open.BRCA-EU.parquet")

In [None]:
mutation_df.filter(pl.col("mutation_type") == "single base substitution").write_csv("data/simple_somatic_mutation_sbs.tsv", separator="\t")

In [None]:
mutation_df.filter(pl.col("mutation_type") == "single base substitution").write_parquet("data/simple_somatic_mutation_sbs.parquet")