# Notebook: Merging ANNOVAR output with pre-existing information

In [19]:
import pandas as pd
pd.set_option('display.max_columns', None)

# 1. Invalid Input: refGene

invalid_df -> all samples have pos != end

## 1A. Get subset of samples that were invalid and look for pattern
* full_df == 3706 samples
* same pos == 491 samples
* same pos + end == 284 samples
* same pos + end + ref == 248 samples
* same pos + end + ref + alt == 229 samples -> same as invalid_df

## 1B. Analyse type of invalid samples:
* All samples with type "insertion" are invalid (37)
* All samples with type "duplication" are invalid (187)
* "Microsatellite" type has some invalid samples (5)

## 1C. Analyse ref and alt:
* All invalid samples have alt values with more than one nucleotide
* For "Microsatellite" type, the valide samples only have one nulceotide at "alt", and more than one when they are invalid
* However, there are valid samples from other types (55 Indel and 4 Inversion) that have more than one nucleotide in "alt"

In [35]:
full_df = pd.read_csv("mafalda_files/full_dataset.csv")
print("full_df: ", full_df.shape)
invalid_df = pd.read_csv("/export/scratch2/constellation-data/malafaia/ANNOVAR/files_clinvar/output.refGene.invalid_input", sep='\t', header = None)
print("invalid_df: ", invalid_df.shape)
invalid_df.head()

full_df:  (3706, 14)
invalid_df:  (229, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,19,11089324,11089325,T,TA,,,0
1,19,11089422,11089423,G,GA,,,1
2,19,11089535,11089536,C,CA,,,0
3,19,11089562,11089563,G,GCTTTC,Trp,fs,1
4,19,11089579,11089580,A,ACCGT,Ala,fs,1


In [None]:
invalid_df = invalid_df.rename(columns={
    0: 'chr',
    1: 'pos',
    2: 'end',
    3: 'ref',
    4: 'alt',
    5: 'aa_ref',
    6: 'ClinSigSimple'
})
print(invalid_df.shape)
invalid_df.head()

(229, 8)


Unnamed: 0,chr,pos,end,ref,alt,aa_ref,ClinSigSimple,7
0,19,11089324,11089325,T,TA,,,0
1,19,11089422,11089423,G,GA,,,1
2,19,11089535,11089536,C,CA,,,0
3,19,11089562,11089563,G,GCTTTC,Trp,fs,1
4,19,11089579,11089580,A,ACCGT,Ala,fs,1


In [37]:
invalid_df.loc[invalid_df['pos']!=invalid_df['end']]

Unnamed: 0,chr,pos,end,ref,alt,aa_ref,ClinSigSimple,7
0,19,11089324,11089325,T,TA,,,0
1,19,11089422,11089423,G,GA,,,1
2,19,11089535,11089536,C,CA,,,0
3,19,11089562,11089563,G,GCTTTC,Trp,fs,1
4,19,11089579,11089580,A,ACCGT,Ala,fs,1
...,...,...,...,...,...,...,...,...
224,19,11132531,11132532,C,CA,,,0
225,19,11133385,11133386,T,TC,,,0
226,19,11133511,11133512,T,TTATA,,,0
227,19,11133511,11133512,T,TTATATATA,,,0


In [43]:
full_invalid = full_df.loc[(full_df['pos'].isin(invalid_df['pos'])) & \
                (full_df['end'].isin(invalid_df['end'])) & \
                (full_df['ref'].isin(invalid_df['ref'])) & \
                (full_df['alt'].isin(invalid_df['alt']))
                ]
print(full_invalid.shape)
full_invalid.head()

(229, 14)


Unnamed: 0,allele_id,rs#,gene,chr,pos,type,ONIM,end,ref,alt,ClinSigSimple,aa_ref,aa_change,aa_alt
7,245301,879254400.0,LDLR:3949|LDLR-AS1:115271120,19,11089324,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089325,T,TA,0,,,
67,362663,1057520000.0,LDLR:3949|LDLR-AS1:115271120,19,11089422,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089423,G,GA,1,,,
116,3546109,,LDLR:3949|LDLR-AS1:115271120,19,11089535,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089536,C,CA,0,,,
138,354076,774615500.0,LDLR:3949|LDLR-AS1:115271120,19,11089562,Insertion,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089563,G,GCTTTC,1,Trp,6.0,fs
151,245343,879254400.0,LDLR:3949|LDLR-AS1:115271120,19,11089579,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089580,A,ACCGT,1,Ala,13.0,fs


In [52]:
full_valid = full_df.loc[~full_df.index.isin(full_invalid.index)]
print(full_valid.shape)

(3477, 14)


In [48]:
print("Types available in full dataset: ")
print(full_df['type'].value_counts())
print("\nTypes available in invalid dataset: ")
print(full_invalid['type'].value_counts())

Types available in full dataset: 
type
single_nucleotide_variant    2967
Deletion                      412
Duplication                   187
Indel                          66
Insertion                      37
Microsatellite                 33
Inversion                       4
Name: count, dtype: int64

Types available in invalid dataset: 
type
Duplication       187
Insertion          37
Microsatellite      5
Name: count, dtype: int64


In [64]:
full_valid.loc[(full_valid['alt'].str.len() > 1)]

Unnamed: 0,allele_id,rs#,gene,chr,pos,type,ONIM,end,ref,alt,ClinSigSimple,aa_ref,aa_change,aa_alt
27,245305,879254400.0,LDLR:3949|LDLR-AS1:115271120,19,11089356,Indel,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089363,CTCCTCCT,TG,0,,,
230,914671,2077192000.0,LDLR:3949,19,11100234,Indel,"MONDO:MONDO:0005439,MedGen:C0020445,OMIM:PS143890",11100236,TGC,CGT,1,Cys,27.0,Arg
271,245380,879254400.0,LDLR:3949,19,11100271,Indel,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11100272,GC,AA,1,Cys,39.0,Ter
377,245412,879254400.0,LDLR:3949,19,11102662,Indel,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11102662,A,CT,1,,,
382,245415,879254400.0,LDLR:3949,19,11102666,Indel,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11102674,TCTGTCACC,GACTTCA,1,Ser,65.0,fs
383,1846237,2515955000.0,LDLR:3949,19,11102666,Indel,MedGen:CN230736,11102675,TCTGTCACCT,GGACTTCA,1,Ser,65.0,fs
438,647615,1600705000.0,LDLR:3949,19,11102722,Indel,"MONDO:MONDO:0005439,MedGen:C0020445,OMIM:PS143...",11102722,T,GG,1,Ile,83.0,fs
525,3499360,,LDLR:3949,19,11102786,Indel,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11102787,CG,TC,1,,,
582,245492,879254500.0,LDLR:3949,19,11105230,Indel,"MedGen:CN230736|MONDO:MONDO:0005439,MedGen:C00...",11105231,GT,TC,1,Cys,109.0,Arg
642,2143580,2515965000.0,LDLR:3949,19,11105279,Indel,"MONDO:MONDO:0005439,MedGen:C0020445,OMIM:PS143890",11105289,CAGTTCGTCTG,AAGTGCA,1,Gln,125.0,fs


In [62]:
full_invalid.loc[(full_invalid['alt'].str.len() > 1)]

Unnamed: 0,allele_id,rs#,gene,chr,pos,type,ONIM,end,ref,alt,ClinSigSimple,aa_ref,aa_change,aa_alt
7,245301,8.792544e+08,LDLR:3949|LDLR-AS1:115271120,19,11089324,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089325,T,TA,0,,,
67,362663,1.057520e+09,LDLR:3949|LDLR-AS1:115271120,19,11089422,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089423,G,GA,1,,,
116,3546109,,LDLR:3949|LDLR-AS1:115271120,19,11089535,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089536,C,CA,0,,,
138,354076,7.746155e+08,LDLR:3949|LDLR-AS1:115271120,19,11089562,Insertion,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089563,G,GCTTTC,1,Trp,6.0,fs
151,245343,8.792544e+08,LDLR:3949|LDLR-AS1:115271120,19,11089579,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11089580,A,ACCGT,1,Ala,13.0,fs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3642,332476,3.411354e+07,LDLR:3949,19,11132531,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11132532,C,CA,0,,,
3689,348049,5.458603e+08,LDLR:3949,19,11133385,Duplication,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11133386,T,TC,0,,,
3695,3881293,,LDLR:3949,19,11133511,Microsatellite,MedGen:CN169374,11133512,T,TTATA,0,,,
3696,348051,3.980933e+06,LDLR:3949,19,11133511,Microsatellite,"MONDO:MONDO:0007750,MedGen:C0745103,OMIM:14389...",11133512,T,TTATATATA,0,,,


In [68]:
print("Pos range available in valid dataset: ")
print(full_valid['pos'].describe())
print("\nPos range available in invalid dataset: ")
print(full_invalid['pos'].describe())

Pos range available in valid dataset: 
count    3.477000e+03
mean     1.111233e+07
std      9.975460e+03
min      1.108926e+07
25%      1.110550e+07
50%      1.111328e+07
75%      1.112011e+07
max      1.113370e+07
Name: pos, dtype: float64

Pos range available in invalid dataset: 
count    2.290000e+02
mean     1.111270e+07
std      9.238981e+03
min      1.108932e+07
25%      1.110552e+07
50%      1.111161e+07
75%      1.112019e+07
max      1.113353e+07
Name: pos, dtype: float64


# ANNOVAR Output

In [30]:
df = pd.read_csv('/export/scratch2/constellation-data/malafaia/ANNOVAR/files_clinvar/output.hg38_multianno.csv')
print(df.shape)
df.head()

(3706, 178)


Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,SIFT_score,SIFT_converted_rankscore,SIFT_pred,SIFT4G_score,SIFT4G_converted_rankscore,SIFT4G_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_rankscore,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_rankscore,Polyphen2_HVAR_pred,LRT_score,LRT_converted_rankscore,LRT_pred,LRT_Omega,MutationTaster_score,MutationTaster_converted_rankscore,MutationTaster_pred,MutationAssessor_score,MutationAssessor_rankscore,MutationAssessor_pred,FATHMM_score,FATHMM_converted_rankscore,FATHMM_pred,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST4_score,VEST4_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,Reliability_index,MetaRNN_score,MetaRNN_rankscore,MetaRNN_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,REVEL_score,REVEL_rankscore,MutPred_score,MutPred_rankscore,MVP_score,MVP_rankscore,gMVP_score,gMVP_rankscore,MPC_score,MPC_rankscore,PrimateAI_score,PrimateAI_rankscore,PrimateAI_pred,DEOGEN2_score,DEOGEN2_rankscore,DEOGEN2_pred,BayesDel_addAF_score,BayesDel_addAF_rankscore,BayesDel_addAF_pred,BayesDel_noAF_score,BayesDel_noAF_rankscore,BayesDel_noAF_pred,ClinPred_score,ClinPred_rankscore,ClinPred_pred,LIST-S2_score,LIST-S2_rankscore,LIST-S2_pred,VARITY_R_score,VARITY_R_rankscore,VARITY_ER_score,VARITY_ER_rankscore,VARITY_R_LOO_score,VARITY_R_LOO_rankscore,VARITY_ER_LOO_score,VARITY_ER_LOO_rankscore,ESM1b_score,ESM1b_rankscore,ESM1b_pred,EVE_score,EVE_rankscore,AlphaMissense_score,AlphaMissense_rankscore,AlphaMissense_pred,Aloft_pred,Aloft_Confidence,CADD_raw,CADD_raw_rankscore,CADD_phred,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,fathmm-MKL_coding_group,fathmm-XF_coding_score,fathmm-XF_coding_rankscore,fathmm-XF_coding_pred,Eigen-raw_coding,Eigen-raw_coding_rankscore,Eigen-phred_coding,Eigen-PC-raw_coding,Eigen-PC-raw_coding_rankscore,Eigen-PC-phred_coding,GenoCanyon_score,GenoCanyon_rankscore,integrated_fitCons_score,integrated_fitCons_rankscore,integrated_confidence_value,GM12878_fitCons_score,GM12878_fitCons_rankscore,GM12878_confidence_value,H1-hESC_fitCons_score,H1-hESC_fitCons_rankscore,H1-hESC_confidence_value,HUVEC_fitCons_score,HUVEC_fitCons_rankscore,HUVEC_confidence_value,LINSIGHT,LINSIGHT_rankscore,GERP++_NR,GERP++_RS,GERP++_RS_rankscore,phyloP100way_vertebrate,phyloP100way_vertebrate_rankscore,phyloP470way_mammalian,phyloP470way_mammalian_rankscore,phyloP17way_primate,phyloP17way_primate_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons470way_mammalian,phastCons470way_mammalian_rankscore,phastCons17way_primate,phastCons17way_primate_rankscore,SiPhy_29way_pi,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,bStatistic,bStatistic_converted_rankscore,Interpro_domain,GTEx_V8_eQTL_gene,GTEx_V8_eQTL_tissue,GTEx_V8_sQTL_gene,GTEx_V8_sQTL_tissue,eQTLGen_snp_id,1000g2015aug_all,AF,AF_popmax,AF_male,AF_female,AF_raw,AF_afr,AF_sas,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,Otherinfo1
0,19,11089263,11089263,C,G,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1
1,19,11089281,11089281,G,T,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0.00898562,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0
2,19,11089309,11089311,CAG,C,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0
3,19,11089318,11089319,AC,A,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,0
4,19,11089318,11089458,ACGGGTTAAAAAGCCGATGTCACATCGGCCGTTCGAAACTCCTCCT...,A,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1


In [31]:
for i in df.columns: print(i)

Chr
Start
End
Ref
Alt
Func.refGene
Gene.refGene
GeneDetail.refGene
ExonicFunc.refGene
AAChange.refGene
SIFT_score
SIFT_converted_rankscore
SIFT_pred
SIFT4G_score
SIFT4G_converted_rankscore
SIFT4G_pred
Polyphen2_HDIV_score
Polyphen2_HDIV_rankscore
Polyphen2_HDIV_pred
Polyphen2_HVAR_score
Polyphen2_HVAR_rankscore
Polyphen2_HVAR_pred
LRT_score
LRT_converted_rankscore
LRT_pred
LRT_Omega
MutationTaster_score
MutationTaster_converted_rankscore
MutationTaster_pred
MutationAssessor_score
MutationAssessor_rankscore
MutationAssessor_pred
FATHMM_score
FATHMM_converted_rankscore
FATHMM_pred
PROVEAN_score
PROVEAN_converted_rankscore
PROVEAN_pred
VEST4_score
VEST4_rankscore
MetaSVM_score
MetaSVM_rankscore
MetaSVM_pred
MetaLR_score
MetaLR_rankscore
MetaLR_pred
Reliability_index
MetaRNN_score
MetaRNN_rankscore
MetaRNN_pred
M-CAP_score
M-CAP_rankscore
M-CAP_pred
REVEL_score
REVEL_rankscore
MutPred_score
MutPred_rankscore
MVP_score
MVP_rankscore
gMVP_score
gMVP_rankscore
MPC_score
MPC_rankscore
PrimateA

In [32]:
cols_input = ['Chr', 'Start', 'End', 'Ref', 'Alt']
cols_other = ['Func.refGene', 'Gene.refGene']
cols_af = df.filter(regex='^AF').columns.tolist()
cols_1kgp = df.filter(regex='1000g').columns.tolist()
cols_gnomad = df.filter(regex='gnomad').columns.tolist() # nothing
cols_revel = df.filter(regex='REVEL').columns.tolist()
cols_metarnn = df.filter(regex='MetaRNN').columns.tolist()
cols_sift = df.filter(regex='SIFT_').columns.tolist()
cols_mtaster = df.filter(regex='MutationTaster').columns.tolist()
cols_provean = df.filter(regex='Provean').columns.tolist()

cols = cols_input + cols_other + cols_af + cols_1kgp + cols_gnomad + cols_revel + cols_metarnn + cols_sift + cols_mtaster + cols_provean
df_useful = df[cols]
print(df_useful.shape)
df_useful.head()

(3706, 32)


Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,AF,AF_popmax,AF_male,AF_female,AF_raw,AF_afr,AF_sas,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,1000g2015aug_all,REVEL_score,REVEL_rankscore,MetaRNN_score,MetaRNN_rankscore,MetaRNN_pred,SIFT_score,SIFT_converted_rankscore,SIFT_pred,MutationTaster_score,MutationTaster_converted_rankscore,MutationTaster_pred
0,19,11089263,11089263,C,G,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
1,19,11089281,11089281,G,T,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,0.00898562,.,.,.,.,.,.,.,.,.,.,.
2,19,11089309,11089311,CAG,C,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
3,19,11089318,11089319,AC,A,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
4,19,11089318,11089458,ACGGGTTAAAAAGCCGATGTCACATCGGCCGTTCGAAACTCCTCCT...,A,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.


In [33]:
for i in df_useful.columns:
    print(f"{i}: {df_useful[i].loc[df_useful[i]=='.'].shape[0]}")

Chr: 0
Start: 0
End: 0
Ref: 0
Alt: 0
Func.refGene: 229
Gene.refGene: 229
AF: 2923
AF_popmax: 2951
AF_male: 2923
AF_female: 2923
AF_raw: 2923
AF_afr: 2923
AF_sas: 2923
AF_amr: 2923
AF_eas: 2923
AF_nfe: 2923
AF_fin: 2923
AF_asj: 2923
AF_oth: 2923
1000g2015aug_all: 3500
REVEL_score: 2225
REVEL_rankscore: 2225
MetaRNN_score: 2171
MetaRNN_rankscore: 2171
MetaRNN_pred: 2171
SIFT_score: 2173
SIFT_converted_rankscore: 2173
SIFT_pred: 2173
MutationTaster_score: 1856
MutationTaster_converted_rankscore: 1856
MutationTaster_pred: 1856


In [28]:
df_useful['Func.refGene'].value_counts()

Func.refGene
exonic                   2685
intronic                  385
.                         229
ncRNA_exonic              155
splicing                  129
UTR3                      112
exonic;splicing             6
ncRNA_exonic;splicing       5
Name: count, dtype: int64

In [34]:
df_useful.loc[df_useful['AF']=='.']

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,AF,AF_popmax,AF_male,AF_female,AF_raw,AF_afr,AF_sas,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,1000g2015aug_all,REVEL_score,REVEL_rankscore,MetaRNN_score,MetaRNN_rankscore,MetaRNN_pred,SIFT_score,SIFT_converted_rankscore,SIFT_pred,MutationTaster_score,MutationTaster_converted_rankscore,MutationTaster_pred
0,19,11089263,11089263,C,G,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
1,19,11089281,11089281,G,T,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,0.00898562,.,.,.,.,.,.,.,.,.,.,.
2,19,11089309,11089311,CAG,C,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
3,19,11089318,11089319,AC,A,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
4,19,11089318,11089458,ACGGGTTAAAAAGCCGATGTCACATCGGCCGTTCGAAACTCCTCCT...,A,ncRNA_exonic,LDLR-AS1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,19,11133598,11133598,G,C,UTR3,LDLR,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
3702,19,11133666,11133666,C,G,UTR3,LDLR,.,.,.,.,.,.,.,.,.,.,.,.,.,0.000199681,.,.,.,.,.,.,.,.,.,.,.
3703,19,11133681,11133681,C,T,UTR3,LDLR,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
3704,19,11133682,11133682,G,T,UTR3,LDLR,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
