#0 - Basic Settings

In [None]:
#Permission to access some file from Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Increasing the capacity to view columns and rows.
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

#1 Reading the database with the 33 cancer types up to the *Uniprot* attribute

The databases of the 33 cancer types up to the **Uniprot** attribute were integrated in the notebook **Integra33TecidosparaBase_ate_Uniprot** where the **Base_33Tecidos_Ate_Uniprot__Missense_clean.csv** base was generated. The base and the notebook are in the **TratamentosFinais_Base_33_Artigo_Base** folder.

This notebook will add the following attributes to the base:
- Blosum62
- Amino acid classification group
- Change of the amino acid group
- Essential amino acid
- Change of amino acid essentiality
- Substitution

In [None]:
import pandas as pd
df_33 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean.csv", sep='\t')

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 54 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

#36 - Generating the blosum62 attribute

In [None]:
import pandas as pd

# Read the ";"-separated blosum62.csv; index_col=0 turns the first CSV column
# into the row labels, so a score can be fetched with .loc[row_label, column_label].
base_blosum = pd.read_csv("blosum62.csv",sep=';', index_col=0)

In [None]:
base_blosum.head(22)

Unnamed: 0,Ala,Arg,Asn,Asp,Cys,Gln,Glu,Gly,His,Ile,Leu,Lys,Met,Phe,Pro,Ser,Thr,Trp,Tyr,Val,Asx
Ala,4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0,-2
Arg,-1,5,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3,-1
Asn,-2,0,6,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3,3
Asp,-2,-2,1,6,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3,4
Cys,0,-3,-3,-3,9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3
Gln,-1,1,0,0,-3,5,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2,0
Glu,-1,0,0,2,-4,2,5,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2,1
Gly,0,-2,0,-1,-3,-2,-2,6,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3,-1
His,-2,0,1,-1,-3,0,0,-2,8,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3,0
Ile,-1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3,-3


In [None]:
def categories_column(df, cols=('aminBefore', 'aminAfter')):
    """Print how often each distinct value occurs in the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : iterable of str, optional
        Columns to summarise. Defaults to the two amino-acid columns, so
        ``categories_column(df)`` behaves exactly as before.

    Returns
    -------
    None
        For each column the name and a ``{value: count}`` dict built from
        ``value_counts()`` are printed, followed by blank lines.
    """
    for col in cols:
        counts = df[col].value_counts().to_dict()
        print(col, counts)
        print('\n')

categories_column(df_33)

aminBefore {'Arg': 132686, 'Ala': 43463, 'Glu': 28448, 'Ser': 26158, 'Pro': 24708, 'Val': 24482, 'Gly': 23935, 'Asp': 19720, 'Thr': 19092, 'Leu': 8179, 'Ile': 5625, 'Met': 5135, 'Lys': 4882, 'His': 4493, 'Asn': 4132, 'Gln': 3494, 'Tyr': 3493, 'Phe': 2780, 'Cys': 2160, 'Trp': 730, 'Ter': 2}


aminAfter {'His': 37843, 'Gln': 37779, 'Cys': 37603, 'Leu': 31463, 'Lys': 27695, 'Thr': 26473, 'Val': 26053, 'Met': 24370, 'Trp': 23257, 'Ser': 21225, 'Asn': 20488, 'Ile': 20184, 'Arg': 12664, 'Phe': 10186, 'Tyr': 6657, 'Glu': 6576, 'Asp': 6038, 'Ala': 4700, 'Gly': 3785, 'Pro': 2455, '.': 303}




In [None]:
def get_peso(linha_coluna, df):
  """Return the score stored in ``df`` for a "before,after" pair.

  Parameters
  ----------
  linha_coluna : str
      Two labels joined by a comma; the first is used as the row label
      and the second as the column label of ``df``.
  df : pandas.DataFrame
      Score table addressed by label on both axes.

  Returns
  -------
  The cell ``df.loc[row, column]``, or the string "-" when the pair
  contains "Ter", "*" or "." (such pairs are also printed).
  """
  marcadores_sem_peso = ("Ter", "*", ".")
  for marcador in marcadores_sem_peso:
    if marcador in linha_coluna:
      # Not looked up: echo the pair and flag it with the "-" placeholder.
      print(linha_coluna)
      return "-"

  partes = linha_coluna.split(",")
  linha, coluna = partes[0], partes[1]
  return df.loc[linha, coluna]

In [None]:
# Create the Blosum62 field: aminBefore and aminAfter are joined as "before,after"
# and each pair is handed to get_peso, which receives base_blosum through its
# `df` keyword. Pairs that get_peso rejects get the "-" placeholder instead of a score.
df_33["Blosum62"] = (df_33["aminBefore"] + "," + df_33["aminAfter"]).apply(get_peso,df=base_blosum)

Met,.
Met,.
Ter,Ser
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Ter,Ser
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.
Met,.


In [None]:
df_33['Blosum62'].value_counts()

1     102085
0      97477
-3     76576
-2     43828
-1     40304
3      14250
2      12972
-        305
Name: Blosum62, dtype: int64

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 55 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

##36.1 Generating an intermediate file with the df_33 database with the *Blosum62* field

In [None]:
df_33.to_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62.csv",sep='\t',index=False)

#37 - Generating the amino acid classification group attribute

In [None]:
#increasing the ability to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
#Reading the Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62 database

import pandas as pd
df_33 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62.csv", delimiter='\t')

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 55 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
def categories_column(df):
    """Print, for aminBefore and aminAfter, a dict of value -> occurrence count."""
    colunas = ('aminBefore', 'aminAfter')
    for nome in colunas:
        contagem = df[nome].value_counts()
        # Column name, then its counts dict, then blank separator lines.
        print(nome, contagem.to_dict())
        print('\n')

categories_column(df_33)

aminBefore {'Arg': 132686, 'Ala': 43463, 'Glu': 28448, 'Ser': 26158, 'Pro': 24708, 'Val': 24482, 'Gly': 23935, 'Asp': 19720, 'Thr': 19092, 'Leu': 8179, 'Ile': 5625, 'Met': 5135, 'Lys': 4882, 'His': 4493, 'Asn': 4132, 'Gln': 3494, 'Tyr': 3493, 'Phe': 2780, 'Cys': 2160, 'Trp': 730, 'Ter': 2}


aminAfter {'His': 37843, 'Gln': 37779, 'Cys': 37603, 'Leu': 31463, 'Lys': 27695, 'Thr': 26473, 'Val': 26053, 'Met': 24370, 'Trp': 23257, 'Ser': 21225, 'Asn': 20488, 'Ile': 20184, 'Arg': 12664, 'Phe': 10186, 'Tyr': 6657, 'Glu': 6576, 'Asp': 6038, 'Ala': 4700, 'Gly': 3785, 'Pro': 2455, '.': 303}




In [None]:
def get_group(aminoacido):
  """Map a three-letter amino-acid code to its classification group.

  Returns 'nonpolar', 'polar', 'aromatic', 'positivecharge' or
  'negativecharge' for the twenty listed codes and
  'nogroup_N-terminalDeletion' for ".". Any other value (for example
  "Ter" or "*") is printed and reported as 'nogroup_StopCodon'.
  """
  grupos = (
    ('nonpolar', ('Gly', 'Ala', 'Pro', 'Val', 'Leu', 'Ile', 'Met')),
    ('polar', ('Ser', 'Thr', 'Cys', 'Asn', 'Gln')),
    ('aromatic', ('Phe', 'Tyr', 'Trp')),
    ('positivecharge', ('Lys', 'Arg', 'His')),
    ('negativecharge', ('Glu', 'Asp')),
  )
  for nome_grupo, membros in grupos:
    if aminoacido in membros:
      return nome_grupo

  if aminoacido == ".":
    return 'nogroup_N-terminalDeletion'

  # Fallback for everything that is neither a listed code nor ".".
  print(aminoacido)
  return 'nogroup_StopCodon'

In [None]:
df_33["groupBefore"] = df_33["aminBefore"].apply(get_group)

Ter
Ter


In [None]:
df_33["groupAfter"] = df_33["aminAfter"].apply(get_group)

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 57 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
df_33.head(35)

Unnamed: 0,CHROM,POS,REF,ALT,Interpro_domain,dbNSFP_DEOGEN2_pred,dbNSFP_MetaSVM_pred,dbNSFP_fathmmMKL_coding_pred,dbNSFP_PrimateAI_pred,dbNSFP_PROVEAN_pred,dbNSFP_MCAP_pred,dbNSFP_ClinPred_pred,dbNSFP_BayesDel_addAF_pred,dbNSFP_ExAC_AF,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,dbNSFP_FATHMM_pred,dbNSFP_SIFT4G_pred,dbNSFP_LRT_pred,dbNSFP_fathmmXF_coding_pred,dbNSFP_BayesDel_noAF_pred,dbNSFP_gnomAD_exomes_AF,dbNSFP_Aloft_pred,dbNSFP_MutationTaster_pred,dbNSFP_MetaLR_pred,dbNSFP_LISTS2_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_MutationAssessor_pred,VariantEffect_EFF,Risco_Mut_EFF,Tipo_Mut_EFF,Point_Mutation_EFF,changeProt_EFF,changecDNA_EFF,Gene_EFF,RefSeq_EFF,Exon_EFF,Pos_Point_Mutation_EFF,poschangecDNA_EFF,typechangecDNA_EFF,aminBefore,aminAfter,poschangeProt,typechangeProt,SNP_ID_COMMON,COMMON,PolyPhen2_Dam_pred,Ndamage,NdamageCalc,Deleteria,Deleteria5,Deleteria10,Uniprot_id,Tecido,Blosum62,groupBefore,groupAfter
0,1,2027599,G,A,Neurotransmitter-gated_ion-channel_ligand-bind...,T,T,D,D,N,D,T,T,0.0,D,T,T,T,D,D,T,4e-06,.,D,T,D,D,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,Gac/Aac,p.Asp165Asn,c.493G>A,GABRD,NM_000815.4,5,1,493,>,Asp,Asn,165,subst,rs1477740666,0.0,1,8/20,8,1,1,0,O14764,ACC,1,negativecharge,polar
1,1,2303896,C,T,.,D,D,D,T,D,D,D,D,8e-06,P,D,D,T,D,D,D,8e-06,.,D,D,D,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cCg/cTg,p.Pro423Leu,c.1268C>T,SKI,NM_003036.3,4,2,1268,>,Pro,Leu,423,subst,rs752779978,0.0,1,16/20,16,1,1,1,P12755,ACC,-3,nonpolar,nonpolar
2,1,3816294,G,A,.,T,T,N,T,N,T,T,T,0.0,B,T,T,T,N,N,T,0.0,.,N,T,T,B,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cCg/cTg,p.Pro883Leu,c.2648C>T,CEP104,NM_014704.3,21,2,2648,>,Pro,Leu,883,subst,rs1197412379,0.0,0,0/20,0,0,0,0,O60308,ACC,-3,nonpolar,nonpolar
3,1,9058243,G,A,.,D,D,D,T,D,D,D,T,2.5e-05,D,D,D,D,D,D,D,1.2e-05,.,D,D,D,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,aCg/aTg,p.Thr14Met,c.41C>T,SLC2A5,NM_003039.2,2,2,41,>,Thr,Met,14,subst,rs765084352,0.0,1,16/20,16,1,1,1,P22732,ACC,-1,polar,nonpolar
4,1,9597039,G,A,"Ima1,_N-terminal_domain",T,T,N,T,N,T,T,T,2.5e-05,B,T,.,T,N,N,T,1.2e-05,.,N,T,T,B,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,Gct/Act,p.Ala139Thr,c.415G>A,TMEM201,NM_001130924.2,3,1,415,>,Ala,Thr,139,subst,rs752313181,0.0,0,0/20,0,0,0,0,Q5SNT2,ACC,0,nonpolar,polar
5,1,11501132,G,A,.,T,T,D,T,N,T,T,T,0.0,B,T,T,T,U,N,T,0.0,.,N,T,T,P,.,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cGg/cAg,p.Arg47Gln,c.140G>A,PTCHD2,NM_020780.1,2,2,140,>,Arg,Gln,47,subst,rs1372854383,0.0,1,2/20,2,0,0,0,Q9P2K9,ACC,1,positivecharge,polar
6,1,11668750,G,A,F-box_domain,T,T,D,T,N,T,T,T,0.000321,B,T,T,T,N,N,T,0.000279,.,D,T,T,P,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cGc/cAc,p.Arg31His,c.92G>A,FBXO6,NM_018438.5,2,2,92,>,Arg,His,31,subst,rs146139993,0.0,1,3/20,3,0,0,0,Q9NRD1,ACC,0,positivecharge,positivecharge
7,1,12879686,C,G,.,T,T,N,T,D,T,D,T,6.7e-05,D,D,T,T,N,N,T,3.2e-05,.,N,T,.,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,aGa/aCa,p.Arg432Thr,c.1295G>C,PRAMEF4,NM_001009611.4,4,2,1295,>,Arg,Thr,432,subst,rs543370672,0.0,1,4/20,4,1,0,0,O60810,ACC,-1,positivecharge,polar
8,1,16992040,G,A,"HAD-like_domain|P-type_ATPase,_cytoplasmic_dom...",T,T,N,T,N,D,T,T,8e-06,B,T,T,T,N,N,T,4e-06,.,N,T,T,B,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,Ccc/Tcc,p.Pro699Ser,c.2095C>T,ATP13A2,NM_022089.3,19,1,2095,>,Pro,Ser,699,subst,rs756039984,0.0,0,1/20,1,0,0,0,Q9NQ11,ACC,-1,nonpolar,polar
9,1,18365410,C,T,Immunoglobulin-like_fold,T,T,D,T,N,D,T,T,0.000173,D,D,T,T,N,D,T,0.000119,.,D,T,T,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,aCg/aTg,p.Thr243Met,c.728C>T,IGSF21,NM_032880.4,6,2,728,>,Thr,Met,243,subst,rs144142024,1.0,1,6/20,6,0,0,0,Q96ID5,ACC,-1,polar,nonpolar


##37.1 Generating an intermediate file with the df_33 database with the *groupBefore* and *groupAfter* fields

In [None]:
df_33.to_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group.csv",sep='\t',index=False)

#38 - Generating the attribute that represents the change in the amino acid group

In [None]:
#increasing the ability to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
#Reading the Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group.csv database

import pandas as pd
df_33 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group.csv", delimiter='\t')

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 57 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
def get_concat(linha_coluna):
  """Rewrite "before,after" as "beforeTOafter".

  Inputs that contain "nogroup" are echoed to stdout first; they are
  still converted and returned like any other pair.
  """
  sem_grupo = "nogroup" in linha_coluna
  if sem_grupo:
    print(linha_coluna)
  partes = linha_coluna.split(",")
  return "TO".join((partes[0], partes[1]))

In [None]:
#creating the groupChange field
df_33["groupChange"] = (df_33["groupBefore"] + "," + df_33["groupAfter"]).apply(get_concat)

nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nogroup_StopCodon,polar
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonpolar,nogroup_N-terminalDeletion
nonp

In [None]:
df_33["groupChange"].value_counts()

positivechargeTOpolar                   72808
nonpolarTOnonpolar                      70433
nonpolarTOpolar                         43367
positivechargeTOpositivecharge          38463
polarTOnonpolar                         33287
positivechargeTOaromatic                25207
negativechargeTOpositivecharge          24989
negativechargeTOpolar                   16826
nonpolarTOpositivecharge                 8985
polarTOaromatic                          8763
nonpolarTOnegativecharge                 8065
polarTOpolar                             7112
polarTOpositivecharge                    4741
positivechargeTOnonpolar                 4564
nonpolarTOaromatic                       4374
aromaticTOpolar                          3453
negativechargeTOnonpolar                 2411
negativechargeTOnegativecharge           2358
aromaticTOnonpolar                       2315
negativechargeTOaromatic                 1584
polarTOnegativecharge                    1133
aromaticTOpositivecharge          

In [None]:
# Inspect the rows that have no classification group. get_group labels them
# 'nogroup_...' (it never produces "-"), and the label may sit on either side
# of the substitution, so the former filter (== "-" on both columns) could
# only ever return an empty frame.
sem_grupo = (
    df_33["groupBefore"].str.startswith("nogroup")
    | df_33["groupAfter"].str.startswith("nogroup")
)
df_33[sem_grupo]

Unnamed: 0,CHROM,POS,REF,ALT,Interpro_domain,dbNSFP_DEOGEN2_pred,dbNSFP_MetaSVM_pred,dbNSFP_fathmmMKL_coding_pred,dbNSFP_PrimateAI_pred,dbNSFP_PROVEAN_pred,dbNSFP_MCAP_pred,dbNSFP_ClinPred_pred,dbNSFP_BayesDel_addAF_pred,dbNSFP_ExAC_AF,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,dbNSFP_FATHMM_pred,dbNSFP_SIFT4G_pred,dbNSFP_LRT_pred,dbNSFP_fathmmXF_coding_pred,dbNSFP_BayesDel_noAF_pred,dbNSFP_gnomAD_exomes_AF,dbNSFP_Aloft_pred,dbNSFP_MutationTaster_pred,dbNSFP_MetaLR_pred,dbNSFP_LISTS2_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_MutationAssessor_pred,VariantEffect_EFF,Risco_Mut_EFF,Tipo_Mut_EFF,Point_Mutation_EFF,changeProt_EFF,changecDNA_EFF,Gene_EFF,RefSeq_EFF,Exon_EFF,Pos_Point_Mutation_EFF,poschangecDNA_EFF,typechangecDNA_EFF,aminBefore,aminAfter,poschangeProt,typechangeProt,SNP_ID_COMMON,COMMON,PolyPhen2_Dam_pred,Ndamage,NdamageCalc,Deleteria,Deleteria5,Deleteria10,Uniprot_id,Tecido,Blosum62,groupBefore,groupAfter,groupChange


In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 58 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

##38.1 Generating an intermediate file with the df_33 database with the *groupChange* field

In [None]:
df_33.to_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change.csv",sep='\t',index=False)

#39 - Generating the Essential Amino Acid attributes

In [None]:
#increasing the ability to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
#Reading the Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change.csv database

import pandas as pd
df_33 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change.csv", delimiter='\t')

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 58 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
def get_essencial(aminoacido):
  """Flag a three-letter amino-acid code as essential or not.

  Returns '1' when the code is in the essential list, '0' when it is in
  the non-essential list, and '-' (after printing the value) for
  anything else.
  """
  essenciais = ('Phe','His','Ile', 'Lys','Leu','Met','Thr','Trp','Val')
  nao_essenciais = ('Asp','Glu','Ala','Arg','Asn','Cys','Gly','Gln','Pro','Ser','Tyr')

  if aminoacido in essenciais:
    return '1'
  if aminoacido in nao_essenciais:
    return '0'

  # Remaining values such as "*", "." and "Ter".
  print(aminoacido)
  return '-'

In [None]:
df_33["aminBeforeEssential"] = df_33["aminBefore"].apply(get_essencial)

Ter
Ter


In [None]:
df_33["aminAfterEssential"] = df_33["aminAfter"].apply(get_essencial)

.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.


In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 60 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

##39.1 Generating an intermediate file with the df_33 database with the *aminBeforeEssential* and *aminAfterEssential* fields

In [None]:
df_33.to_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential.csv",sep='\t',index=False)

#40 - Generating the attribute that represents the change in the essentiality of the amino acid

In [None]:
#increasing the ability to view columns and rows
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)
pd.set_option('display.width', 7000)

In [None]:
#Reading the  Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential.csv database

import pandas as pd
# Both essentiality flags are joined as text further down (before + "," + after),
# so pin the two columns to object (string) dtype rather than relying on the "-"
# placeholders to stop pandas from inferring integers for one of them.
df_33 = pd.read_csv(
    "drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential.csv",
    delimiter='\t',
    low_memory=False,
    dtype={'aminBeforeEssential': object, 'aminAfterEssential': object},
)

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 60 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
df_33[['aminBeforeEssential','aminAfterEssential']]

Unnamed: 0,aminBeforeEssential,aminAfterEssential
0,0,0
1,0,1
2,0,1
3,1,1
4,0,1
...,...,...
387792,0,0
387793,0,0
387794,0,0
387795,0,1


In [None]:
def get_concat(linha_coluna):
  """Rewrite "before,after" as "beforeTOafter", echoing inputs that contain "-"."""
  tem_marcador = "-" in linha_coluna
  if tem_marcador:
    # Echo pairs carrying the "-" placeholder; they are still converted below.
    print(linha_coluna)
  campos = linha_coluna.split(",")
  return "{}TO{}".format(campos[0], campos[1])

In [None]:
# Create the essencialChange field: the two essentiality flags are joined as
# "before,after" and get_concat rewrites each pair as "beforeTOafter".
df_33["essencialChange"] = (df_33["aminBeforeEssential"] + "," + df_33["aminAfterEssential"]).apply(get_concat)

1,-
1,-
-,0
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
-,0
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-
1,-


In [None]:
df_33["essencialChange"]

0         0TO0
1         0TO1
2         0TO1
3         1TO1
4         0TO1
          ... 
387792    0TO0
387793    0TO0
387794    0TO0
387795    0TO1
387796    1TO1
Name: essencialChange, Length: 387797, dtype: object

In [None]:
df_33["essencialChange"].value_counts()

0TO1    168980
0TO0    143417
1TO1     58544
1TO0     16551
1TO-       303
-TO0         2
Name: essencialChange, dtype: int64

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 61 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

##40.1 Generating an intermediate file with the df_33 database with the *essencialChange* field

In [None]:
df_33.to_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential_change.csv",sep='\t',index=False)

#41 - Generating the *substitution* attribute

In [None]:
##increasing the ability to view columns and rows.
import pandas as pd

pd.set_option('display.max_columns', 7000)
pd.set_option('display.max_rows',90000)

In [None]:
#Reading the Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential_change.csv database

import pandas as pd
df_33 = pd.read_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential_change.csv", low_memory=False, dtype={'aminBeforeEssential': object}, delimiter='\t')

In [None]:
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 61 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
def categories_column(df):
    """Print how often each distinct value occurs in the REF and ALT columns.

    For each of the two columns, the column name is printed followed by a
    dictionary mapping every distinct value to its number of occurrences
    (as produced by ``value_counts``), then two empty lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns ``'REF'`` and ``'ALT'``.

    Returns
    -------
    None
        The function only prints.
    """
    for column_name in ('REF', 'ALT'):
        frequencies = df[column_name].value_counts()
        # Name + counts on one line, followed by the same blank lines the
        # separate print('\n') call would produce.
        print(column_name, frequencies.to_dict(), end='\n\n\n')

# Print the value frequencies of the REF and ALT columns of df_33.
categories_column(df_33)

REF {'G': 176766, 'C': 176759, 'T': 17345, 'A': 16927}


ALT {'A': 172645, 'T': 172109, 'C': 21549, 'G': 21494}




In [None]:
def get_valor(bef_aft):
    """Classify a "REF,ALT" base pair as a transition (0) or a transversion (1).

    Parameters
    ----------
    bef_aft : str
        Reference and alternate base joined by a comma, e.g. ``"G,A"``.
        Only the first two comma-separated fields are inspected and the
        comparison is case-sensitive (upper-case ``A``, ``C``, ``G``, ``T``).

    Returns
    -------
    int or None
        * ``0`` for a transition: A<->G or C<->T.
        * ``1`` for a transversion: any other pair of two distinct bases
          among A, C, G, T.
        * ``None`` when the input does not describe such a pair (identical
          bases, unknown symbols, or fewer than two comma-separated fields).

    Raises
    ------
    AttributeError
        If ``bef_aft`` is not a string (e.g. a missing value), because it
        has no ``split`` method.
    """
    purines = {'A', 'G'}
    pyrimidines = {'C', 'T'}

    fields = bef_aft.split(",")
    if len(fields) < 2:
        # Without this guard, indexing the second field raises IndexError
        # for inputs such as "A", while inputs such as "X" yield None.
        # Treat every malformed input the same way.
        return None

    ref, alt = fields[0], fields[1]
    pair = {ref, alt}
    if ref == alt or not pair <= (purines | pyrimidines):
        # Not a substitution between two distinct known bases.
        return None

    # A transition keeps the base class (purine<->purine or
    # pyrimidine<->pyrimidine); everything else is a transversion.
    if pair <= purines or pair <= pyrimidines:
        return 0
    return 1


In [None]:
# Create the 'substitution' attribute: REF and ALT are joined as "REF,ALT" and
# mapped row by row through get_valor, which returns 0 for A<->G / C<->T
# (transitions) and 1 for the remaining pairs of distinct bases (transversions).
df_33["substitution"] = (df_33["REF"] + "," + df_33["ALT"]).apply(get_valor)

In [None]:
# Count how many rows fall into each value of the 'substitution' column.
df_33["substitution"].value_counts()

0    343421
1     44376
Name: substitution, dtype: int64

In [None]:
# Summarize df_33 again to check its columns, non-null counts and dtypes.
df_33.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387797 entries, 0 to 387796
Data columns (total 62 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   CHROM                         387797 non-null  int64  
 1   POS                           387797 non-null  int64  
 2   REF                           387797 non-null  object 
 3   ALT                           387797 non-null  object 
 4   Interpro_domain               387797 non-null  object 
 5   dbNSFP_DEOGEN2_pred           387797 non-null  object 
 6   dbNSFP_MetaSVM_pred           387797 non-null  object 
 7   dbNSFP_fathmmMKL_coding_pred  387797 non-null  object 
 8   dbNSFP_PrimateAI_pred         387797 non-null  object 
 9   dbNSFP_PROVEAN_pred           387797 non-null  object 
 10  dbNSFP_MCAP_pred              387797 non-null  object 
 11  dbNSFP_ClinPred_pred          387797 non-null  object 
 12  dbNSFP_BayesDel_addAF_pred    387797 non-nul

In [None]:
# Preview the first 15 rows of df_33.
df_33.head(15)

Unnamed: 0,CHROM,POS,REF,ALT,Interpro_domain,dbNSFP_DEOGEN2_pred,dbNSFP_MetaSVM_pred,dbNSFP_fathmmMKL_coding_pred,dbNSFP_PrimateAI_pred,dbNSFP_PROVEAN_pred,dbNSFP_MCAP_pred,dbNSFP_ClinPred_pred,dbNSFP_BayesDel_addAF_pred,dbNSFP_ExAC_AF,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,dbNSFP_FATHMM_pred,dbNSFP_SIFT4G_pred,dbNSFP_LRT_pred,dbNSFP_fathmmXF_coding_pred,dbNSFP_BayesDel_noAF_pred,dbNSFP_gnomAD_exomes_AF,dbNSFP_Aloft_pred,dbNSFP_MutationTaster_pred,dbNSFP_MetaLR_pred,dbNSFP_LISTS2_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_MutationAssessor_pred,VariantEffect_EFF,Risco_Mut_EFF,Tipo_Mut_EFF,Point_Mutation_EFF,changeProt_EFF,changecDNA_EFF,Gene_EFF,RefSeq_EFF,Exon_EFF,Pos_Point_Mutation_EFF,poschangecDNA_EFF,typechangecDNA_EFF,aminBefore,aminAfter,poschangeProt,typechangeProt,SNP_ID_COMMON,COMMON,PolyPhen2_Dam_pred,Ndamage,NdamageCalc,Deleteria,Deleteria5,Deleteria10,Uniprot_id,Tecido,Blosum62,groupBefore,groupAfter,groupChange,aminBeforeEssential,aminAfterEssential,essencialChange,substitution
0,1,2027599,G,A,Neurotransmitter-gated_ion-channel_ligand-bind...,T,T,D,D,N,D,T,T,0.0,D,T,T,T,D,D,T,4e-06,.,D,T,D,D,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,Gac/Aac,p.Asp165Asn,c.493G>A,GABRD,NM_000815.4,5,1,493,>,Asp,Asn,165,subst,rs1477740666,0.0,1,8/20,8,1,1,0,O14764,ACC,1,negativecharge,polar,negativechargeTOpolar,0,0,0TO0,0
1,1,2303896,C,T,.,D,D,D,T,D,D,D,D,8e-06,P,D,D,T,D,D,D,8e-06,.,D,D,D,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cCg/cTg,p.Pro423Leu,c.1268C>T,SKI,NM_003036.3,4,2,1268,>,Pro,Leu,423,subst,rs752779978,0.0,1,16/20,16,1,1,1,P12755,ACC,-3,nonpolar,nonpolar,nonpolarTOnonpolar,0,1,0TO1,0
2,1,3816294,G,A,.,T,T,N,T,N,T,T,T,0.0,B,T,T,T,N,N,T,0.0,.,N,T,T,B,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cCg/cTg,p.Pro883Leu,c.2648C>T,CEP104,NM_014704.3,21,2,2648,>,Pro,Leu,883,subst,rs1197412379,0.0,0,0/20,0,0,0,0,O60308,ACC,-3,nonpolar,nonpolar,nonpolarTOnonpolar,0,1,0TO1,0
3,1,9058243,G,A,.,D,D,D,T,D,D,D,T,2.5e-05,D,D,D,D,D,D,D,1.2e-05,.,D,D,D,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,aCg/aTg,p.Thr14Met,c.41C>T,SLC2A5,NM_003039.2,2,2,41,>,Thr,Met,14,subst,rs765084352,0.0,1,16/20,16,1,1,1,P22732,ACC,-1,polar,nonpolar,polarTOnonpolar,1,1,1TO1,0
4,1,9597039,G,A,"Ima1,_N-terminal_domain",T,T,N,T,N,T,T,T,2.5e-05,B,T,.,T,N,N,T,1.2e-05,.,N,T,T,B,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,Gct/Act,p.Ala139Thr,c.415G>A,TMEM201,NM_001130924.2,3,1,415,>,Ala,Thr,139,subst,rs752313181,0.0,0,0/20,0,0,0,0,Q5SNT2,ACC,0,nonpolar,polar,nonpolarTOpolar,0,1,0TO1,0
5,1,11501132,G,A,.,T,T,D,T,N,T,T,T,0.0,B,T,T,T,U,N,T,0.0,.,N,T,T,P,.,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cGg/cAg,p.Arg47Gln,c.140G>A,PTCHD2,NM_020780.1,2,2,140,>,Arg,Gln,47,subst,rs1372854383,0.0,1,2/20,2,0,0,0,Q9P2K9,ACC,1,positivecharge,polar,positivechargeTOpolar,0,0,0TO0,0
6,1,11668750,G,A,F-box_domain,T,T,D,T,N,T,T,T,0.000321,B,T,T,T,N,N,T,0.000279,.,D,T,T,P,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,cGc/cAc,p.Arg31His,c.92G>A,FBXO6,NM_018438.5,2,2,92,>,Arg,His,31,subst,rs146139993,0.0,1,3/20,3,0,0,0,Q9NRD1,ACC,0,positivecharge,positivecharge,positivechargeTOpositivecharge,0,1,0TO1,0
7,1,12879686,C,G,.,T,T,N,T,D,T,D,T,6.7e-05,D,D,T,T,N,N,T,3.2e-05,.,N,T,.,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,aGa/aCa,p.Arg432Thr,c.1295G>C,PRAMEF4,NM_001009611.4,4,2,1295,>,Arg,Thr,432,subst,rs543370672,0.0,1,4/20,4,1,0,0,O60810,ACC,-1,positivecharge,polar,positivechargeTOpolar,0,1,0TO1,1
8,1,16992040,G,A,"HAD-like_domain|P-type_ATPase,_cytoplasmic_dom...",T,T,N,T,N,D,T,T,8e-06,B,T,T,T,N,N,T,4e-06,.,N,T,T,B,N,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,Ccc/Tcc,p.Pro699Ser,c.2095C>T,ATP13A2,NM_022089.3,19,1,2095,>,Pro,Ser,699,subst,rs756039984,0.0,0,1/20,1,0,0,0,Q9NQ11,ACC,-1,nonpolar,polar,nonpolarTOpolar,0,0,0TO0,0
9,1,18365410,C,T,Immunoglobulin-like_fold,T,T,D,T,N,D,T,T,0.000173,D,D,T,T,N,D,T,0.000119,.,D,T,T,D,M,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,aCg/aTg,p.Thr243Met,c.728C>T,IGSF21,NM_032880.4,6,2,728,>,Thr,Met,243,subst,rs144142024,1.0,1,6/20,6,0,0,0,Q96ID5,ACC,-1,polar,nonpolar,polarTOnonpolar,1,1,1TO1,0


##41.1 Saving an intermediate file of the df_33 database that includes the *substitution* field

In [None]:
# Checkpoint: write df_33 to a tab-separated file (sep='\t') without the index column (index=False).
df_33.to_csv("drive/My Drive/ProcessaNovaBase/TratamentosFinais_Base_33_Artigo_Base/AlphaPhold/Base_33Tecidos_Ate_Uniprot__Missense_clean_Blosum62_Group_Change_Essential_change_substitution.csv",sep='\t',index=False)