# Criando o pipeline para tratamento e obtenção dos dados
- Avaliar os dados recebidos
- Fazer o tratamento e a modelagem dos dados
- Construir o datapipeline de raw -> trusted

In [1]:
import os
from pathlib import Path
import zipfile
import pandas as pd

In [2]:
# definindo path para trabalho

rootPath = os.path.abspath('..')
dataPath = os.path.join(rootPath, 'data')

raw_path =     Path(f"{dataPath}/raw")
trusted_path = Path(f"{dataPath}/trusted")

In [3]:
# retira os dados do zip contido
def unzip_file(folder, file_name):
    zip_file = os.path.join(folder, file_name)
    z = zipfile.ZipFile(zip_file)
    z.extractall(folder)
    folder_files = os.listdir(folder)
    return print(f"Files on folder: {folder_files}")

unzip_file(raw_path, 'dados.zip')

Files on folder: ['dados.zip', 'pdb_data_no_dups.csv', '.gitkeep', 'pdb_data_seq.csv']


In [4]:
# faz a leitura das duas bases csv
df_structure = pd.read_csv(os.path.join(raw_path, 'pdb_data_no_dups.csv'))
df_sequence  = pd.read_csv(os.path.join(raw_path, 'pdb_data_seq.csv'))

In [5]:
# visualizando os dados brutos
def dataframe_resume(df: pd.DataFrame):
    display(df.head())
    display(df.info())
    print("Verificando nulos")
    display(df.isna().sum())

print("Dados gerais da estrutura molecular")
dataframe_resume(df_structure)

print("\n")

print("Dados com a sequencia genetica da estrutura molecular")
dataframe_resume(df_sequence)

Dados gerais da estrutura molecular


Unnamed: 0,structureId,classification,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear
0,100D,DNA-RNA HYBRID,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0
1,101D,DNA,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0
2,101M,OXYGEN TRANSPORT,X-RAY DIFFRACTION,Protein,154,2.07,18112.8,,,3.09,60.2,"3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...",9.0,1999.0
3,102D,DNA,X-RAY DIFFRACTION,DNA,24,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,"pH 7.00, VAPOR DIFFUSION, SITTING DROP, temper...",7.0,1995.0
4,102L,HYDROLASE(O-GLYCOSYL),X-RAY DIFFRACTION,Protein,165,1.74,18926.61,,,2.75,55.28,,,1993.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141401 entries, 0 to 141400
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   structureId               141401 non-null  object 
 1   classification            141399 non-null  object 
 2   experimentalTechnique     141401 non-null  object 
 3   macromoleculeType         137636 non-null  object 
 4   residueCount              141401 non-null  int64  
 5   resolution                128589 non-null  float64
 6   structureMolecularWeight  141401 non-null  float64
 7   crystallizationMethod     96242 non-null   object 
 8   crystallizationTempK      97039 non-null   float64
 9   densityMatthews           124724 non-null  float64
 10  densityPercentSol         124749 non-null  float64
 11  pdbxDetails               118534 non-null  object 
 12  phValue                   105110 non-null  float64
 13  publicationYear           117602 non-null  f

None

Verificando nulos


structureId                     0
classification                  2
experimentalTechnique           0
macromoleculeType            3765
residueCount                    0
resolution                  12812
structureMolecularWeight        0
crystallizationMethod       45159
crystallizationTempK        44362
densityMatthews             16677
densityPercentSol           16652
pdbxDetails                 22867
phValue                     36291
publicationYear             23799
dtype: int64



Dados com a sequencia genetica da estrutura molecular


Unnamed: 0,structureId,chainId,sequence,residueCount,macromoleculeType
0,100D,A,CCGGCGCCGG,20,DNA/RNA Hybrid
1,100D,B,CCGGCGCCGG,20,DNA/RNA Hybrid
2,101D,A,CGCGAATTCGCG,24,DNA
3,101D,B,CGCGAATTCGCG,24,DNA
4,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154,Protein


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467304 entries, 0 to 467303
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   structureId        467304 non-null  object
 1   chainId            467294 non-null  object
 2   sequence           467276 non-null  object
 3   residueCount       467304 non-null  int64 
 4   macromoleculeType  432487 non-null  object
dtypes: int64(1), object(4)
memory usage: 17.8+ MB


None

Verificando nulos


structureId              0
chainId                 10
sequence                28
residueCount             0
macromoleculeType    34817
dtype: int64

In [6]:
# verificando valores duplicados de ID
def duplicated_id(df: pd.DataFrame, idColum: str):
    df = df.pivot_table(index= [f'{idColum}'], aggfunc='size')
    df = pd.DataFrame(df).rename(columns={0: "count"})
    df = df.query("count > 1")
    df = df.sort_values('count', ascending=False)
    return df

display(duplicated_id(df_structure, 'structureId'))
display(duplicated_id(df_sequence, 'structureId'))

Unnamed: 0_level_0,count
structureId,Unnamed: 1_level_1
2FYM,4
1UJQ,4
3NO0,4
1OS5,3
4KJ3,3
...,...
2QBZ,2
2Q8U,2
2Q30,2
2PV0,2


Unnamed: 0_level_0,count
structureId,Unnamed: 1_level_1
3J3Q,1356
3J3Y,1176
5Y6P,862
4V99,480
5MQ7,360
...,...
4CN0,2
4CMX,2
4CMW,2
4CO0,2


In [7]:
# verificando a diferença das duplicatas para ambas as tabelas

print("Dados gerais das estruturas")
display(df_structure.query("structureId == '2FYM'"))


print("Dados que contem a sequencia")
display(df_sequence.query("structureId == '5Y6P'"))

Dados gerais das estruturas


Unnamed: 0,structureId,classification,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear
38806,2FYM,LYASE,X-RAY DIFFRACTION,Protein,1760,1.6,186502.5,"VAPOR DIFFUSION, HANGING DROP",298.0,,,"8% PEG 4K, 0.2 M imidazole maleate, pH 6.0, VA...",6.0,2006.0
38807,2FYM,LYASE,X-RAY DIFFRACTION,Protein,1760,1.6,186502.5,"VAPOR DIFFUSION, HANGING DROP",298.0,,,"27% PEG 600, 0.1M HEPES, pH 7.5, VAPOR DIFFUSI...",7.5,2006.0
38808,2FYM,LYASE,X-RAY DIFFRACTION,Protein,1760,1.6,186502.5,"VAPOR DIFFUSION, HANGING DROP",298.0,2.47,50.11,"2.4M Sodium malonate, pH 7.0, VAPOR DIFFUSION,...",7.0,2006.0
38809,2FYM,LYASE,X-RAY DIFFRACTION,Protein,1760,1.6,186502.5,"VAPOR DIFFUSION, HANGING DROP",298.0,,,"40% MPEG 550, 0.1M HEPES, pH 8.2, VAPOR DIFFUS...",8.2,2006.0


Dados que contem a sequencia


Unnamed: 0,structureId,chainId,sequence,residueCount,macromoleculeType
460694,5Y6P,14,MSIPVLNYSLSTQNQRVYSFEYLPNEEQPKCYTTDNLPAAIEMDQI...,157478,Protein
460695,5Y6P,24,MAFVPIVNLRPALSAPSTSSFSGVRLADAPTSTPAPVVARLGTVLG...,157478,Protein
460696,5Y6P,34,MDSPAFAVNGMFSAVKVGNSSFTENKVTAVSKTAPTASVRMVVDPF...,157478,Protein
460697,5Y6P,44,MYAFAPNTPFTASKAVVGKTSFTSPLPAQSESRPTAAPTMVLRTVL...,157478,Protein
460698,5Y6P,A1,MQAFIPSSSLSALTGAPVQKSSALTSLRTTRSATPCTTRMAAYPYT...,157478,Protein
...,...,...,...,...,...
461551,5Y6P,z5,MLDAFSRVVVNSDSKAAYVGGSDLQSLKTFISDGNKRLDAVNCIVS...,157478,Protein
461552,5Y6P,z6,MLDAFSRVVVNSDSKAAYVGGSDLQSLKTFISDGNKRLDAVNCIVS...,157478,Protein
461553,5Y6P,z7,MLDAFSRVVVNSDSKAAYVGGSDLQSLKTFISDGNKRLDAVNCIVS...,157478,Protein
461554,5Y6P,z8,MKSVITTTISAADAAGRFPSSSDLESIQGNIQRASARLEAAEKLSG...,157478,Protein


O relacionamento entre Id e tabela é diferente para cada uma das colunas apresentadas.

Por isso, os arquivos vão passar por um data quality (padronização e otimização das colunas) e posteriormente será feita a modelagem visando uma feature store relacional, contendo a tabela fato e as dimensões.

In [8]:
# funcao para data quality:
# retira valores nulos e converte
# ira converter a coluna de acordo com o mapeado em um json
# mantem a fonte de dados final com a mesma estrutura, porem padronizado

def quality(df: pd.DataFrame, key_dict: dict):
    df_quality = pd.DataFrame()
    for column in key_dict:
        column_type = key_dict[column]
        column_name = f"{column_type.upper()}_{column.upper()}"
        df_quality[column_name] = df[column].dropna().astype(column_type)
    return df_quality

# o arquivo convertido e salvo como parquet na pasta trusted
def save_parquet(df: pd.DataFrame, folder, file_name):
    path_save = os.path.join(folder, file_name)
    df.to_parquet(path_save, index=False)
    return print(f"Trusted persisted data: {path_save}")

In [9]:
# processando e salvando os dados gerais
dtype_map = {
    'structureId' : 'str',
    'classification' : 'str',
    'experimentalTechnique' : 'str',
    'macromoleculeType' : 'str',
    'residueCount' : 'int',
    'resolution': 'float',
    'structureMolecularWeight' : 'float',
    'crystallizationMethod' : 'str',
    'crystallizationTempK' : 'float',
    'densityMatthews' : 'float',
    'densityPercentSol' : 'float',
    'pdbxDetails' : 'str',
    'phValue' : 'float',
    'publicationYear' : 'int'
}

save_parquet(
    df= quality(df_structure, dtype_map),
    folder= trusted_path,
    file_name= "data_structure.parquet"
    )

Trusted persisted data: /home/gustavo-cunha/Documentos/GitHub/classificacao_proteinas/data/trusted/data_structure.parquet


In [10]:
# processando e salvando os dados da sequencia
dtype_map = {
    'structureId' : 'str',
    'chainId' : 'str',
    'sequence' : 'str',
    'residueCount' : 'int',
    'macromoleculeType' : 'str'
}

save_parquet(
    df= quality(df_sequence, dtype_map),
    folder= trusted_path,
    file_name= "data_sequence.parquet"
    )

Trusted persisted data: /home/gustavo-cunha/Documentos/GitHub/classificacao_proteinas/data/trusted/data_sequence.parquet


**Verificando os dados persistidos**:

In [12]:
display(
    dataframe_resume(
        pd.read_parquet(os.path.join(trusted_path, 'data_sequence.parquet'))
    )
)

Unnamed: 0,STR_STRUCTUREID,STR_CHAINID,STR_SEQUENCE,INT_RESIDUECOUNT,STR_MACROMOLECULETYPE
0,100D,A,CCGGCGCCGG,20,DNA/RNA Hybrid
1,100D,B,CCGGCGCCGG,20,DNA/RNA Hybrid
2,101D,A,CGCGAATTCGCG,24,DNA
3,101D,B,CGCGAATTCGCG,24,DNA
4,101M,A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154,Protein


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467304 entries, 0 to 467303
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   STR_STRUCTUREID        467304 non-null  object
 1   STR_CHAINID            467294 non-null  object
 2   STR_SEQUENCE           467276 non-null  object
 3   INT_RESIDUECOUNT       467304 non-null  int64 
 4   STR_MACROMOLECULETYPE  432487 non-null  object
dtypes: int64(1), object(4)
memory usage: 17.8+ MB


None

Verificando nulos


STR_STRUCTUREID              0
STR_CHAINID                 10
STR_SEQUENCE                28
INT_RESIDUECOUNT             0
STR_MACROMOLECULETYPE    34817
dtype: int64

None

In [13]:
display(
    dataframe_resume(
        pd.read_parquet(os.path.join(trusted_path, 'data_structure.parquet'))
    )
)

Unnamed: 0,STR_STRUCTUREID,STR_CLASSIFICATION,STR_EXPERIMENTALTECHNIQUE,STR_MACROMOLECULETYPE,INT_RESIDUECOUNT,FLOAT_RESOLUTION,FLOAT_STRUCTUREMOLECULARWEIGHT,STR_CRYSTALLIZATIONMETHOD,FLOAT_CRYSTALLIZATIONTEMPK,FLOAT_DENSITYMATTHEWS,FLOAT_DENSITYPERCENTSOL,STR_PDBXDETAILS,FLOAT_PHVALUE,INT_PUBLICATIONYEAR
0,100D,DNA-RNA HYBRID,X-RAY DIFFRACTION,DNA/RNA Hybrid,20,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0
1,101D,DNA,X-RAY DIFFRACTION,DNA,24,2.25,7939.35,,,2.0,38.45,,,1995.0
2,101M,OXYGEN TRANSPORT,X-RAY DIFFRACTION,Protein,154,2.07,18112.8,,,3.09,60.2,"3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...",9.0,1999.0
3,102D,DNA,X-RAY DIFFRACTION,DNA,24,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,"pH 7.00, VAPOR DIFFUSION, SITTING DROP, temper...",7.0,1995.0
4,102L,HYDROLASE(O-GLYCOSYL),X-RAY DIFFRACTION,Protein,165,1.74,18926.61,,,2.75,55.28,,,1993.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141401 entries, 0 to 141400
Data columns (total 14 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   STR_STRUCTUREID                 141401 non-null  object 
 1   STR_CLASSIFICATION              141399 non-null  object 
 2   STR_EXPERIMENTALTECHNIQUE       141401 non-null  object 
 3   STR_MACROMOLECULETYPE           137636 non-null  object 
 4   INT_RESIDUECOUNT                141401 non-null  int64  
 5   FLOAT_RESOLUTION                128589 non-null  float64
 6   FLOAT_STRUCTUREMOLECULARWEIGHT  141401 non-null  float64
 7   STR_CRYSTALLIZATIONMETHOD       96242 non-null   object 
 8   FLOAT_CRYSTALLIZATIONTEMPK      97039 non-null   float64
 9   FLOAT_DENSITYMATTHEWS           124724 non-null  float64
 10  FLOAT_DENSITYPERCENTSOL         124749 non-null  float64
 11  STR_PDBXDETAILS                 118534 non-null  object 
 12  FLOAT_PHVALUE   

None

Verificando nulos


STR_STRUCTUREID                       0
STR_CLASSIFICATION                    2
STR_EXPERIMENTALTECHNIQUE             0
STR_MACROMOLECULETYPE              3765
INT_RESIDUECOUNT                      0
FLOAT_RESOLUTION                  12812
FLOAT_STRUCTUREMOLECULARWEIGHT        0
STR_CRYSTALLIZATIONMETHOD         45159
FLOAT_CRYSTALLIZATIONTEMPK        44362
FLOAT_DENSITYMATTHEWS             16677
FLOAT_DENSITYPERCENTSOL           16652
STR_PDBXDETAILS                   22867
FLOAT_PHVALUE                     36291
INT_PUBLICATIONYEAR               23799
dtype: int64

None