**Goal**: Extract meaningful features from `alphafoldDB.txt` and save them to `alphafold_dpcfam_reps.csv`.


In [1]:
# A. Imports
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# C. Paths to data
# Root of the DPCfam standard dataset (kept with its trailing slash).
root_path = "/u/mdmc/enyanduk/internship_areasciencepark/Data/dpcfam/dpcfam_standard/"
# Input file. os.path.join avoids the doubled "//" that plain concatenation of
# a "/"-terminated root with a "/"-prefixed relative path produced.
path_to_alphafoldDB = os.path.join(
    root_path,
    "zenodo_unzipped_folders", "metaclusters_xml", "mc_info_from_xml", "alphafoldDB.txt",
)
# Output folder (the trailing slash is kept: the file name is appended to it later)
target_path = "/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/"

In [3]:
# Peek at the first line of the raw file; repr() makes the separator
# characters and the trailing newline visible.
with open(path_to_alphafoldDB, mode="r", encoding="utf-8") as handle:
    first_line = handle.readline()
print(repr(first_line))

'#AF_protein MC_ID MC_hmm_length e-value AF_seq_start AF_seq_end MC_hmm_start MC_hmm_end MC_hmm_coverage average_plddt\n'


`Observation`: It's a whitespace-separated file.

In [4]:
# D. Load the table into a DataFrame (columns are separated by runs of whitespace)
df = pd.read_csv(
    path_to_alphafoldDB,
    sep=r"\s+",
)
df.head()

Unnamed: 0,#AF_protein,MC_ID,MC_hmm_length,e-value,AF_seq_start,AF_seq_end,MC_hmm_start,MC_hmm_end,MC_hmm_coverage,average_plddt
0,AF-P0A149-F1,MC1,119,2.8000000000000003e-23,4,181,1,112,0.941176,96.2441
1,AF-Q04671-F1,MC4,63,1.3e-29,770,830,2,62,0.968254,93.797
2,AF-A0A1C1CP96-F1,MC19,71,1.3e-40,66,135,2,71,0.985915,83.6246
3,AF-P97089-F1,MC21,93,1.5e-35,1,93,1,93,1.0,93.5686
4,AF-P9WJB2-F1,MC24,90,4.5e-26,161,250,2,89,0.977778,89.7296


In [5]:
# Inspect column dtypes and non-null counts of the freshly loaded table
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 38668 entries, 0 to 38667
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   #AF_protein      38668 non-null  str    
 1   MC_ID            38668 non-null  str    
 2   MC_hmm_length    38668 non-null  int64  
 3   e-value          38668 non-null  float64
 4   AF_seq_start     38668 non-null  int64  
 5   AF_seq_end       38668 non-null  int64  
 6   MC_hmm_start     38668 non-null  int64  
 7   MC_hmm_end       38668 non-null  int64  
 8   MC_hmm_coverage  38668 non-null  float64
 9   average_plddt    38668 non-null  float64
dtypes: float64(3), int64(5), str(2)
memory usage: 3.0 MB


In [6]:
# Give the columns short snake_case names.
# Columns that are not listed in the mapping keep their original name.
column_renames = {
    "#AF_protein": "alphafold_prot",
    "MC_ID": "mcid",
    "AF_seq_start": "af_seq_start",
    "AF_seq_end": "af_seq_end",
    "MC_hmm_coverage": "hmm_coverage",
    "e-value": "e_value",
    "average_plddt": "avg_plddt",
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,alphafold_prot,mcid,MC_hmm_length,e_value,af_seq_start,af_seq_end,MC_hmm_start,MC_hmm_end,hmm_coverage,avg_plddt
0,AF-P0A149-F1,MC1,119,2.8000000000000003e-23,4,181,1,112,0.941176,96.2441
1,AF-Q04671-F1,MC4,63,1.3e-29,770,830,2,62,0.968254,93.797
2,AF-A0A1C1CP96-F1,MC19,71,1.3e-40,66,135,2,71,0.985915,83.6246
3,AF-P97089-F1,MC21,93,1.5e-35,1,93,1,93,1.0,93.5686
4,AF-P9WJB2-F1,MC24,90,4.5e-26,161,250,2,89,0.977778,89.7296


In [7]:
# Build the export table from the renamed frame.
# Only the five columns selected at the end are kept, so e_value, the raw
# af_seq_* bounds and the MC_hmm_* columns need no separate drop/format step.
df = df.assign(
    # T1 : Merge the two bounds into one string column "<af_seq_start>-<af_seq_end>"
    seq_range=lambda d: d["af_seq_start"].astype(str) + "-" + d["af_seq_end"].astype(str),
    # T2 : Express hmm_coverage as a percentage (x100), rounded to 2 decimals
    # NOTE(review): fillna(0) turns a missing value into a score of 0 — confirm this is intended
    hmm_coverage=lambda d: d["hmm_coverage"].fillna(0).multiply(100).round(2),
    # T3 : Round avg_plddt to 2 decimals (same missing-value handling as above)
    avg_plddt=lambda d: d["avg_plddt"].fillna(0).round(2),
)

# Final dataframe : keep only the exported columns, in output order
df = df[["mcid", "alphafold_prot", "seq_range", "hmm_coverage", "avg_plddt"]]

# Head:
df.head()

Unnamed: 0,mcid,alphafold_prot,seq_range,hmm_coverage,avg_plddt
0,MC1,AF-P0A149-F1,4-181,94.12,96.24
1,MC4,AF-Q04671-F1,770-830,96.83,93.8
2,MC19,AF-A0A1C1CP96-F1,66-135,98.59,83.62
3,MC21,AF-P97089-F1,1-93,100.0,93.57
4,MC24,AF-P9WJB2-F1,161-250,97.78,89.73


In [8]:
# Statistics: summary of the numeric columns, transposed so that each column is one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hmm_coverage,38668.0,89.783183,20.077372,2.42,94.49,98.15,99.33,100.0
avg_plddt,38668.0,84.171784,13.497789,22.39,79.13,88.55,93.88,98.87


In [9]:
# Info: check the final schema (column names, dtypes, non-null counts) before saving
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 38668 entries, 0 to 38667
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mcid            38668 non-null  str    
 1   alphafold_prot  38668 non-null  str    
 2   seq_range       38668 non-null  str    
 3   hmm_coverage    38668 non-null  float64
 4   avg_plddt       38668 non-null  float64
dtypes: float64(2), str(3)
memory usage: 1.5 MB


In [10]:
# Write the final table to disk as CSV, without the index column
path_to_alphafoldDB_csv = os.path.join(target_path, "alphafold_dpcfam_reps.csv")
df.to_csv(path_to_alphafoldDB_csv, index=False)