**Goal** : Convert `pfam_comparison.txt` to `pfam_comparison.csv`

In [1]:
# 0. Imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# 1. File locations
# Input: the text file to be converted.
source_path = (
    "/u/mdmc/enyanduk/internship_areasciencepark/Data/dpcfam/dpcfam_standard"
    "/zenodo_unzipped_folders/metaclusters_xml/mc_info_from_xml"
    "/pfam_comparison.txt"
)
# Output: directory that receives the CSV (note the trailing slash).
target_path = "/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/"

In [3]:
# Peek at the raw first line (shown via repr so tabs/newlines are visible)
# to find out which delimiter the file uses.
with open(source_path, encoding="utf-8") as raw_file:
    print(f"{raw_file.readline()!r}")

'#MC DA DAC %DA %DAC %DACF %DACFA AvOv fred fext LABEL pfam_seqs\n'


`Observation`: The file is not tab-separated; its columns are separated by spaces.

In [4]:
# Load the text file into a DataFrame, splitting columns on runs of whitespace
df = pd.read_csv(filepath_or_buffer=source_path, sep=r"\s+")
df.head(n=5)

Unnamed: 0,#MC,DA,DAC,%DA,%DAC,%DACF,%DACFA,AvOv,fred,fext,LABEL,pfam_seqs
0,1,PF13614,CCL0023,0.442271,0.855137,0.989872,0.999022,0.808203,0.058723,0.134096,equivalent,6332
1,4,PF03600,CCL0182,0.628415,0.704918,0.970856,1.0,0.075365,0.917423,0.699776,shifted,345
2,15,UNK,UNK,,,,,,,,,131
3,19,PF11915,PF11915,0.940678,0.940678,1.0,1.0,0.136637,0.774788,0.722817,shifted,111
4,21,PF01012,CCL0039,0.988998,0.988998,1.0,1.0,0.345184,0.591957,0.278153,shifted,809


In [5]:
# Overview of the frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 46828 entries, 0 to 46827
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   #MC        46828 non-null  int64  
 1   DA         46828 non-null  str    
 2   DAC        46828 non-null  str    
 3   %DA        32638 non-null  float64
 4   %DAC       32638 non-null  float64
 5   %DACF      32638 non-null  float64
 6   %DACFA     32638 non-null  float64
 7   AvOv       32638 non-null  float64
 8   fred       32638 non-null  float64
 9   fext       32638 non-null  float64
 10  LABEL      32638 non-null  str    
 11  pfam_seqs  46828 non-null  int64  
dtypes: float64(7), int64(2), str(3)
memory usage: 4.3 MB


In [6]:
# Rename the "#MC" column to "MCID" (drops the leading '#' from the header)
# and make sure the column is stored as an integer.
df = df.rename(columns={"#MC": "MCID"}).astype({"MCID": int})
df.head(n=5)

Unnamed: 0,MCID,DA,DAC,%DA,%DAC,%DACF,%DACFA,AvOv,fred,fext,LABEL,pfam_seqs
0,1,PF13614,CCL0023,0.442271,0.855137,0.989872,0.999022,0.808203,0.058723,0.134096,equivalent,6332
1,4,PF03600,CCL0182,0.628415,0.704918,0.970856,1.0,0.075365,0.917423,0.699776,shifted,345
2,15,UNK,UNK,,,,,,,,,131
3,19,PF11915,PF11915,0.940678,0.940678,1.0,1.0,0.136637,0.774788,0.722817,shifted,111
4,21,PF01012,CCL0039,0.988998,0.988998,1.0,1.0,0.345184,0.591957,0.278153,shifted,809


In [7]:
# Summary statistics of the numeric columns, transposed to one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MCID,46828.0,219773.410182,146629.434003,1.0,89277.25,204258.5,341567.5,504487.0
%DA,32638.0,0.660229,0.36992,0.000298,0.32967,0.848947,0.976378,1.0
%DAC,32638.0,0.668539,0.368022,0.000298,0.353243,0.861111,0.977273,1.0
%DACF,32638.0,0.968146,0.083224,0.124533,0.981132,1.0,1.0,1.0
%DACFA,32638.0,0.995486,0.023713,0.322124,1.0,1.0,1.0,1.0
AvOv,32638.0,0.43117,0.290307,0.001387,0.192232,0.364685,0.688475,0.995589
fred,32638.0,0.39229,0.340314,0.0,0.021618,0.420745,0.707163,0.996639
fext,32638.0,0.349807,0.330001,0.0,0.045691,0.219642,0.659183,0.996865
pfam_seqs,46828.0,188.447339,1090.696865,1.0,37.0,62.0,123.0,90231.0


In [8]:
# Save the DataFrame as CSV (the index is not written).
# os.path.join builds the file path correctly whether or not target_path ends
# with a path separator, and the output directory is created first so the
# write does not fail when it is missing.
os.makedirs(target_path, exist_ok=True)
final_target = os.path.join(target_path, "pfam_comparison.csv")
df.to_csv(final_target, index=False)