**Goal**: Merge all CSV files related to DPCFam into a single CSV file.

In [1]:
# 0. Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# 1. Input locations and loading
# Directory that holds every DPCFam CSV file used below
root_path = "/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/"

# Full path of each input file, numbered in the order the frames are loaded
mc_list = root_path + "mclist.csv"                              # 1. MC list
sequence_information = root_path + "sequence_information.csv"  # 2. Sequence information
pfam_comparison = root_path + "pfam_comparison.csv"            # 3. Pfam comparison
lc_regions = root_path + "lcregions.csv"                        # 4. LC regions
cc_regions = root_path + "ccregions.csv"                        # 5. CC regions
dis_regions = root_path + "disregions.csv"                      # 6. DIS regions
tm_regions = root_path + "tmregions.csv"                        # 7. TM regions

# Read the files one after another; dfN holds the content of file number N above
df1, df2, df3, df4, df5, df6, df7 = [
    pd.read_csv(csv_path)
    for csv_path in (
        mc_list,
        sequence_information,
        pfam_comparison,
        lc_regions,
        cc_regions,
        dis_regions,
        tm_regions,
    )
]

In [3]:
# All loaded tables, kept in loading order (df1 = MC list ... df7 = TM regions)
dfs = [
    df1,
    df2,
    df3,
    df4,
    df5,
    df6,
    df7,
]

In [4]:
# Merge all dataframes :
# Step 1 - index every table by its MCID column so rows can be aligned on it
indexed_frames = [table.set_index("MCID") for table in dfs]

# Step 2 - place the tables side by side; the outer join keeps all MCIDs
merged_on_mcid = pd.concat(indexed_frames, axis=1, join="outer")

# Step 3 - turn MCID back into a regular column
df = merged_on_mcid.reset_index()

In [5]:
# Preview the first rows of the merged frame to check the combined columns
df.head()

Unnamed: 0,MCID,seed_size,average_lenght,std_average_length,DA,DAC,%DA,%DAC,%DACF,%DACFA,AvOv,fred,fext,LABEL,pfam_seqs,%LC,%CC,%DIS,TM
0,1,17931,185.681,28.7691,PF13614,CCL0023,0.442271,0.855137,0.989872,0.999022,0.808203,0.058723,0.134096,equivalent,6332,0.047194,0.0,0.184394,0.0098
1,4,617,59.9109,6.0669,PF03600,CCL0182,0.628415,0.704918,0.970856,1.0,0.075365,0.917423,0.699776,shifted,345,0.049879,0.0,0.018654,1.2619
2,15,139,81.2086,5.0515,UNK,UNK,,,,,,,,,131,0.04956,0.001799,0.138066,0.029126
3,19,120,71.5667,7.69711,PF11915,PF11915,0.940678,0.940678,1.0,1.0,0.136637,0.774788,0.722817,shifted,111,0.088535,0.022461,0.048627,1.68966
4,21,937,91.1974,7.69776,PF01012,CCL0039,0.988998,0.988998,1.0,1.0,0.345184,0.591957,0.278153,shifted,809,0.026462,0.0,0.243835,0.0


In [6]:
# Meaningful transformations in the dataframe
# Guard: this cell rebinds `df`. Running it a second time on its own output would
# prefix the IDs again ("MCMC1") and multiply the percentage columns by 100 twice,
# so refuse to run when the raw merged columns are no longer present.
if "seed_size" not in df.columns:
    raise RuntimeError(
        "`df` has already been transformed; re-run the merge cell before running this cell again."
    )

percent_cols = ["%DA", "%LC", "%CC", "%DIS"]
rounded_cols = ["Avg.Len", "TM"]
final_cols = [
    "MCID", "Size Uni50", "Avg.Len", "%LC", "%CC", "%DIS", "TM",
    "Size Pfam", "PFam DA", "%DA", "Overlap Label",
]

# T2/T3 : Rename seed_size -> Size Uni50, average_lenght -> Avg.Len, DA -> PFam DA,
# pfam_seqs -> Size Pfam, LABEL -> Overlap Label.
# rename() returns a new frame, so the merged `df` is left untouched until the end.
transformed = df.rename(
    columns={
        "seed_size": "Size Uni50",
        "average_lenght": "Avg.Len",
        "DA": "PFam DA",
        "pfam_seqs": "Size Pfam",
        "LABEL": "Overlap Label",
    }
)
# T1 : Rewrite each ID in the MCID column with an "MC" prefix, e.g. 1 -> MC1
transformed["MCID"] = "MC" + transformed["MCID"].astype(str)
# T4 : In columns (%DA, %LC, %CC, %DIS), turn fractions into percentages rounded to
# 2 decimals. Missing values are replaced by 0 first, so they end up as 0.0.
transformed[percent_cols] = transformed[percent_cols].fillna(0).multiply(100).round(2)
# T5 : Round columns Avg.Len and TM to 2 decimals
transformed[rounded_cols] = transformed[rounded_cols].round(2)
# T7 : In column PFam DA replace UNK by UNKNOWN; in Overlap Label replace NaN by "NONE"
transformed["PFam DA"] = transformed["PFam DA"].replace("UNK", "UNKNOWN")
transformed["Overlap Label"] = transformed["Overlap Label"].fillna("NONE")
# T6 : Drop the remaining columns and return the dataframe organised as
# (MCID, Size Uni50, Avg.Len, %LC, %CC, %DIS, TM, Size Pfam, PFam DA, %DA, Overlap Label).
# The explicit copy makes `df` an independent frame rather than a slice.
df = transformed[final_cols].copy()
df.head()

Unnamed: 0,MCID,Size Uni50,Avg.Len,%LC,%CC,%DIS,TM,Size Pfam,PFam DA,%DA,Overlap Label
0,MC1,17931,185.68,4.72,0.0,18.44,0.01,6332,PF13614,44.23,equivalent
1,MC4,617,59.91,4.99,0.0,1.87,1.26,345,PF03600,62.84,shifted
2,MC15,139,81.21,4.96,0.18,13.81,0.03,131,UNKNOWN,0.0,NONE
3,MC19,120,71.57,8.85,2.25,4.86,1.69,111,PF11915,94.07,shifted
4,MC21,937,91.2,2.65,0.0,24.38,0.0,809,PF01012,98.9,shifted


In [7]:
# Infos: per-column dtype and non-null count of the final table
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 46828 entries, 0 to 46827
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MCID           46828 non-null  str    
 1   Size Uni50     46828 non-null  int64  
 2   Avg.Len        46828 non-null  float64
 3   %LC            46828 non-null  float64
 4   %CC            46828 non-null  float64
 5   %DIS           46828 non-null  float64
 6   TM             46828 non-null  float64
 7   Size Pfam      46828 non-null  int64  
 8   PFam DA        46828 non-null  str    
 9   %DA            46828 non-null  float64
 10  Overlap Label  46828 non-null  str    
dtypes: float64(6), int64(2), str(3)
memory usage: 3.9 MB


In [8]:
# Statistics: summary statistics of the final table, transposed so that
# each described column is shown as one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Size Uni50,46828.0,328.904758,2146.797795,50.0,69.0,104.0,196.0,131827.0
Avg.Len,46828.0,175.568361,162.883216,50.0,79.41,120.96,208.325,3150.42
%LC,46828.0,4.425717,4.629315,0.0,1.51,2.85,5.6,49.97
%CC,46828.0,1.580738,7.39121,0.0,0.0,0.0,0.02,86.34
%DIS,46828.0,24.050744,14.235749,0.09,14.89,21.82,30.32,92.85
TM,46828.0,0.348141,1.16327,0.0,0.0,0.0,0.04,35.25
Size Pfam,46828.0,188.447339,1090.696865,1.0,37.0,62.0,123.0,90231.0
%DA,46828.0,46.016375,43.294262,0.0,0.0,43.9,94.44,100.0


In [9]:
# Save to csv
# The final table is written into root_path; index=False leaves the
# row index out of the file.
final_target = root_path + "dpcfam_standard_merged_properties.csv"
df.to_csv(final_target, index=False)