**Goal** : We aim to explore relevant information from `dpcfam/uniref50_annotated.xml.tar.gz` to desired dataframes.

In [1]:
# 0. Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# 1. Location 
uniref_data = "/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/uniref50_dpcfam_pfam_mapping.csv"

In [3]:
# 2. Load data
df_uniref = pd.read_csv(uniref_data)

In [4]:
# 3. Explore data
df_uniref.head()

Unnamed: 0,Uniref50_id,UniProtKB_ID,UniProtKB_accession,length,dpcfam_ids,dpcfam_ranges,pfam_ids,pfam_ranges
0,UniRef50_Q8WZ42-5,Q8WZ42-5,Q8WZ42-5,32900,MC392407;MC109156;MC435339;MC342152;MC205449;M...,9481-9542;7361-7581;22113-22225;463-877;30726-...,,
1,UniRef50_Q3ASY8,Q3ASY8_CHLCH,Q3ASY8,36805,MC88908;MC88908;MC88908;MC88908;MC88908;MC8890...,2219-2303;10892-10977;10520-10601;6648-6735;10...,,
2,UniRef50_G5B0U1,G5B0U1_HETGA,G5B0U1,36507,MC392407;MC109156;MC435339;MC342152;MC205449;M...,11207-11268;9273-9489;25109-25228;13519-13799;...,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,15159-15237;11109-11197;10241-10328;2348-2427;...
3,UniRef50_Q8WZ42,TITIN_HUMAN,Q9Y6L9,34350,MC392407;MC109156;MC435339;MC342152;MC205449;M...,9481-9542;7438-7662;23564-23658;461-744;32176-...,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,24163-24243;8514-8602;7478-7568;4675-4753;2447...
4,UniRef50_K7EE71,K7EE71_ORNAN,K7EE71,7472,MC109156;MC435339;MC24649;MC24649;MC24649;MC24...,3642-3792;3891-3983;2686-2888;1592-1808;5922-6...,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,6959-7041;68-149;5477-5558;2230-2312;1549-1627...


Let's split this dataframe into 3 useful dataframes to be mapped to tables in PostgreSQL database.

**1. Uniref50 proteins**

In [5]:
# 1.poteins:
df_proteins =  df_uniref[["Uniref50_id", "UniProtKB_ID", "UniProtKB_accession", "length"]]
# some transformtions to be done on this dataframe:
# T1 : In column Uniref50_id, we consider only id part and remove the prefix "UniRef50_"
df_proteins["Uniref50_id"] = df_proteins["Uniref50_id"].apply(lambda x: x.split("UniRef50_")[1])    
# T2  : We rename columns to python standard naming convention
df_proteins.rename(columns={
    "Uniref50_id": "uniref50_id", 
    "UniProtKB_ID": "uniprotkb_id", 
    "UniProtKB_accession": "uniprotkb_accession", 
    "length": "length"}, inplace=True)
# How it looks like now
df_proteins.head()

Unnamed: 0,uniref50_id,uniprotkb_id,uniprotkb_accession,length
0,Q8WZ42-5,Q8WZ42-5,Q8WZ42-5,32900
1,Q3ASY8,Q3ASY8_CHLCH,Q3ASY8,36805
2,G5B0U1,G5B0U1_HETGA,G5B0U1,36507
3,Q8WZ42,TITIN_HUMAN,Q9Y6L9,34350
4,K7EE71,K7EE71_ORNAN,K7EE71,7472


In [6]:
# Info about the dataframe
df_proteins.info()

<class 'pandas.DataFrame'>
RangeIndex: 23531980 entries, 0 to 23531979
Data columns (total 4 columns):
 #   Column               Dtype
---  ------               -----
 0   uniref50_id          str  
 1   uniprotkb_id         str  
 2   uniprotkb_accession  str  
 3   length               int64
dtypes: int64(1), str(3)
memory usage: 718.1 MB


In [7]:
# Save the dataframe
df_proteins.to_csv("/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/uniref50_proteins.csv", index=False)

**2. Uniref50 proteins - DPCFam metaclusters**

In [8]:
# We extract releavant columns from the original dataframe
df_dpcfam = df_uniref[["Uniref50_id", "dpcfam_ids", "dpcfam_ranges"]]
df_dpcfam.head()

Unnamed: 0,Uniref50_id,dpcfam_ids,dpcfam_ranges
0,UniRef50_Q8WZ42-5,MC392407;MC109156;MC435339;MC342152;MC205449;M...,9481-9542;7361-7581;22113-22225;463-877;30726-...
1,UniRef50_Q3ASY8,MC88908;MC88908;MC88908;MC88908;MC88908;MC8890...,2219-2303;10892-10977;10520-10601;6648-6735;10...
2,UniRef50_G5B0U1,MC392407;MC109156;MC435339;MC342152;MC205449;M...,11207-11268;9273-9489;25109-25228;13519-13799;...
3,UniRef50_Q8WZ42,MC392407;MC109156;MC435339;MC342152;MC205449;M...,9481-9542;7438-7662;23564-23658;461-744;32176-...
4,UniRef50_K7EE71,MC109156;MC435339;MC24649;MC24649;MC24649;MC24...,3642-3792;3891-3983;2686-2888;1592-1808;5922-6...


In [9]:
# We rename Uniref50_id to uniref50_id and clean it to keep only the ID part
df_dpcfam_long = df_dpcfam.copy()
df_dpcfam_long.rename(columns={"Uniref50_id": "uniref50_id"}, inplace=True)
df_dpcfam_long["uniref50_id"] = df_dpcfam_long["uniref50_id"].str.replace("UniRef50_", "", regex=False)

# Split and explode the dpcfam_ids and dpcfam_ranges columns to create one row per domain
df_dpcfam_long["dpcfam_ids"] = df_dpcfam_long["dpcfam_ids"].str.split(";")
df_dpcfam_long["dpcfam_ranges"] = df_dpcfam_long["dpcfam_ranges"].str.split(";")

# We can explode multiple columns simultaneously since pandas 1.3.0
df_dpcfam_long = df_dpcfam_long.explode(["dpcfam_ids", "dpcfam_ranges"])

# We can have empty strings in dpcfam_ids and dpcfam_ranges if there were trailing semicolons. We filter those out
df_dpcfam_long = df_dpcfam_long[df_dpcfam_long["dpcfam_ids"] != ""]

df_dpcfam_long.head()

Unnamed: 0,uniref50_id,dpcfam_ids,dpcfam_ranges
0,Q8WZ42-5,MC392407,9481-9542
0,Q8WZ42-5,MC109156,7361-7581
0,Q8WZ42-5,MC435339,22113-22225
0,Q8WZ42-5,MC342152,463-877
0,Q8WZ42-5,MC205449,30726-30976


In [10]:
# How many unique DPCFam domains do we have?
unique_dpcfam_domains = df_dpcfam_long["dpcfam_ids"].nunique
print(f"Number of unique DPCFam domains: {unique_dpcfam_domains}")

Number of unique DPCFam domains: <bound method IndexOpsMixin.nunique of 0           MC392407
0           MC109156
0           MC435339
0           MC342152
0           MC205449
              ...   
23531979    MC433447
23531979     MC44681
23531979     MC44681
23531979    MC360938
23531979    MC360938
Name: dpcfam_ids, Length: 71233185, dtype: str>


In [11]:
# info about the dataframe
df_dpcfam_long.info()

<class 'pandas.DataFrame'>
Index: 71233185 entries, 0 to 23531979
Data columns (total 3 columns):
 #   Column         Dtype
---  ------         -----
 0   uniref50_id    str  
 1   dpcfam_ids     str  
 2   dpcfam_ranges  str  
dtypes: str(3)
memory usage: 2.1 GB


In [12]:
# Load another dataframe containing DPCFam final MCIDs
valid_mcids = pd.read_csv("/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/dpcfam_standard_merged_properties.csv")
valid_mcids.head()

Unnamed: 0,mcid,size_uniref50,avg_len,std_avg_len,lc_percent,cc_percent,dis_percent,tm,pfam_da,da_percent,size_pfam,avg_ov_percent,overlap_label
0,MC1,17931,185.68,28.77,4.72,0.0,18.44,0.01,PF13614,44.23,6332,80.82,equivalent
1,MC4,617,59.91,6.07,4.99,0.0,1.87,1.26,PF03600,62.84,345,7.54,shifted
2,MC15,139,81.21,5.05,4.96,0.18,13.81,0.03,UNKNOWN,0.0,131,0.0,NONE
3,MC19,120,71.57,7.7,8.85,2.25,4.86,1.69,PF11915,94.07,111,13.66,shifted
4,MC21,937,91.2,7.7,2.65,0.0,24.38,0.0,PF01012,98.9,809,34.52,shifted


In [13]:
# Now lets filter our dpcfam_long dataframe to keep only rows where the dpcfam_id is in the valid_mcids dataframe[mcid column]
valid_dpcfam_long = df_dpcfam_long[df_dpcfam_long["dpcfam_ids"].isin(valid_mcids["mcid"])]
valid_dpcfam_long.head()

Unnamed: 0,uniref50_id,dpcfam_ids,dpcfam_ranges
0,Q8WZ42-5,MC392407,9481-9542
0,Q8WZ42-5,MC109156,7361-7581
0,Q8WZ42-5,MC435339,22113-22225
0,Q8WZ42-5,MC342152,463-877
0,Q8WZ42-5,MC205449,30726-30976


In [14]:
# Info
valid_dpcfam_long.info()

<class 'pandas.DataFrame'>
Index: 60749838 entries, 0 to 23531979
Data columns (total 3 columns):
 #   Column         Dtype
---  ------         -----
 0   uniref50_id    str  
 1   dpcfam_ids     str  
 2   dpcfam_ranges  str  
dtypes: str(3)
memory usage: 1.8 GB


In [20]:
# Remove all rows where dpcfam_ids is NaN or empty string
valid_dpcfam_long = valid_dpcfam_long[valid_dpcfam_long["dpcfam_ids"].notna() & (valid_dpcfam_long["dpcfam_ids"] != "")]
valid_dpcfam_long.info()

<class 'pandas.DataFrame'>
Index: 60749838 entries, 0 to 23531979
Data columns (total 3 columns):
 #   Column         Dtype
---  ------         -----
 0   uniref50_id    str  
 1   dpcfam_ids     str  
 2   dpcfam_ranges  str  
dtypes: str(3)
memory usage: 1.8 GB


In [21]:
# Save the dataframe
valid_dpcfam_long.to_csv("/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/uniref50_dpcfam_valid.csv", index=False)

**3. Uniref50 - Pfam families**

In [22]:
# We extract relevant columns from the original dataframe
df_pfam = df_uniref[["Uniref50_id", "pfam_ids", "pfam_ranges"]]
df_pfam.head()

Unnamed: 0,Uniref50_id,pfam_ids,pfam_ranges
0,UniRef50_Q8WZ42-5,,
1,UniRef50_Q3ASY8,,
2,UniRef50_G5B0U1,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,15159-15237;11109-11197;10241-10328;2348-2427;...
3,UniRef50_Q8WZ42,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,24163-24243;8514-8602;7478-7568;4675-4753;2447...
4,UniRef50_K7EE71,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,6959-7041;68-149;5477-5558;2230-2312;1549-1627...


In [23]:
df_pfam.info()

<class 'pandas.DataFrame'>
RangeIndex: 23531980 entries, 0 to 23531979
Data columns (total 3 columns):
 #   Column       Dtype
---  ------       -----
 0   Uniref50_id  str  
 1   pfam_ids     str  
 2   pfam_ranges  str  
dtypes: str(3)
memory usage: 538.6 MB


In [24]:
# We remove all rows where pfam_ids is NaN or empty string
df_pfam = df_pfam[df_pfam["pfam_ids"].notna() & (df_pfam["pfam_ids"] != "")]
df_pfam.head()

Unnamed: 0,Uniref50_id,pfam_ids,pfam_ranges
2,UniRef50_G5B0U1,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,15159-15237;11109-11197;10241-10328;2348-2427;...
3,UniRef50_Q8WZ42,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,24163-24243;8514-8602;7478-7568;4675-4753;2447...
4,UniRef50_K7EE71,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,6959-7041;68-149;5477-5558;2230-2312;1549-1627...
5,UniRef50_W5MH34,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,21505-21586;10343-10422;20426-20504;4530-4619;...
6,UniRef50_G1P5X9,PF07679;PF07679;PF07679;PF07679;PF07679;PF0767...,20262-20342;22633-22713;21946-22027;34291-3436...


In [25]:
# info
df_pfam.info()

<class 'pandas.DataFrame'>
Index: 9315206 entries, 2 to 23531976
Data columns (total 3 columns):
 #   Column       Dtype
---  ------       -----
 0   Uniref50_id  str  
 1   pfam_ids     str  
 2   pfam_ranges  str  
dtypes: str(3)
memory usage: 284.3 MB


In [26]:
# We rename Uniref50_id to uniref50_id and clean it to keep only the ID part
df_pfam_long = df_pfam.copy()
df_pfam_long.rename(columns={"Uniref50_id": "uniref50_id"}, inplace=True)
df_pfam_long["uniref50_id"] = df_pfam_long["uniref50_id"].str.replace("UniRef50_", "", regex=False)

# Split and explode the pfam_ids and pfam_ranges columns to create one row per domain
df_pfam_long["pfam_ids"] = df_pfam_long["pfam_ids"].str.split(";")
df_pfam_long["pfam_ranges"] = df_pfam_long["pfam_ranges"].str.split(";")

# We can explode multiple columns simultaneously since pandas 1.3.0
df_pfam_long = df_pfam_long.explode(["pfam_ids", "pfam_ranges"])

# We can have empty strings in pfam_ids and pfam_ranges if there were trailing semicolons. We filter those out
df_pfam_long = df_pfam_long[df_pfam_long["pfam_ids"] != ""]

df_pfam_long.head() 

Unnamed: 0,uniref50_id,pfam_ids,pfam_ranges
2,G5B0U1,PF07679,15159-15237
2,G5B0U1,PF07679,11109-11197
2,G5B0U1,PF07679,10241-10328
2,G5B0U1,PF07679,2348-2427
2,G5B0U1,PF07679,31048-31127


In [27]:
# Infos
df_pfam_long.info()

<class 'pandas.DataFrame'>
Index: 14554301 entries, 2 to 23531976
Data columns (total 3 columns):
 #   Column       Dtype
---  ------       -----
 0   uniref50_id  str  
 1   pfam_ids     str  
 2   pfam_ranges  str  
dtypes: str(3)
memory usage: 444.2 MB


In [29]:
# Save the dataframe
df_pfam_long.to_csv("/u/mdmc/enyanduk/internship_areasciencepark/Dataframes/DPCFam/uniref50_pfam_valid.csv", index=False)