In [None]:
import os
import pandas as pd
import requests
import matplotlib.pyplot as plt

from pathlib import Path

from pair_prediction.data.dataset import LinkPredictionDataset

# Root folder of the dataset. The default is a machine-specific absolute path,
# so it can be overridden through the PAIR_PREDICTION_DATA_DIR environment
# variable instead of editing the notebook on every machine. When the variable
# is unset, the original location is used unchanged.
DATA_DIR = Path(
    os.environ.get(
        "PAIR_PREDICTION_DATA_DIR",
        "/Users/dawid/Private/School/Master's Thesuis/non-canonical-base-pair-prediction/data/",
    )
)

def read_rfam_mapping(file_path = "Rfam.pdb"):
    """
    Read the tab-separated Rfam mapping file and return it as a DataFrame.

    Args:
        file_path: Location of the tab-separated mapping file. Defaults to
            "Rfam.pdb", resolved against the current working directory.

    Returns:
        A pandas DataFrame with the contents of the file. The `pdb_id`
        column, when present, is always read as text.

    Raises:
        FileNotFoundError: If `file_path` does not point to an existing file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File {file_path} does not exist.")

    # PDB IDs such as "1e10" or "5e54" are valid scientific-notation floats.
    # Force the column to text so dtype inference can never turn them into
    # numbers, which would make string lookups on `pdb_id` silently miss.
    return pd.read_csv(file_path, sep='\t', dtype={'pdb_id': str})

# Load the mapping table once, using the default "Rfam.pdb" path (relative to
# the current working directory). get_rfam_id() reads this module-level table.
MAPPING = read_rfam_mapping()

def get_rfam_id(pdb_id):
    """
    Look up the Rfam accession recorded for a PDB ID.

    The PDB ID is lower-cased and stripped of surrounding whitespace before
    it is compared against the `pdb_id` column of MAPPING. When several rows
    match, the `rfam_acc` of the first matching row is returned.

    Raises:
        ValueError: If no row of MAPPING has the normalized PDB ID.
    """
    key = pdb_id.lower().strip()

    # One boolean mask serves both the existence check and the lookup
    matches = MAPPING.loc[MAPPING['pdb_id'] == key, 'rfam_acc']
    if matches.empty:
        raise ValueError(f"PDB ID {key} not found in mapping.")

    return matches.values[0]

In [None]:
# Build one (sample id, PDB ID, Rfam accession) row per dataset sample.
dataset = LinkPredictionDataset(DATA_DIR)

families = []
for data in dataset:
    # assumes the first four characters of the sample id are its PDB ID — TODO confirm
    pdb_id = data.id[:4]
    try:
        rfam_id = get_rfam_id(pdb_id)
    except ValueError as e:
        # get_rfam_id raises ValueError for a PDB ID missing from the mapping;
        # keep the sample with no family. Any other exception is a real bug
        # and is allowed to propagate instead of being printed once per sample.
        print(f"Error processing {data.id}: {e}")
        rfam_id = None
    families.append((data.id, pdb_id, rfam_id))

df = pd.DataFrame(families, columns=['id', 'pdb_id', 'rfam_id'])

In [None]:
# Resolve every distinct Rfam accession in `df` to the family name returned by
# the Rfam JSON endpoint, then attach it as the `rfam_name` column.
URL = "https://rfam.org/family/{rfam_id}?content-type=application/json"
REQUEST_TIMEOUT_S = 30  # applied to connect and to each read; without it a stalled request blocks forever

mapping = {}
# A single Session reuses the connection across the per-family requests
with requests.Session() as session:
    # dropna() skips samples whose PDB ID had no Rfam accession
    for rfam_id in df['rfam_id'].dropna().unique():
        response = session.get(URL.format(rfam_id=rfam_id), timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        mapping[rfam_id] = response.json()['rfam']['id']
        print(f"Fetched Rfam name for {rfam_id}: {mapping[rfam_id]}")

# Accessions absent from `mapping` (including missing ones) become NaN here
df['rfam_name'] = df['rfam_id'].map(mapping)

In [None]:
# Families must have more than this many samples to keep their own bar;
# everything else is pooled into 'Other family'.
MIN_FAMILY_SIZE = 100

# Normalize labels before counting so whitespace variants of one name are
# counted together. fillna runs first so the column is all strings.
df['rfam_name'] = df['rfam_name'].fillna('Family not found').str.strip()
rfam_counts = df['rfam_name'].value_counts(ascending=True)

# Use other class for minorities
is_large_family = df['rfam_name'].map(rfam_counts) > MIN_FAMILY_SIZE
df['rfam_name'] = df['rfam_name'].where(is_large_family, 'Other family')

rfam_counts = df['rfam_name'].value_counts(ascending=True)

# One bar per category. A histogram over the categorical labels with
# `n_unique - 1` bins puts the two largest categories into the same final bin
# (its right edge is inclusive) and fails outright when only one category exists.
fig, ax = plt.subplots(figsize=(12, 6), dpi=500)
ax.barh(
    rfam_counts.index,
    rfam_counts.values,
    color='purple',
    edgecolor='black',
)
ax.set_title('Distribution of Rfam IDs in Dataset')
ax.set_xlabel('Count')
ax.set_ylabel('Rfam ID')
ax.set_xscale('log')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Write the per-sample table to rfam_mapping.csv inside DATA_DIR, without the
# index column. NOTE(review): if the plotting cell ran first, `rfam_name` here
# holds the pooled labels ('Other family' / 'Family not found') — confirm that
# is the intended content of the saved file.
df.to_csv(DATA_DIR / 'rfam_mapping.csv', index=False)