In [1]:
import pandas as pd
import textwrap
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import time

In [2]:
# Load the downloaded REFOLD DB dump as a pandas DataFrame

df = pd.read_table("REFOLD_161028.txt")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1875 entries, 0 to 1874
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   pubmed_id                   1841 non-null   float64
 1   title                       1841 non-null   object 
 2   abstract                    1839 non-null   object 
 3   date                        1841 non-null   object 
 4   author                      1841 non-null   object 
 5   journal                     1841 non-null   object 
 6   name                        1875 non-null   object 
 7   aaseq                       1852 non-null   object 
 8   comment                     1175 non-null   object 
 9   uniprot_id                  1788 non-null   object 
 10  function                    640 non-null    object 
 11  domain                      532 non-null    object 
 12  ph                          1804 non-null   float64
 13  temperature                 1708 

In [20]:
# Show the first record (in row order) whose "date" field is missing, if any.
first_undated = next(
    (record for _, record in df.iterrows() if pd.isna(record["date"])),
    None,
)
if first_undated is not None:
    print(first_undated)

pubmed_id                                                                   NaN
title                                                                       NaN
abstract                                                                    NaN
date                                                                        NaN
author                                                                      NaN
journal                                                                     NaN
name                                   Major histocompatibility complex class I
aaseq                         MEPSLLSLFVLGVVALTETRAGSHSLRYFDTAMSRPELGDSQFISV...
comment                                                                     NaN
uniprot_id                                                               Q9TPK7
function                                                                    NaN
domain                                                                      NaN
ph                                      

It looks like the newer DB is missing some important fields, such as the solvent used in the experiments. For the older records (before 2009), the information could be scraped from https://pford.info/refolddatabase/. But for the newer records, it might be necessary to explore the papers.

In [None]:
# Pick one record at random and print every field to see what it looks like

record = df.sample(1).iloc[0]
separator = "*"*80

for field, value in record.items():
    # Long values (abstracts, sequences) are wrapped at 80 characters
    wrapped = textwrap.fill(str(value), width=80)
    print(f"**{field}**: {wrapped}")
    print(separator)

As the records don't even contain the link to the Monash University REFOLD database, we will have to match them ourselves. First we scrape the Monash University REFOLD database to get the list of records.



In [None]:
# First we gather the column names for our DB using a single record
r = requests.get("https://pford.info/refolddatabase/refoldingrecord/5/", timeout=30)
r.raise_for_status()  # fail loudly instead of parsing an error page

soup = BeautifulSoup(r.text, "html.parser")
header = ""
col_names = []

for th in soup.find_all("th"):
    # BeautifulSoup returns the class attribute as a list, or None when the tag
    # has no class at all; fall back to an empty list so such cells don't raise.
    th_classes = th.get("class") or []
    if th_classes[:1] == ["detail_header"]:
        # Section header: keep only the text placed directly inside the <th>
        header = th.find(string=True, recursive=False).strip()
    else:
        # Field header: qualify it with the name of the current section
        col_names.append(header + "." + th.text.replace(" ","_"))

In [None]:
def gatherrecord(url):
    """Download one record page and return the text of its table cells.

    Parameters
    ----------
    url : str
        Address of the record page to download.

    Returns
    -------
    list of str
        Text of every ``<td>`` found inside a ``<tr>``, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    return [cell.text for row in soup.find_all("tr") for cell in row.find_all("td")]


def analyze_page(url):
    """Scrape every record linked from one listing page.

    Each ``<tr>`` of the page that contains a link is followed: the link's
    ``href`` is appended to the record base URL and the resulting page is
    scraped with ``gatherrecord``. Rows without a link (the headers) are skipped.

    Parameters
    ----------
    url : str
        Address of the listing page.

    Returns
    -------
    list of list of str
        One list of cell texts per linked record, in page order.

    Raises
    ------
    requests.HTTPError
        If the listing page answers with a 4xx/5xx status.
    requests.Timeout
        If the listing page does not answer within 30 seconds.
    """
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    page_data = []
    for row in tqdm(soup.find_all('tr'), unit="record"):
        link = row.find("a")
        if link is None:  # Deal with the headers
            continue
        href = link.get("href")
        if not href:  # anchor without a target: nothing to follow
            continue
        # Use a dedicated name so the `url` argument is not overwritten
        record_url = "https://pford.info/refolddatabase/refoldingrecord/" + href
        page_data.append(gatherrecord(record_url))

    return page_data

In [None]:
# Next we explore the full website

LISTING_URL = "https://pford.info/refolddatabase/refoldingrecord"
# Number of "Next" pages followed after the start page. Every page costs one
# HTTP request per record, so the crawl is capped by default; set to None to
# walk the whole site.
MAX_NEXT_PAGES = 1

full_data = []

# We start from the start page and do it manually (as we need to have a next page)
start_time = time.time()
url = LISTING_URL
# The copy fetched here is only used to locate the "Next" link;
# analyze_page downloads the page again to scrape its records.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
page_data = analyze_page(url)

full_data.extend(page_data)
end_time = time.time()
elapsed_time = end_time - start_time
print("It took "+ str(elapsed_time) + " to deal with the page")


# We loop while we have a next page (and the page cap is not reached)
pages_followed = 0
next_link = soup.find("a", string="Next")
while next_link is not None and (MAX_NEXT_PAGES is None or pages_followed < MAX_NEXT_PAGES):
    start_time = time.time()

    next_href = next_link.get("href")
    url = LISTING_URL + next_href
    # NOTE(review): "/47" is hard-coded - presumably the total number of pages; confirm
    print("Dealing with : "+next_href + "/47")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    page_data = analyze_page(url)
    full_data.extend(page_data)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print("It took "+ str(elapsed_time) + " to deal with the page")

    pages_followed += 1
    next_link = soup.find("a", string="Next")

In [None]:
# Build the table before previewing it: the preview needs `full_df` to exist,
# so both statements live in one cell and run in this order on a fresh kernel.
full_df = pd.DataFrame(full_data, columns=col_names)

# First scraped record, one field per line
full_df.iloc[0]

In [None]:


# Scrape the first listing page: one entry per table row, holding the text of
# each cell followed by the URL of the record the row links to.
url = "https://pford.info/refolddatabase/refoldingrecord"
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(r.text, "html.parser")
data_columns = [th.text.strip() for th in soup.find_all('th')]
data_columns.append("url")
data = []
for row in soup.find_all('tr'):
    row_data = []
    # Reset for every row: otherwise a row without a link would silently
    # inherit the URL of the previous row (or the listing URL itself).
    record_url = None
    for cell in row.find_all("td"):
        link = cell.find("a")
        if link is not None and link.get("href"):
            record_url = "https://pford.info/refolddatabase/refoldingrecord/" + link.get("href")
        row_data.append(cell.text)
    row_data.append(record_url)
    data.append(row_data)

# remove the first entry as it is the headers
data = data[1:]



In [None]:

# Follow the "Next" links of the listing and append their rows to `data`.
# The walk is capped by default; set MAX_LISTING_PAGES to None to follow
# every page until no "Next" link is left.
MAX_LISTING_PAGES = 1

pages_followed = 0
next_link = soup.find("a", string="Next")
while next_link is not None and (MAX_LISTING_PAGES is None or pages_followed < MAX_LISTING_PAGES):
    url = "https://pford.info/refolddatabase/refoldingrecord"+next_link.get("href")
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # fail loudly instead of parsing an error page
    sub_data = []
    soup = BeautifulSoup(r.text, "html.parser")
    for row in soup.find_all('tr'):
        row_data = []
        # Reset for every row: otherwise a row without a link would silently
        # inherit the URL of the previous row (or the page URL itself).
        record_url = None
        for cell in row.find_all("td"):
            link = cell.find("a")
            if link is not None and link.get("href"):
                record_url = "https://pford.info/refolddatabase/refoldingrecord/" + link.get("href")
            row_data.append(cell.text)
        row_data.append(record_url)
        sub_data.append(row_data)


    # remove the first entry as it is the headers
    sub_data = sub_data[1:]

    data.extend(sub_data)
    pages_followed += 1
    next_link = soup.find("a", string="Next")

Monash_df = pd.DataFrame(data, columns=data_columns)








In [None]:
# Text placed directly inside the first <th> of the current page (nested tags are not searched)
first_th = soup.find_all("th")[0]
first_th.find(string=True, recursive=False).strip()

In [None]:
# Scrape one record page: qualified field names go to `col_names`, cell texts to `data`
r = requests.get("https://pford.info/refolddatabase/refoldingrecord/5/", timeout=30)
r.raise_for_status()  # fail loudly instead of parsing an error page

soup = BeautifulSoup(r.text, "html.parser")
header = ""
col_names = []

for th in soup.find_all("th"):
    # BeautifulSoup returns the class attribute as a list, or None when the tag
    # has no class at all; fall back to an empty list so such cells don't raise.
    th_classes = th.get("class") or []
    if th_classes[:1] == ["detail_header"]:
        # Section header: keep only the text placed directly inside the <th>
        header = th.find(string=True, recursive=False).strip()
    else:
        # Field header: qualify it with the name of the current section
        col_names.append(header + "." + th.text.replace(" ","_"))


# Text of every <td> of every table row, in document order
data = [cell.text for row in soup.find_all('tr') for cell in row.find_all("td")]


In [None]:
# Number of field names currently held in `col_names`
len(col_names)

In [None]:

# counts per uniprot_id (including NaN)
counts = df['uniprot_id'].value_counts(dropna=False)
print(counts.head(20))

# summary stats
total = len(df)
unique = df['uniprot_id'].nunique(dropna=True)
n_missing = df['uniprot_id'].isna().sum()
# `counts` has a NaN bucket for the rows without an ID; leave it out so the
# missing values are not reported as one more duplicated ID.
n_duplicated_ids = (counts[counts.index.notna()] > 1).sum()

print(f"total rows: {total}")
print(f"unique uniprot_id (non-null): {unique}")
print(f"missing uniprot_id: {n_missing}")
print(f"uniprot_ids appearing more than once: {n_duplicated_ids}")