In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:
protein_seeds = ['3M5L', '1YPH']

## Define a function for extracting info from every row (\<tr> component)

In [3]:
def extract_values_from_row(row):
    row_html = row.get_attribute('outerHTML')
    row_bsoup = BeautifulSoup(row_html, 'html.parser')
    
    # extract string from each <td> component (each <td> = a column value)
    name = row_bsoup.find('td', {'class': 'c2'}).string
    description = row_bsoup.find('td', {'class': 'c3'}).string
    taxonomy = row_bsoup.find('td', {'class': 'c31'}).string
    aligned_protein = row_bsoup.find('td', {'class': 'cx cx1'}).string
    rmsd = row_bsoup.find('td', {'class': 'cx cx2'}).string
    aligned_residues = row_bsoup.find('td', {'class': 'cx cx3 vtable-sorted'}).string
    seq_id = row_bsoup.find('td', {'class': 'cx cx4'}).string
    
    return name, description, taxonomy, aligned_protein, rmsd, aligned_residues, seq_id
    

## Create webdriver instance and connect to URL

In [4]:
driver = webdriver.Firefox()

## Find total amount of rows in search query, and change table length (delete soon)

In [81]:
row_count_elem = driver.find_element_by_id("page_total_rows")
row_count = row_count_elem.get_attribute('innerHTML')

In [82]:
search_bar = driver.find_element_by_name("vtable_length")

In [84]:
search_bar.clear()
search_bar.send_keys(row_count)
search_bar.send_keys(Keys.RETURN)

## Now iterate over all protein entries in the table

In [85]:
proteins = driver.find_elements_by_class_name('even')
proteins_2 = driver.find_elements_by_class_name('odd')
proteins.extend(proteins_2)

In [87]:
names = []
descriptions = []
taxonomies = []
aligned_proteins = []
rmsds = []
aligned_residues = []
seq_ids = []

In [88]:
for p in proteins:
    name, des, tax, al_pro, rms, al_res, seq_id = extract_values_from_row(p)
    
    names.append(name)
    descriptions.append(des)
    taxonomies.append(tax)
    aligned_proteins.append(al_pro)
    rmsds.append(rms)
    aligned_residues.append(al_res)
    seq_ids.append(seq_id)
    

## Create pandas dataframe after generating dictionary of all the lists

In [95]:
protein_dict = {
    'name': names,
    'description': descriptions,
    'taxonomy': taxonomies,
    'aligned_protein': aligned_proteins,
    'rmsd': rmsds,
    'aligned_residue': aligned_residues,
    'sequence_id': seq_ids
}

In [96]:
df = pd.DataFrame(protein_dict)

# Create a function to loop above process for all matching protein codenames

In [5]:
# list of unknown_proteins gets updated with every pass of the algo
def get_all_matching_proteins(unknown_proteins: list):
    count = 0
    
    known_proteins = []
    
    names = []
    descriptions = []
    taxonomies = []
    aligned_proteins = []
    rmsds = []
    aligned_residues = [] 
    seq_ids = []
    seed_proteins = []
        
    while unknown_proteins and count < 692: # while unknown proteins is not empty
        seed_protein = unknown_proteins.pop(0)
        
        if seed_protein not in known_proteins:
            known_proteins.append(seed_protein) # Current protein we're searching now known

            driver.get("https://www.ncbi.nlm.nih.gov/Structure/vastplus/vastplus.cgi?uid=" + seed_protein)
            row_count_elem = driver.find_element_by_id("page_total_rows")
            row_count = row_count_elem.get_attribute('innerHTML')
            search_bar = driver.find_element_by_name("vtable_length")
            search_bar.clear()
            search_bar.send_keys(row_count)
            search_bar.send_keys(Keys.RETURN)

            proteins = driver.find_elements_by_class_name('even')
            proteins_2 = driver.find_elements_by_class_name('odd')
            proteins.extend(proteins_2)

            rejected_protein_count = 0
            for p in proteins:
                name, des, tax, al_pro, rms, al_res, seq_id = extract_values_from_row(p)

                if name not in known_proteins and name not in unknown_proteins: 
                    # 2nd condition is for whether protein is already in our search list (unknown proteins)
                    names.append(name)
                    descriptions.append(des)
                    taxonomies.append(tax)
                    aligned_proteins.append(al_pro)
                    rmsds.append(rms)
                    aligned_residues.append(al_res)
                    seq_ids.append(seq_id)
                    seed_proteins.append(seed_protein)
                    
                    unknown_proteins.append(name) # add protein to list of proteins we need to search
                else:
                    rejected_protein_count += 1
                
                    
            count += 1
            print('Finished searching protein #' + str(count) + ': ' + seed_protein + \
                  '; Matched proteins already seen: ' + str(rejected_protein_count))
        
        if count % 50 == 0:
            print("-------------------")
            print("Writing contents to disk and flushing all lists")
            print("-------------------")
            
            protein_dict = {
                'name': names,
                'seed_protein': seed_proteins,
                'description': descriptions,
                'taxonomy': taxonomies,
                'aligned_protein': aligned_proteins,
                'rmsd': rmsds,
                'aligned_residue': aligned_residues,
                'sequence_id': seq_ids
            }
    
            df = pd.DataFrame(protein_dict)
            df.to_csv("proteins_" + str(int(count / 50)) + ".csv")
            
            names.clear()
            descriptions.clear()
            taxonomies.clear()
            aligned_proteins.clear()
            rmsds.clear()
            aligned_residues.clear()
            seq_ids.clear()
            seed_proteins.clear()
            
            
                  
    print('Proteins searched for: ' + str(count))
                       
    protein_dict = {
        'name': names,
        'seed_protein': seed_proteins,
        'description': descriptions,
        'taxonomy': taxonomies,
        'aligned_protein': aligned_proteins,
        'rmsd': rmsds,
        'aligned_residue': aligned_residues,
        'sequence_id': seq_ids
    }
    
    df = pd.DataFrame(protein_dict)
    df.to_csv("proteins_" + str(int(count / 50)) + ".csv")
    
    return df
        

In [6]:
df = get_all_matching_proteins(['3M5L'])

Finished searching protein #1: 3M5L; Matched proteins already seen: 0
Finished searching protein #2: 3SU3; Matched proteins already seen: 659
Finished searching protein #3: 3RC4; Matched proteins already seen: 715
Finished searching protein #4: 6P6R; Matched proteins already seen: 720
Finished searching protein #5: 3SU2; Matched proteins already seen: 788
Finished searching protein #6: 6DIR; Matched proteins already seen: 559
Finished searching protein #7: 6DIV; Matched proteins already seen: 753
Finished searching protein #8: 6DIS; Matched proteins already seen: 771
Finished searching protein #9: 6DIW; Matched proteins already seen: 659
Finished searching protein #10: 4WH8; Matched proteins already seen: 756
Finished searching protein #11: 6P6L; Matched proteins already seen: 976
Finished searching protein #12: 6PIV; Matched proteins already seen: 1063
Finished searching protein #13: 3SV9; Matched proteins already seen: 1078
Finished searching protein #14: 6PIY; Matched proteins alrea

Finished searching protein #110: 3URC; Matched proteins already seen: 2154
Finished searching protein #111: 4A8B; Matched proteins already seen: 2412
Finished searching protein #112: 5YOF; Matched proteins already seen: 1881
Finished searching protein #113: 5GPI; Matched proteins already seen: 2145
Finished searching protein #114: 1GBE; Matched proteins already seen: 2006
Finished searching protein #115: 5LPR; Matched proteins already seen: 1970
Finished searching protein #116: 3GY3; Matched proteins already seen: 2916
Finished searching protein #117: 1P03; Matched proteins already seen: 2075
Finished searching protein #118: 3OTP; Matched proteins already seen: 2450
Finished searching protein #119: 6KK6; Matched proteins already seen: 2062
Finished searching protein #120: 1P02; Matched proteins already seen: 1959
Finished searching protein #121: 3CP7; Matched proteins already seen: 2850
Finished searching protein #122: 4A8A; Matched proteins already seen: 3791
Finished searching protei

Finished searching protein #217: 5E0J; Matched proteins already seen: 2554
Finished searching protein #218: 1VCW; Matched proteins already seen: 2865
Finished searching protein #219: 3GCN; Matched proteins already seen: 2715
Finished searching protein #220: 1SGE; Matched proteins already seen: 3141
Finished searching protein #221: 5DPA; Matched proteins already seen: 2860
Finished searching protein #222: 2ZTX; Matched proteins already seen: 2822
Finished searching protein #223: 3ZYE; Matched proteins already seen: 2833
Finished searching protein #224: 4X2V; Matched proteins already seen: 2415
Finished searching protein #225: 1TPO; Matched proteins already seen: 2952
Finished searching protein #226: 1C2D; Matched proteins already seen: 2924
Finished searching protein #227: 1K1J; Matched proteins already seen: 2931
Finished searching protein #228: 1F5L; Matched proteins already seen: 2756
Finished searching protein #229: 1GJB; Matched proteins already seen: 2846
Finished searching protei

Finished searching protein #324: 2M5T; Matched proteins already seen: 284
Finished searching protein #325: 3LGU; Matched proteins already seen: 2671
Finished searching protein #326: 2ZLE; Matched proteins already seen: 2049
Finished searching protein #327: 5WI6; Matched proteins already seen: 2779
Finished searching protein #328: 2WUB; Matched proteins already seen: 8811
Finished searching protein #329: 2XW9; Matched proteins already seen: 10
Finished searching protein #330: 4IC5; Matched proteins already seen: 10
Finished searching protein #331: 5TCA; Matched proteins already seen: 10
Finished searching protein #332: 1TQ0; Matched proteins already seen: 10
Finished searching protein #333: 5NAT; Matched proteins already seen: 10
Finished searching protein #334: 2QGR; Matched proteins already seen: 10
Finished searching protein #335: 1BTH; Matched proteins already seen: 10
Finished searching protein #336: 2VID; Matched proteins already seen: 10
Finished searching protein #337: 4INH; Mat

Finished searching protein #434: 4M9F; Matched proteins already seen: 10
Finished searching protein #435: 6EAW; Matched proteins already seen: 10
Finished searching protein #436: 1O2Q; Matched proteins already seen: 10
Finished searching protein #437: 6LPR; Matched proteins already seen: 10
Finished searching protein #438: 1P09; Matched proteins already seen: 10
Finished searching protein #439: 3U1J; Matched proteins already seen: 10
Finished searching protein #440: 1GBH; Matched proteins already seen: 10
Finished searching protein #441: 2H5D; Matched proteins already seen: 10
Finished searching protein #442: 1GBM; Matched proteins already seen: 10
Finished searching protein #443: 3QGJ; Matched proteins already seen: 10
Finished searching protein #444: 1GBI; Matched proteins already seen: 10
Finished searching protein #445: 1GBF; Matched proteins already seen: 10
Finished searching protein #446: 6KK3; Matched proteins already seen: 10
Finished searching protein #447: 4AOQ; Matched prot

Finished searching protein #544: 1YYY; Matched proteins already seen: 10
Finished searching protein #545: 4ABG; Matched proteins already seen: 10
Finished searching protein #546: 4ABI; Matched proteins already seen: 10
Finished searching protein #547: 1LTO; Matched proteins already seen: 10
Finished searching protein #548: 3GDV; Matched proteins already seen: 2846
Finished searching protein #549: 3ZVA; Matched proteins already seen: 10
Finished searching protein #550: 2SGQ; Matched proteins already seen: 10
-------------------
Writing contents to disk and flushing all lists
-------------------
Finished searching protein #551: 4AFQ; Matched proteins already seen: 10
Finished searching protein #552: 6QFE; Matched proteins already seen: 10
Finished searching protein #553: 5TQE; Matched proteins already seen: 10
Finished searching protein #554: 1V2V; Matched proteins already seen: 10
Finished searching protein #555: 2TBS; Matched proteins already seen: 10
Finished searching protein #556: 1

Finished searching protein #653: 2FYQ; Matched proteins already seen: 10
Finished searching protein #654: 2TLD; Matched proteins already seen: 10
Finished searching protein #655: 3LGY; Matched proteins already seen: 10
Finished searching protein #656: 5JY0; Matched proteins already seen: 10
Finished searching protein #657: 2GDD; Matched proteins already seen: 10
Finished searching protein #658: 4JZD; Matched proteins already seen: 10
Finished searching protein #659: 1P3C; Matched proteins already seen: 10
Finished searching protein #660: 3M35; Matched proteins already seen: 2893
Finished searching protein #661: 4RQY; Matched proteins already seen: 10
Finished searching protein #662: 3LGT; Matched proteins already seen: 10
Finished searching protein #663: 3TJO; Matched proteins already seen: 10
Finished searching protein #664: 6M3C; Matched proteins already seen: 10
Finished searching protein #665: 2XWB; Matched proteins already seen: 7
Finished searching protein #666: 2FYR; Matched pro

In [27]:
df.head() # data collection started at 5:04pm (apprx)

Unnamed: 0,name,seed_protein,description,taxonomy,aligned_protein,rmsd,aligned_residue,sequence_id
0,3SU3,3M5L,Crystal structure of NS3/4A protease in comple...,Hepatitis C virus subtype 1a,1,0.19Å,198,100%
1,3RC4,3M5L,Molecular mechanisms of viral and host-cell su...,Others,1,0.21Å,198,100%
2,6P6R,3M5L,HCV NS3/4A protease domain of genotype 1a3a ch...,Hepatitis C virus (isolate 1),1,0.22Å,198,98%
3,3SU2,3M5L,Crystal structure of NS3/4A protease variant A...,Hepatitis C virus subtype 1a,1,0.22Å,198,99%
4,6DIR,3M5L,Crystal structure of HCV NS3/4A protease in co...,Hepacivirus C,1,0.22Å,198,99%


In [28]:
df.to_csv("four_protein_searched.csv")

In [8]:
df_1 = pd.read_csv("proteins_1.csv")
df_2 = pd.read_csv("proteins_2.csv")
df_3 = pd.read_csv("proteins_3.csv")
df_4 = pd.read_csv("proteins_4.csv")
df_5 = pd.read_csv("proteins_5.csv")
df_6 = pd.read_csv("proteins_6.csv")
df_7 = pd.read_csv("proteins_7.csv")
df_8 = pd.read_csv("proteins_8.csv")
df_9 = pd.read_csv("proteins_9.csv")
df_10 = pd.read_csv("proteins_10.csv")
df_11 = pd.read_csv("proteins_11.csv")
df_12 = pd.read_csv("proteins_12.csv")
df_13 = pd.read_csv("proteins_13.csv")

In [19]:
all_proteins_df = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_12, df_13], ignore_index=True)

In [20]:
all_proteins_df

Unnamed: 0.1,Unnamed: 0,name,seed_protein,description,taxonomy,aligned_protein,rmsd,aligned_residue,sequence_id
0,0,3SU3,3M5L,Crystal structure of NS3/4A protease in comple...,Hepatitis C virus subtype 1a,1,0.19Å,198,100%
1,1,3RC4,3M5L,Molecular mechanisms of viral and host-cell su...,Others,1,0.21Å,198,100%
2,2,6P6R,3M5L,HCV NS3/4A protease domain of genotype 1a3a ch...,Hepatitis C virus (isolate 1),1,0.22Å,198,98%
3,3,3SU2,3M5L,Crystal structure of NS3/4A protease variant A...,Hepatitis C virus subtype 1a,1,0.22Å,198,99%
4,4,6DIR,3M5L,Crystal structure of HCV NS3/4A protease in co...,Hepacivirus C,1,0.22Å,198,99%
...,...,...,...,...,...,...,...,...,...
16159,17,5MV8,3GDV,Structure Of Human Myosin 7b C-terminal Myth4-...,Homo sapiens,1,1.30Å,50,30%
16160,18,5XBF,3GDV,Crystal Structure of Myo7b C-terminal MyTH4-FE...,Homo sapiens,1,1.30Å,43,28%
16161,0,5FO7,2XWB,Crystal Structure Of Human Complement C3b At 2...,Homo sapiens,2,2.10Å,1521,100%
16162,1,5O35,2XWB,Structure of complement proteins complex,Homo sapiens,2,2.09Å,1530,99%


In [21]:
all_proteins_df = all_proteins_df.drop(columns=['Unnamed: 0'])

In [22]:
all_proteins_df

Unnamed: 0,name,seed_protein,description,taxonomy,aligned_protein,rmsd,aligned_residue,sequence_id
0,3SU3,3M5L,Crystal structure of NS3/4A protease in comple...,Hepatitis C virus subtype 1a,1,0.19Å,198,100%
1,3RC4,3M5L,Molecular mechanisms of viral and host-cell su...,Others,1,0.21Å,198,100%
2,6P6R,3M5L,HCV NS3/4A protease domain of genotype 1a3a ch...,Hepatitis C virus (isolate 1),1,0.22Å,198,98%
3,3SU2,3M5L,Crystal structure of NS3/4A protease variant A...,Hepatitis C virus subtype 1a,1,0.22Å,198,99%
4,6DIR,3M5L,Crystal structure of HCV NS3/4A protease in co...,Hepacivirus C,1,0.22Å,198,99%
...,...,...,...,...,...,...,...,...
16159,5MV8,3GDV,Structure Of Human Myosin 7b C-terminal Myth4-...,Homo sapiens,1,1.30Å,50,30%
16160,5XBF,3GDV,Crystal Structure of Myo7b C-terminal MyTH4-FE...,Homo sapiens,1,1.30Å,43,28%
16161,5FO7,2XWB,Crystal Structure Of Human Complement C3b At 2...,Homo sapiens,2,2.10Å,1521,100%
16162,5O35,2XWB,Structure of complement proteins complex,Homo sapiens,2,2.09Å,1530,99%


In [24]:
all_proteins_df.to_csv("3m5l_proteins.csv", index=False)

In [31]:
all_proteins_df.iloc[93]

name                                                            3E90
seed_protein                                                    3M5L
description        West Nile Vi Rus Ns2b-Ns3protease In Complexed...
taxonomy                                             West Nile virus
aligned_protein                                                    1
rmsd                                                           2.82Å
aligned_residue                                                  129
sequence_id                                                      16%
Name: 93, dtype: object

Rule 1: > 90% sequence identity - toss it out
Rule 2: # aligned residues < 50 - toss it out
Rule 3: RMSD < 2 ang AND # aligned residues > n (50? 60? 70?) - don't iterate

Rule *: # aligned residues > 50% of original protein length