In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt
from Bio import SeqIO
import requests
from io import StringIO
import PDBdata_Scraper as Scraper

In [2]:
# Load the PDB-ID -> taxonomy-ID table. index_col=False stops pandas from using
# the first CSV column as the index, so that column arrives as a regular column
# named 'Unnamed: 0' (it is dropped in the next cell).
PDB_taxid_data = pd.read_csv('PDB_TaxID.csv',index_col = False)


In [3]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell re-runnable: it reassigns its own input, so a
# second execution would otherwise raise KeyError because the column is gone.
PDB_taxid_data = PDB_taxid_data.drop(columns='Unnamed: 0', errors='ignore')
PDB_taxid_data.head()

Unnamed: 0,PDB_ID,TAX_ID
0,5wkx,['9031']
1,6r69,['28901']
2,6s0l,['9606']
3,6w2c,"['10090', '9606']"
4,7o01,['3055']


In [4]:
# Rows whose TAX_ID holds the 'Not found taxid' sentinel.
PDB_taxid_data.loc[PDB_taxid_data['TAX_ID'].eq('Not found taxid')]

Unnamed: 0,PDB_ID,TAX_ID
4223,2jtw,Not found taxid
4778,2lx0,Not found taxid
6848,2ki9,Not found taxid


In [5]:
# Rows whose TAX_ID is the stringified list "[None]".
PDB_taxid_data.loc[PDB_taxid_data['TAX_ID'].eq("[None]")]

Unnamed: 0,PDB_ID,TAX_ID
102,2zfe,[None]
464,2k3c,[None]
859,2k58,[None]
1854,1rkl,[None]
2594,1z65,[None]
2908,1jdm,[None]
3061,1ln6,[None]
3668,1r2n,[None]
3950,1l0m,[None]
4715,1bnx,[None]


In [6]:
# Number of entries flagged with the 'PDB data unavailable' sentinel
# (counted straight from the boolean mask; int() keeps a plain Python integer).
int((PDB_taxid_data['TAX_ID'] == 'PDB data unavailable').sum())

117

In [7]:
# Manually looked-up taxonomy IDs for the three 'Not found taxid' entries
# (2jtw, 2lx0, 2ki9). Maps PDB ID -> list of taxid strings.
# '2lx0' is left out on purpose: its source organism is undefined.
dict_extra = {'2jtw':['559292'],'2ki9':['9606'] }


In [8]:
# Work on an independent (deep) copy so the manual fixes below leave the
# originally loaded table untouched.
PDB_taxid_data_cp = PDB_taxid_data.copy(deep=True)


In [9]:
# Apply the manually curated taxids from `dict_extra`.
# Rows are selected by PDB_ID label rather than by hard-coded row/column
# positions, so the patch still hits the right entries if the CSV is
# regenerated in a different order or gains a column.
for pdb_id, taxids in dict_extra.items():
    row_mask = PDB_taxid_data_cp['PDB_ID'] == pdb_id
    assert row_mask.any(), f"{pdb_id} not present in PDB_taxid_data_cp"
    # TAX_ID stores the list in its string form, e.g. "['9606']".
    PDB_taxid_data_cp.loc[row_mask, 'TAX_ID'] = str(taxids)


In [10]:
# Spot-check that the manual fix for entry '2ki9' landed.
PDB_taxid_data_cp.loc[PDB_taxid_data_cp['PDB_ID'].eq('2ki9')]

Unnamed: 0,PDB_ID,TAX_ID
6848,2ki9,['9606']


In [11]:
# Entries still carrying the 'Not found taxid' sentinel after the manual fixes.
PDB_taxid_data_cp.loc[PDB_taxid_data_cp['TAX_ID'].eq('Not found taxid')]

Unnamed: 0,PDB_ID,TAX_ID
4778,2lx0,Not found taxid


In [12]:
# Persist the patched table. The row index is written as the first, unnamed
# CSV column (pandas' default, spelled out here so it is visible to readers).
PDB_taxid_data_cp.to_csv('PDB_TaxID_final.csv', index=True)

In [13]:
# Entries whose TAX_ID is still the stringified list "[None]".
PDB_taxid_data_cp.loc[PDB_taxid_data_cp['TAX_ID'].eq("[None]")]

Unnamed: 0,PDB_ID,TAX_ID
102,2zfe,[None]
464,2k3c,[None]
859,2k58,[None]
1854,1rkl,[None]
2594,1z65,[None]
2908,1jdm,[None]
3061,1ln6,[None]
3668,1r2n,[None]
3950,1l0m,[None]
4715,1bnx,[None]


In [None]:
def col_str_converter(col_str):
    """Turn a stringified taxid list from the TAX_ID column into a list of tokens.

    The column stores Python lists in their string form (for example
    "['10090', '9606']" or "[None]") alongside plain sentinel strings such as
    'Not found taxid'. Brackets and single quotes are removed and the rest is
    split on commas.

    Parameters
    ----------
    col_str : str
        Raw cell value from the TAX_ID column.

    Returns
    -------
    list of str
        One entry per comma-separated item, with surrounding whitespace
        removed. Empty items are dropped, so "[]" yields an empty list.
    """
    # Remove the list punctuation left over from stringifying the list.
    cleaned = col_str.replace('[', '').replace(']', '').replace("'", '')
    # Items are separated by ", " in the stored string; strip each one so that
    # later comparisons such as `'None' in tokens` do not miss ' None'.
    tokens = (piece.strip() for piece in cleaned.split(','))
    return [piece for piece in tokens if piece]

In [None]:
# Quick check of the converter on the "[None]" form seen in the TAX_ID column.
col_str_converter("[None]")

In [None]:
# Run the Scraper module's converter on a sentinel string; `t` is reused by
# the membership check in the following cell.
t = Scraper.col_str_converter('Not found taxid')
t


In [None]:
# True when the token list `t` contains one of the "no taxid" sentinels.
('Not found taxid' in t) or ('None' in t)

In [None]:
def get_GC_info(taxid_col_str,df1,df2,df3):
    """Look up the GC content for every taxid listed in one TAX_ID cell.

    Parameters
    ----------
    taxid_col_str : str
        Raw TAX_ID cell value, e.g. "['10090', '9606']", "[None]",
        'Not found taxid' or 'PDB data unavailable'.
    df1, df2, df3 : pandas.DataFrame
        Lookup tables with a 'TaxID' column and a 'GC%' column. They are
        searched in this order and the first table containing the taxid wins.
        (Elsewhere in this notebook they are passed as Scraper.df_euk,
        Scraper.df_pro, Scraper.df_virus.)

    Returns
    -------
    str or list
        'PDB data unavailable' if the cell carries that sentinel;
        'taxid not availble' (spelling kept, callers may compare against it)
        if the cell contains 'Not found taxid' or 'None';
        otherwise a list with one entry per taxid: the GC% as a float, or the
        string 'Not found' when none of the three tables has the taxid.

    Raises
    ------
    ValueError
        If a token is neither a sentinel nor a number.
    """
    # Normalise tokens here as well: items are stored as "a, b", so a raw
    # split leaves leading spaces that would hide a ' None' sentinel and make
    # int(float(' None')) blow up further down.
    taxid_list = [token.strip() for token in col_str_converter(taxid_col_str)]
    taxid_list = [token for token in taxid_list if token]

    if 'PDB data unavailable' in taxid_list:
        return 'PDB data unavailable'
    if ('Not found taxid' in taxid_list) or ('None' in taxid_list):
        return 'taxid not availble'

    GC_taxlist = []
    for TaxID in taxid_list:
        # Via float() so that values like '9606.0' are accepted too.
        int_taxid = int(float(TaxID))
        GC = 'Not found'
        # First table that knows the taxid supplies the value.
        for lookup in (df1, df2, df3):
            matches = lookup.loc[lookup['TaxID'] == int_taxid, 'GC%'].values
            if len(matches) > 0:
                GC = float(matches[0])
                break
        GC_taxlist.append(GC)
    return GC_taxlist
            

In [14]:
# Sanity check of the Scraper module's get_GC_info on a "[None]" entry; the
# output below shows it reporting the 'taxid not availble' sentinel.
Scraper.get_GC_info("[None]",Scraper.df_euk,Scraper.df_pro,Scraper.df_virus)

['None']


'taxid not availble'

In [15]:
# Take the first 200 rows as a small sample for a trial run.
test = PDB_taxid_data_cp.head(200)

In [16]:
# Trial run of Scraper.Merge_pdb_GC on the 200-row sample with the three
# lookup tables. NOTE(review): judging by the displayed output it returns a
# dict of PDB ID -> list of GC values and prints progress per row; confirm
# against the Scraper module.
Scraper.Merge_pdb_GC(test,Scraper.df_euk,Scraper.df_pro,Scraper.df_virus)

['9031']
1
['28901']
2
['9606']
3
['10090', ' 9606']
4
['3055']
5
['9940']
6
['2242']
7
['9606']
8
['9606', ' 9844']
9
['84112', ' 32630']
10
['32630', ' 10116', ' 9606', ' 9913']
11
['420246', ' 1699078']
12
['390333']
13
['9103', ' 9913', ' 9844']
14
['573']
15
['10090', ' 10116']
16
['643867', ' 9844']
17
['9606']
18
['9606']
19
['83333']
20
['9606']
21
['1916']
22
['33072']
23
['9823']
24
['32630', ' 9606', ' 9844']
25
['198628']
26
['1063']
27
['36809']
28
['9913']
29
['9606']
30
['727']
31
['9913']
32
['562']
33
['416870']
34
['9940']
35
['83333']
36
['6239']
37
['9606']
38
['573']
39
['9646']
40
['562']
41
['9606']
42
['83333']
43
['1247190']
44
['6253']
45
['169963']
46
['1063']
47
['269796']
48
['1075']
49
['83333']
50
['PDB data unavailable']
51
2he6 FAILED PDB
['32042']
52
['9606']
53
['10090']
54
['562']
55
['1049564']
56
['37502']
57
['83333']
58
['1868482']
59
['3702']
60
['470']
61
['281310']
62
['9157']
63
['9606']
64
['1079']
65
['32630', ' 9606']
66
['272558']
67
['27

{'5wkx': [42.2233],
 '6r69': [52.217],
 '6s0l': [41.4876],
 '6w2c': [41.8825, 41.4876],
 '7o01': [63.5313],
 '6zkc': [41.9869],
 '5a44': [66.2551],
 '7d3f': [41.4876],
 '7dge': [41.4876, 'Not found'],
 '7qia': [64.0949, 'Not found'],
 '7fin': ['Not found', 42.0588, 41.4876, 41.9479],
 '5doq': [48.8513, 48.9],
 '7nnu': [49.7],
 '8dcs': [41.6144, 41.9479, 'Not found'],
 '7pzf': [56.9958],
 '6ud4': [41.8825, 42.0588],
 '6hd9': [35.5046, 'Not found'],
 '6mhs': [41.4876],
 '7lkz': [41.4876],
 '4lsf': [50.7569],
 '7dsx': [41.4876],
 '3gb7': [72.2],
 '3tlw': ['Not found'],
 '7v31': [41.971],
 '7wuj': ['Not found', 41.4876, 'Not found'],
 '5hej': [56.3],
 '2qjp': [69.1338],
 '6wbx': [64.1138],
 '2ped': [41.9479],
 '7ekp': [41.4876],
 '3m78': [38.2],
 '2i35': [41.9479],
 '1gfn': [50.6073],
 '4dve': [35.7],
 '7jjp': [41.9869],
 '2wvp': [50.7569],
 '7usy': [35.4317],
 '3oe0': [41.4876],
 '1osm': [56.9958],
 '5u1v': [41.8526],
 '4ea3': [50.6073],
 '6cm4': [41.4876],
 '4jbw': [50.7569],
 '7kaq': ['