In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
import os
from PIL import Image
import requests
from io import BytesIO
from joblib import Parallel, delayed

In [2]:
# Location labels listed in class-id order: a label's position in this tuple
# is its class id, which the mapping below stores as a string.
_PROT_CLASS_NAMES = (
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
)

# Maps location label -> class id (string '0' .. '27').
prot_class_dict = {label: str(idx) for idx, label in enumerate(_PROT_CLASS_NAMES)}

In [3]:
# Directory that receives the downloaded images.
newpath = 'external_data'
# exist_ok=True removes the check-then-create race of the exists()/makedirs()
# pair and keeps the cell safe to re-run.
os.makedirs(newpath, exist_ok=True)
    

def get_html(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the caller indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        ``response.text``. The HTTP status code is not checked, so the body
        of an error page is returned as-is.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

In [4]:
# Subcellular location table, downloaded from
# https://www.proteinatlas.org/download/subcellular_location.tsv.zip
# `DataFrame.from_csv` was removed in pandas 1.0; `read_csv` reads the same
# tab-separated file and keeps the default integer index.
df = pd.read_csv('subcellular_location.tsv', sep='\t')

# Drop rows whose reliability is 'Uncertain' and renumber the rest from 0.
# The non-inplace reset_index returns a fresh frame instead of modifying a
# filtered selection in place.
df = df[df.Reliability != 'Uncertain'].reset_index(drop=True)

# One antibody page per remaining row, addressed as '<Gene>-<Gene name>'.
urls = [
    'https://www.proteinatlas.org/' + gene + '-' + gene_name + '/antibody#ICC'
    for gene, gene_name in zip(df['Gene'], df['Gene name'])
]

  


In [5]:
# Show the first ten rows of the filtered table.
df.head(10)

Unnamed: 0,Gene,Gene name,Reliability,Enhanced,Supported,Approved,Uncertain,Single-cell variation intensity,Single-cell variation spatial,Cell cycle dependency,GO id
0,ENSG00000000003,TSPAN6,Approved,,,Cytosol,,,,,Cytosol (GO:0005829)
1,ENSG00000000460,C1orf112,Approved,,,Mitochondria,,,,,Mitochondria (GO:0005739)
2,ENSG00000000938,FGR,Approved,,,Aggresome;Plasma membrane,,,,,Aggresome (GO:0016235);Plasma membrane (GO:000...
3,ENSG00000000971,CFH,Approved,,,Vesicles,,,,,Vesicles (GO:0043231)
4,ENSG00000001084,GCLC,Approved,,,Cytosol;Nucleoli;Nucleus,,,,,Cytosol (GO:0005829);Nucleoli (GO:0005730);Nuc...
5,ENSG00000001167,NFYA,Enhanced,Nucleoplasm,,,,,,,Nucleoplasm (GO:0005654)
6,ENSG00000001460,STPG1,Approved,,,Nucleoplasm,,,,,Nucleoplasm (GO:0005654)
7,ENSG00000001461,NIPAL3,Approved,,,Nucleoplasm,,,,,Nucleoplasm (GO:0005654)
8,ENSG00000001497,LAS1L,Approved,,Nucleoplasm,Cytosol;Microtubule organizing center,,,,,Cytosol (GO:0005829);Microtubule organizing ce...
9,ENSG00000001629,ANKIB1,Supported,,Cytosol;Plasma membrane,,,,,,Cytosol (GO:0005829);Plasma membrane (GO:0005886)


In [6]:
# Antibody pages that carry the images: first ten URLs and the total count.
(urls[:10], len(urls))

(['https://www.proteinatlas.org/ENSG00000000003-TSPAN6/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000000460-C1orf112/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000000938-FGR/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000000971-CFH/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000001084-GCLC/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000001167-NFYA/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000001460-STPG1/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000001461-NIPAL3/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000001497-LAS1L/antibody#ICC',
  'https://www.proteinatlas.org/ENSG00000001629-ANKIB1/antibody#ICC'],
 11377)

In [7]:
# Work on an independent copy: plain assignment would only alias `df`, so the
# column added to `external_df` later would silently appear on `df` as well.
external_df = df.copy()

In [8]:
# Columns whose location annotations are merged into one label string per row.
cols = [
    'Enhanced',
    'Supported',
    'Approved',
]

In [9]:
def _join_tiers(row):
    """Join one row's annotation cells with ';', writing 'nan' for missing cells."""
    filled = row.fillna('nan')
    return ';'.join(filled.values.tolist())


# Collapse the columns listed in `cols` into a single ';'-separated string per row.
external_df['classes'] = external_df[cols].apply(_join_tiers, axis=1)

In [10]:
# Collects one {'Id', 'Target'} record per kept image; load_img appends to it.
external = list()

In [44]:
# Spot-check a single page URL by its position in the list.
urls[1110]

'https://www.proteinatlas.org/ENSG00000083814-ZNF671/antibody#ICC'

In [39]:
# Download the first antibody page so the parsing can be tried on one example.
html = get_html(urls[0])

In [40]:
# Parse the downloaded page with the lxml parser.
soup = BeautifulSoup(html, 'lxml')

In [41]:
# Inspect the parsed page.
# NOTE(review): this renders the whole document; consider showing only
# `soup.title` or a short slice before sharing the notebook.
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>TSPAN6 - Antibodies - The Human Protein Atlas</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="Antibody information for antibodies HPA004109 used in analysis of ENSG00000000003 / TSPAN6 (T245, TM4SF6, TSPAN-6)" name="description"/>
<link href="/images_static/favicon_anim.gif" rel="icon" type="image/png"/>
<script src="/utils/es6-shim.min.js?v=v18.1" type="text/javascript"></script><script src="/utils/core/lib/jquery/jquery.min.js?v=v18.1" type="text/javascript"></script><script src="/utils/core/lib/jquery_ui/jquery-ui.min.js?v=v18.1" type="text/javascript"></script><link href="/utils/core/lib/jquery_ui/jquery-ui.min.css?v=v18.1" rel="stylesheet" type="text/css"/><script src="/utils/core/lib/cookie/js.cookie.min.js?v=v18.1" type="text/javascript"></script><script src="/utils/core/lib/d3/d3.min.js?v=v18.1" type="text/javasc

In [22]:
# Gather absolute URLs of the page's colorbox anchors whose href contains '_selected'.
links = []
for a in soup.findAll('a', {'class': 'colorbox'}, href=True):
    href = a['href']
    if '_selected' not in href:
        continue
    # Strip every '_medium' marker from the URL
    # (presumably the medium-size variant suffix; verify against the site).
    links.append(('https://www.proteinatlas.org' + href).replace('_medium', ''))

In [42]:
# `a` still holds the last colorbox anchor visited by the link-collection loop.
a

<a class="colorbox" href="/images_protein_array/4109.png" text="Antibody specificity analysis with protein arrays. Predicted and matching interactions are shown in green."><img src="/images_protein_array/4109_medium.png" style="border-radius:5px;background-clip: padding-box; vertical-align:top;"/></a>

In [23]:
# Show the image URLs collected for this page.
links

['https://www.proteinatlas.org/images/4109/if_selected.jpg',
 'https://www.proteinatlas.org/images/4109/ihc_selected.jpg']

In [32]:
# Open one previously downloaded image and report its array dimensions.
img = Image.open('external_data/ENSG00000075415_0.png')
pixels = np.array(img)
pixels.shape

(800, 800, 3)

In [51]:
def _selected_image_links(soup):
    """Return the sorted, de-duplicated '_selected' image URLs found in ``soup``."""
    found = set()
    for a in soup.findAll('a', {'class': 'colorbox'}, href=True):
        if '_selected' in a['href']:
            found.add(''.join(('https://www.proteinatlas.org' + a['href']).split('_medium')))
    # Sorting fixes the order in which links receive their image index;
    # iterating a bare set of strings can differ between interpreter runs.
    return sorted(found)


def _targets_for_gene(name):
    """Return the class ids (strings) of every known label annotated for gene ``name``."""
    targets = []
    rows = external_df.loc[external_df['Gene'] == name, 'classes']
    for labels in rows.str.split(';'):
        # Labels missing from prot_class_dict (including the 'nan' placeholder) are ignored.
        targets.extend(prot_class_dict[label] for label in labels if label in prot_class_dict)
    return targets


def load_img(url, timeout=30, max_first_channel_mean=70):
    """Download the selected images of one antibody page and record their labels.

    The gene id is the part of the second-to-last URL path segment before the
    first '-'. Every kept image is saved as
    ``external_data/<gene id>_<index>.png`` and a record
    ``{'Id': '<gene id>_<index>', 'Target': '<space-separated class ids>'}``
    is appended to the module-level list ``external``.

    Parameters
    ----------
    url : str
        Antibody page URL, e.g. ``https://www.proteinatlas.org/<Gene>-<Gene name>/antibody#ICC``.
    timeout : float, optional
        Timeout in seconds for each image request. Defaults to 30.
    max_first_channel_mean : float, optional
        An image is kept only if the mean of its first channel is below this
        value. Defaults to 70. Presumably this keeps only dark-background
        images - TODO confirm the intent of the threshold.

    Returns
    -------
    None

    Notes
    -----
    Failures are logged as warnings and skipped, so one bad page or image
    does not stop a bulk run.
    """
    import logging
    log = logging.getLogger(__name__)

    name = url.split('/')[-2].split('-')[0]

    # The labels depend only on the gene, so resolve them once up front.
    # A gene without any known label yields no output, so skip its downloads.
    proteins = _targets_for_gene(name)
    if not proteins:
        return

    try:
        html = get_html(url)
    except requests.RequestException as exc:
        log.warning('load_img: could not fetch page %s: %r', url, exc)
        return
    soup = BeautifulSoup(html, 'lxml')

    i = 0
    for link in _selected_image_links(soup):
        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            if np.array(img)[:, :, 0].mean() < max_first_channel_mean:
                img_id = name + '_' + str(i)
                img.save('external_data/' + img_id + '.png')
                external.append({'Id': img_id, 'Target': ' '.join(proteins)})
                i += 1
        except Exception as exc:
            # Best effort per image: report the failure instead of hiding it,
            # then continue with the next link.
            log.warning('load_img: skipping %s: %r', link, exc)

In [52]:
# Run load_img over every page URL on a pool of worker threads.
num_cores = 8
page_jobs = (delayed(load_img)(page_url) for page_url in urls)
Parallel(n_jobs=num_cores, prefer="threads")(page_jobs)

(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)
(800, 800, 3)


KeyboardInterrupt: 

In [None]:
# Inspect the records collected by load_img.
external

In [None]:
# Name the columns explicitly so the frame keeps its 'Id' / 'Target' layout
# even when no record was collected (an empty list would otherwise give a
# frame without columns).
external_result = pd.DataFrame(external, columns=['Id', 'Target'])
external_result.head()

In [None]:
# Write the collected labels as CSV to the file 'external_train'
# (the row index is written too, and the file name has no extension).
external_result.to_csv(path_or_buf='external_train')