# Image Collection and Dataframe Formation 

## Part 1 - Image Collection 

### Using the PUG REST API and BeautifulSoup to collect images 

In [1]:
## Libraries 

import pandas as pd 
import requests
from bs4 import BeautifulSoup
import ast
import time
# !pip install regex
import regex as re
import os 
import numpy as np



## Define The URL Parameters 

- Images are imported based on the Tanimoto similarity search parameters defined in the PUG REST API documentation
- Inputs:
    - CID 
    - Tanimoto Percent
    
### References: 

- https://chem.libretexts.org/Courses/St._Louis_College_of_Pharmacy/CHEM3351%3A_Cheminformatics/5%3A_How_to_Search_PubChem_for_Chemical_Information_(Part_1)

In [163]:
## This function makes a request to the PUG REST API and returns a list_key
## which must be used to retrieve the results

def request_format(number, threshold):
    '''
    Submit a Tanimoto similarity search to PubChem and return its ListKey.

    Parameters
    ----------
    number : int or str
        PubChem CID of the query compound (CIDs only).
    threshold : int
        Tanimoto similarity threshold, sent as the ``Threshold`` URL
        parameter (this notebook passes 95).

    Returns
    -------
    int
        The ListKey found under ``Waiting`` in the response. It has to be
        passed to the list-key endpoint to retrieve the search results.

    Raises
    ------
    requests.HTTPError
        If PubChem answers with a 4xx/5xx status.
    RuntimeError
        If the response body does not contain ``Waiting -> ListKey``.
    '''

    # Base URL for the Tanimoto similarity search
    input_url = (
        'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/'
        f'cid/{number}/JSON?Threshold={threshold}'
    )

    # A timeout keeps the notebook from hanging forever on a stalled connection
    api_request = requests.get(input_url, timeout=30)
    print(api_request.status_code)
    api_request.raise_for_status()

    # The body is JSON, not a Python literal: ast.literal_eval breaks on
    # true/false/null, so use the response's own JSON decoder instead.
    status = api_request.json()

    try:
        list_key = int(status['Waiting']['ListKey'])
    except (KeyError, TypeError) as err:
        raise RuntimeError(
            f'PubChem did not return a ListKey for CID {number}: {status!r}'
        ) from err

    return list_key

In [176]:
# Start the similarity search for CID 141992 with a Tanimoto threshold of 95
list_key = request_format(141992, 95)

202


In [177]:
# Show the ListKey; it is the handle used to fetch the search results
print (list_key)

2493835728108288312


In [178]:
## Not functional yet; to be implemented in a future version

# def list_test(fn_list, fn_present):
    
#     #Checking that my variables lists are the same size 
#     if len(fn_list) != len(fn_present):
#         return  ("The Lists are the Not the Same Length")
    
#     else:
#         #Create the dataframe 
#         test = pd.DataFrame()
        
#         for index, entry in enumerate(fn_list):
#             test[fn_list[index]] = fn_present[index]    
        
#     return test  

In [179]:
## This function uses the list_key collected in the PUG REST API request, and returns the list
## of PubChem CIDs based on the similarity search results

def output_format(list_key, poll_interval=5, max_wait=300):
    '''
    Fetch the CIDs found by a similarity search and return them as a DataFrame.

    While the response still contains a ``Waiting`` entry the request is
    repeated, so the caller does not have to guess how long to sleep first.

    Parameters
    ----------
    list_key : int or str
        ListKey returned when the similarity search was submitted.
    poll_interval : float, optional
        Seconds to sleep between attempts while the result is not ready.
    max_wait : float, optional
        Give up after roughly this many seconds of waiting.

    Returns
    -------
    pandas.DataFrame
        A single column ``cid`` holding the CIDs from
        ``IdentifierList -> CID`` in the response.

    Raises
    ------
    requests.HTTPError
        If PubChem answers with a 4xx/5xx status.
    TimeoutError
        If the response is still ``Waiting`` once ``max_wait`` has elapsed.
    RuntimeError
        If the response contains neither ``Waiting`` nor a CID list.
    '''

    result_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{list_key}/cids/JSON'
    deadline = time.monotonic() + max_wait

    while True:
        api_request = requests.get(result_url, timeout=30)
        api_request.raise_for_status()

        # The body is JSON, not a Python literal: ast.literal_eval breaks on
        # true/false/null, so use the response's own JSON decoder instead.
        output_dictionary = api_request.json()

        if 'Waiting' not in output_dictionary:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f'PubChem results for ListKey {list_key} were not ready after {max_wait} s'
            )
        time.sleep(poll_interval)

    try:
        cid = output_dictionary['IdentifierList']['CID']
    except (KeyError, TypeError) as err:
        raise RuntimeError(
            f'PubChem returned no CID list for ListKey {list_key}: {output_dictionary!r}'
        ) from err

    # The 'cid' column is always included
    return pd.DataFrame({'cid': cid})

In [180]:
# Pause for a minute before fetching; presumably this gives PubChem time to
# finish the search that produced list_key — TODO confirm the wait is sufficient
time.sleep(60) 
df = output_format(list_key)

In [181]:
# Preview the CIDs returned by the similarity search
df.head()

Unnamed: 0,cid
0,141992
1,12529273
2,57417610
3,57506627
4,101364802


In [182]:
## Adding in the functional group info
## 'ring' is the heterocycles count

# Every row returned by this search receives the same value (1) for each
# active label. Labels left out for this compound class, kept for reference:
#   KEY, BNZ, AKA, ALC, COC, COO, SHH, COH, AKY, COONH2
for label in ('ring', 'AKE', 'AKH', 'AMN'):
    df[label] = 1

In [183]:
# Confirm that the label columns were added
df.head()

Unnamed: 0,cid,ring,AKE,AKH,AMN
0,141992,1,1,1,1
1,12529273,1,1,1,1
2,57417610,1,1,1,1
3,57506627,1,1,1,1
4,101364802,1,1,1,1


In [184]:
# (rows, columns) of the frame so far
df.shape

(6, 5)

## Downloading the Images from PubChem 

### Using the PubChem PUG REST API

**References** 

https://www.geeksforgeeks.org/downloading-files-web-using-python/

In [185]:
def image_collection(dataframe, new_column_name, dataframe_column_input, image_class, delay=.3):
    '''
    Download one PNG per CID from PubChem and record the saved names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the CIDs; it is modified in place (a column is added).
    new_column_name : str
        Name of the column that receives the saved image names.
    dataframe_column_input : str
        Name of the column in ``dataframe`` that holds the CIDs.
    image_class : str
        Prefix for every saved file: ``{image_class}_{cid}``.
    delay : float, optional
        Seconds to sleep after each download (default 0.3, as before).

    Returns
    -------
    list of str
        The image names (without the ``.png`` extension), in download order.

    Raises
    ------
    requests.HTTPError
        If a download answers with a 4xx/5xx status. Nothing is written for
        that CID, so an error body is never saved as a ``.png`` file.

    Side effects
    ------------
    Writes ``./images_multi_label/{image_class}_{cid}.png`` for every CID and
    one CSV of the updated frame under ``./data_multi_label/``.
    '''
    image_dir = './images_multi_label'
    data_dir = './data_multi_label'

    # Create the output folders up front so a fresh checkout does not fail
    # on the first open() / to_csv() call.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(data_dir, exist_ok=True)

    item_list = []
    for entry in dataframe[dataframe_column_input]:

        # Progress output: the CID being downloaded
        print (entry)

        # Keep track of the files created for this project
        image_name = f'{image_class}_{entry}'

        image_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{entry}/PNG'
        api_call = requests.get(image_url, timeout=30)

        # Stop here on a failed download instead of saving the error body
        # under a .png name.
        api_call.raise_for_status()

        # Save the received content as a PNG file in binary mode
        with open(f'{image_dir}/{image_name}.png', 'wb') as f:
            f.write(api_call.content)

        item_list.append(image_name)

        # Small pause between requests
        time.sleep(delay)

    print ('ALL IMAGES HAVE BEEN DOWNLOADED AND SAVED')

    # Save the filenames to the dataframe
    dataframe[new_column_name] = item_list

    # NOTE(review): the CSV has always been named after the LAST image saved;
    # that name is kept so existing files and readers still match. With no
    # rows there is no last image, so fall back to the class name.
    csv_stem = item_list[-1] if item_list else image_class
    dataframe.to_csv(f'{data_dir}/{csv_stem}.csv')

    # This is the list of images which were downloaded and saved
    return item_list

In [186]:
# Download a PNG for every CID in df['cid'], prefix each file name with
# "4_fluoro_1h_pyrazole", and store the saved names in df['image_saved_as']
image_collection(df, 'image_saved_as', 'cid', "4_fluoro_1h_pyrazole")

141992
12529273
57417610
57506627
101364802
118992218
ALL IMAGES HAVE BEEN DOWNLOADED AND SAVED


['4_fluoro_1h_pyrazole_141992',
 '4_fluoro_1h_pyrazole_12529273',
 '4_fluoro_1h_pyrazole_57417610',
 '4_fluoro_1h_pyrazole_57506627',
 '4_fluoro_1h_pyrazole_101364802',
 '4_fluoro_1h_pyrazole_118992218']

In [187]:
# df['ring'] = 0

# Confirm that image_collection added the 'image_saved_as' column
df.head()

Unnamed: 0,cid,ring,AKE,AKH,AMN,image_saved_as
0,141992,1,1,1,1,4_fluoro_1h_pyrazole_141992
1,12529273,1,1,1,1,4_fluoro_1h_pyrazole_12529273
2,57417610,1,1,1,1,4_fluoro_1h_pyrazole_57417610
3,57506627,1,1,1,1,4_fluoro_1h_pyrazole_57506627
4,101364802,1,1,1,1,4_fluoro_1h_pyrazole_101364802
