In [None]:
import pandas as pd
from urllib.parse import quote
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import threading

In [None]:
# Load the drug-name table; the trailing expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer="../data/intended_data_names.csv")
df.shape

In [None]:
class RateLimiter:
    """Thread-safe limiter that spaces calls at least ``1 / max_per_second`` seconds apart.

    All threads sharing one instance are serialized through ``wait()``: a caller
    blocks (holding the lock) until the minimum interval since the previous
    caller has elapsed, so later callers queue up behind it.

    Args:
        max_per_second: Maximum number of ``wait()`` calls allowed to return
            per second. Must be positive.

    Raises:
        ValueError: If ``max_per_second`` is zero or negative.
    """

    def __init__(self, max_per_second):
        if max_per_second <= 0:
            raise ValueError(
                f"max_per_second must be positive, got {max_per_second!r}"
            )
        self.lock = threading.Lock()
        # Monotonic clock: unlike time.time(), it cannot jump backwards when the
        # system clock is adjusted, which would otherwise produce a huge sleep.
        self.last_request_time = time.monotonic()
        self.min_interval = 1.0 / max_per_second

    def wait(self):
        """Block until at least ``min_interval`` seconds have passed since the last call."""
        with self.lock:
            remaining = self.min_interval - (time.monotonic() - self.last_request_time)
            if remaining > 0:
                time.sleep(remaining)
            # Stamp after sleeping so the next caller measures from the actual release.
            self.last_request_time = time.monotonic()

In [None]:
def create_session():
    """Build a ``requests.Session`` whose HTTPS requests are retried automatically.

    The adapter mounted on the ``https://`` prefix carries a ``Retry`` policy
    with ``total=3``, ``backoff_factor=1`` and a status force-list of
    429, 500, 502, 503 and 504.

    Returns:
        The configured session.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    )
    http = requests.Session()
    http.mount("https://", retrying_adapter)
    return http

In [None]:
def is_valid_compound(drug_name, rate_limiter, session):
    """Check whether the PubChem name lookup returns any compound ID for ``drug_name``.

    Args:
        drug_name: Compound name to look up. Non-string values (e.g. a NaN
            coming from pandas) and blank strings are treated as invalid
            without issuing a request.
        rate_limiter: Object whose ``wait()`` is called before the request.
        session: Object exposing a requests-style ``get(url, timeout=...)``.

    Returns:
        ``drug_name`` unchanged when the response is HTTP 200 with a non-blank
        body; ``None`` otherwise (404, any other status, a blank body, or a
        ``requests.RequestException``).
    """
    # Nothing to look up; bail out before consuming a rate-limit slot.
    if not isinstance(drug_name, str) or not drug_name.strip():
        return None

    try:
        rate_limiter.wait()

        # safe="" also percent-encodes "/", which quote() leaves alone by
        # default and which would otherwise split the name into extra URL
        # path segments.
        encoded_name = quote(drug_name, safe="")
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/cids/TXT'
        response = session.get(url, timeout=10)

        if response.status_code == 200 and response.text.strip():
            return drug_name
        # 404 is the expected "not found" answer; anything else (including a
        # 200 with an empty body) is worth surfacing.
        if response.status_code != 404:
            print(f"Unexpected status code {response.status_code} for {drug_name}")
        return None

    except requests.RequestException as e:
        print(f"Error processing {drug_name}: {e}")
        return None

In [None]:
def get_valid_compounds(drug_set, max_workers=4, max_per_second=5):
    """Validate every name in ``drug_set`` concurrently and return the valid ones.

    Each name is passed to ``is_valid_compound`` on a thread pool; all workers
    share one rate limiter and one HTTP session. A progress bar advances once
    per completed lookup.

    Args:
        drug_set: Sized iterable of drug names (``len()`` is used for the
            progress-bar total).
        max_workers: Number of worker threads.
        max_per_second: Request rate cap shared by all workers. Defaults to 5,
            the rate that was previously hard-coded here.

    Returns:
        List of names for which ``is_valid_compound`` returned a truthy value,
        in completion order (not input order).
    """
    valid_drugs = []
    rate_limiter = RateLimiter(max_per_second=max_per_second)

    # The session is the outermost context so it is closed (releasing pooled
    # connections) only after the executor has joined every worker thread.
    with create_session() as session:
        with tqdm(total=len(drug_set), desc="Validating unique compounds") as pbar:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_drug = {
                    executor.submit(is_valid_compound, drug, rate_limiter, session): drug
                    for drug in drug_set
                }

                for future in as_completed(future_to_drug):
                    drug = future_to_drug[future]
                    try:
                        result = future.result()
                        if result:
                            valid_drugs.append(result)
                    except Exception as e:
                        # Best-effort: one failing lookup must not abort the batch.
                        print(f"Error processing {drug}: {e}")
                    finally:
                        pbar.update(1)

    return valid_drugs

In [None]:
def filter_valid_compounds(df, max_workers=4):
    """Keep only the rows of ``df`` whose ``'drug'`` value passes validation.

    Args:
        df: DataFrame with a ``'drug'`` column.
        max_workers: Forwarded to ``get_valid_compounds``.

    Returns:
        The subset of ``df`` whose ``'drug'`` value is among the names returned
        by ``get_valid_compounds``. Rows with a missing ``'drug'`` are excluded.
    """
    # Missing values can never validate; dropping them here avoids sending
    # them for lookup and keeps the "unique compounds" count honest.
    unique_drugs = set(df['drug'].dropna())
    valid_drugs = get_valid_compounds(unique_drugs, max_workers)

    # Filter the DataFrame to include only rows with valid drugs
    filtered_df = df[df['drug'].isin(valid_drugs)]
    print(f"\nFound {len(valid_drugs)} valid compounds out of {len(unique_drugs)} unique compounds")
    return filtered_df

In [None]:
# Run the validation over the loaded table (issues network requests).
filtered_df = filter_valid_compounds(df=df)

In [None]:
# Rows of the original table whose drug did not survive the filter.
invalid = (
    df.loc[~df["drug"].isin(filtered_df["drug"])]
    .reset_index(drop=True)
)

In [None]:
# Shape of the rejected rows, then of the kept rows, one per line.
print(invalid.shape, filtered_df.shape, sep="\n")

In [None]:
# The 50 least frequent rejected drug names (ascending is the sort_values default).
(
    invalid["drug"]
    .value_counts()
    .sort_values()
    .head(50)
)

In [None]:
# NOTE: this overwrites the same file the notebook loaded its input from.
# Failed lookups (timeouts, server errors) are counted as invalid upstream, so
# an outage could otherwise wipe the dataset; refuse to overwrite in that case.
if filtered_df.empty and not df.empty:
    raise RuntimeError(
        "No compounds were validated; refusing to overwrite the input file "
        "(check network access to PubChem and re-run)."
    )
filtered_df.to_csv("../data/intended_data_names.csv", index=False)