# Get host-subtype

This code extracts the `host_subtype` information for each sample from the SRA, where it is stored in the BioSample "Description" slot.

In [None]:
import pandas as pd
import requests
import bs4
import re

## Functions

In [None]:
# Get url of specific BioSample
def get_url(sample):
    """Build the NCBI BioSample web page URL for one sample.

    Args:
        sample: BioSample identifier; it is inserted into the URL as-is.

    Returns:
        The URL of the BioSample page as a string.
    """
    url_template = 'http://www.ncbi.nlm.nih.gov/biosample/{}'
    return url_template.format(sample)

# Download html page of specific BioSample
def get_html(sample, timeout=30):
    """Download the HTML page of one BioSample.

    Args:
        sample: BioSample identifier, passed on to ``get_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole run.

    Returns:
        The page body as ``str`` when the server answers with status 200.
        For any other status an error is printed and the full response
        object is returned instead, so the caller can inspect the failure.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, e.g. on a connection error or when ``timeout``
            expires.
    """
    response = requests.get(get_url(sample), timeout=timeout)
    if response.status_code != 200:
        print('ERROR: Full response object returned.')
        return response
    return response.text


def get_bs(sample):
    """Parse the HTML page of one BioSample into a BeautifulSoup object.

    Args:
        sample: BioSample identifier, passed on to ``get_html``.

    Returns:
        A ``bs4.BeautifulSoup`` object (built with the ``lxml`` parser) when
        ``get_html`` returned the page text. When ``get_html`` returned
        anything else (its failure value), an error is printed and that
        object is returned unchanged.
    """
    page = get_html(sample)
    # get_html only returns a string on success; anything else is passed
    # through untouched so the caller can see what went wrong.
    if not isinstance(page, str):
        print('ERROR: Full response object returned.')
        return page
    return bs4.BeautifulSoup(page, features='lxml')


def get_table(sample):
    """Return the single ``<div class="docsum">`` element of a BioSample page.

    Args:
        sample: BioSample identifier, passed on to ``get_bs``.

    Returns:
        The matching ``div`` tag when the page contains exactly one. If the
        page contains none or several, an error is printed and the whole
        soup object is returned. If ``get_bs`` did not return a soup object
        (its failure value), an error is printed and that object is returned
        unchanged.
    """
    soup = get_bs(sample)
    if not isinstance(soup, bs4.BeautifulSoup):
        print('ERROR: Full response object returned.')
        return soup

    tags = soup.find_all('div', class_='docsum')
    if len(tags) != 1:
        # Covers both "no table" and "several tables", hence the count.
        print(f'ERROR: Expected exactly one table, found {len(tags)}. '
              'Full soup object returned.')
        return soup
    return tags[0]

# Return dictionary with essential info on BioSample
def build_data_dict(sample):
    """Collect the essential information shown on a BioSample page.

    Every ``<dl>`` element found in the object returned by ``get_table`` is
    expected to hold exactly one ``<dt>`` (field name) and one ``<dd>``
    (field value); each pair becomes one entry of the result.

    Args:
        sample: BioSample identifier, passed on to ``get_table``.

    Returns:
        A dict mapping field names to their text. When the page has an
        'Identifiers' field and pandas finds exactly one HTML table at the
        BioSample URL, that table is stored as a DataFrame under
        'Attributes'. ``None`` is returned (after printing an error) when the
        page could not be retrieved, when a ``<dl>`` does not hold exactly
        one name/value pair, or when several BioSample IDs are listed.
    """
    table = get_table(sample)
    # On a failed download get_table hands back a non-bs4 object that has no
    # find_all(); stop here instead of crashing with an AttributeError.
    if not isinstance(table, bs4.Tag):
        print('ERROR: BioSample page could not be parsed.')
        return None

    final_data = {}
    for dl in table.find_all('dl'):
        descs = []
        values = []
        for child in dl.children:
            if child.name == 'dt':
                descs.append(' '.join(child.stripped_strings))
            elif child.name == 'dd':
                values.append(' '.join(child.stripped_strings))
        if not (len(descs) == 1 and len(values) == 1):
            print('ERROR: Input assumptions broken.')
            return None
        final_data[descs[0]] = values[0]

    if 'Identifiers' in final_data:
        found = False
        for field in final_data['Identifiers'].split(';'):
            # Split on the first colon only, so a value that itself contains
            # a colon does not break the unpacking.
            title, separator, value = field.partition(':')
            if not separator:
                continue
            if title.strip() == 'BioSample':
                if found:
                    print('ERROR: Multiple BioSample IDs found.')
                    return None
                # From here on use the BioSample ID listed on the page.
                sample = value.strip()
                found = True
        try:
            tables = pd.read_html(get_url(sample))
        except ValueError:
            # pandas raises ValueError when the page holds no table at all;
            # treat it like any other "not exactly one table" case.
            tables = []
        if len(tables) == 1:
            final_data['Attributes'] = tables[0]
    return final_data

# Build pandas DF with info on IBS subtype and sample ID
def final_table(sampledf):
    """Fill in the host subtype and host ID of every BioSample in a table.

    For each value of the "BioSample" column the BioSample page is queried
    through ``build_data_dict``. The page's "Description" is translated to a
    short subtype code and written to "host_subtype"; the text following
    "name: " in the page's "Identifiers" is written to "host_ID".

    Args:
        sampledf: DataFrame with a "BioSample" column. It is modified in
            place.

    Returns:
        The same DataFrame, with "host_subtype" and "host_ID" filled in for
        every sample whose page could be read. Samples without usable data
        are reported with an error message and left untouched.
    """
    # Descriptions not listed here are stored unchanged.
    subtype_codes = {
        "Alternating_IBS": "IBS-M",
        "IBS_Diarrhoea": "IBS-D",
        "IBS_Constipation": "IBS-C",
        "Control": "HC",
    }
    # The name runs up to the next ';' or, when it is the last identifier,
    # up to the end of the string.
    name_pattern = re.compile(r"name: (.*?)(?:;|$)")

    for sampl in sampledf["BioSample"]:
        print(sampl)
        info = build_data_dict(sampl)
        # build_data_dict returns None on failure; one bad page must not
        # abort the whole run.
        if not info or "Description" not in info or "Identifiers" not in info:
            print(f'ERROR: No usable BioSample data for {sampl}; sample skipped.')
            continue

        disease = subtype_codes.get(info["Description"], info["Description"])

        name_match = name_pattern.search(info["Identifiers"])
        if name_match is None:
            print(f'ERROR: No sample name found for {sampl}.')
            sampleID = ""
        else:
            sampleID = name_match.group(1)

        rows = sampledf["BioSample"] == sampl
        sampledf.loc[rows, "host_subtype"] = disease
        sampledf.loc[rows, "host_ID"] = sampleID
    return sampledf

## Obtain host_subtype

In [None]:
# Load the table that lists the BioSample IDs and sample names
sampledf = pd.read_csv("PozueloList.csv") # replace with PozueloSraRunTable.txt maybe

# Add the two output columns, empty until final_table() fills them
sampledf["host_subtype"], sampledf["host_ID"] = "", ""

# Query every BioSample and fill in both columns
df = final_table(sampledf)
#df["host_subtype"].value_counts() # sanity check

# Save the completed table as a .csv file, without the row index
df.to_csv(
    path_or_buf="/Users/scarcy/Projects/MetaIBS/scripts/analysis-individual/Pozuelo-2015/00_Metadata-Pozuelo/host_subtype.csv",
    index=False,
)