# Temperature map
- should extract the temperature, if present, for all biosamples
- if a range of temperatures is given, the average value should be taken
- for each biosample, the output should be a temperature value and its units
- temperature is extracted from attributes in {'air_temp', 'avg_temp', 'host_body_temp', 'soil_temp', 'surf_temp', 'temp', 'ww_temp'}

In [1]:
import xml.sax
import json
import re

----
# Pass 1
- takes the value from any matched attribute and, if it is not a bare number, uses a regular expression to extract the temperature and units (a number without a recognised unit is ignored)
- if the value can be converted to a float, it is assumed to be in degrees Celsius

In [84]:
class BioSamplesHandler(xml.sax.ContentHandler):
    '''
    SAX handler class to read in information from a BioSamples XML file
    - Collects the text of attributes whose name is in positive_attributes
    - Reads the value as a bare number (assumed degrees Celsius) or uses a
      regular expression to extract one or two temperatures and the units
        - Calculates the average temperature if a "low-high" range is given
        - Keeps only the value if "value ± uncertainty" is given
    - Appends one JSON line per sample with a temperature to output_file
    - Appends the accession of each sample that has a further temperature
      attribute after one was already parsed to multiple_temp_file

    Parameters
    - positive_attributes: container of attribute names treated as temperatures
    - output_file: path of the JSON-lines file to write (truncated at start)
    - multiple_temp_file: path of the text file listing samples with several
      temperature attributes (truncated at start)
    '''

    # number, optional '-' or '±' separator, optional second number, optional units
    _TEMPERATURE_PATTERN = re.compile(
        r'([-]?\d+[.]?\d*)\s*([-\u00b1]?)\s*(\d+[.]?\d*)?\s*(\u00b0c|\u2103|degrees? c|degrees? celsius|c|\u00b0f|\u2109|degrees? f|degrees? fahrenheit|f|\u212a|kelvin|k)?',
        re.IGNORECASE,
    )

    def __init__(self, positive_attributes, output_file, multiple_temp_file):
        super().__init__()
        self.positive_attributes = positive_attributes
        self.output_file = output_file
        self.multiple_temp_file = multiple_temp_file
        self.biosample_id = ''
        self.attribute_name = ''
        self.temp_attribute = ''
        self.cur_dict = {}
        self.is_sra = False
        self.sra_id = ''
        self.count = 0
        self.temp_val = None
        self.temp_units = 'C'
        # lower-cased unit spellings accepted by the regex -> canonical unit letter
        self.units_dict = {'\u00b0c': 'C', '\u2103': 'C', 'degrees c': 'C', 'degree c': 'C', 'degrees celsius': 'C', 'degree celsius': 'C', 'c': 'C',
                           '\u00b0f': 'F', '\u2109': 'F', 'degrees f': 'F', 'degree f': 'F', 'degrees fahrenheit': 'F', 'degree fahrenheit': 'F', 'f': 'F',
                           '\u212a': 'K', 'kelvin': 'K', 'k': 'K'}
        # text pieces of the element currently being collected (see characters)
        self._text_chunks = []

    def startDocument(self):
        '''Truncate both output files so that later appends start from empty files.'''
        for path in (self.output_file, self.multiple_temp_file):
            with open(path, 'w'):
                pass
        print('Started parsing...\n')

    def startElement(self, name, attrs):
        '''Record the sample accession, the current attribute name, or the start of an SRA id.'''
        if name == 'BioSample':
            self.biosample_id = attrs['accession']
        elif name == 'Attribute':
            try:
                self.attribute_name = attrs['harmonized_name']
            except KeyError:
                self.attribute_name = attrs['attribute_name']
            self._text_chunks = []
        elif name == 'Id':
            if 'db' in attrs and attrs['db'] == 'SRA':
                self.is_sra = True
                self._text_chunks = []

    def characters(self, content):
        '''
        Buffer text belonging to an SRA id or to a temperature attribute.

        A SAX parser may deliver the text of one element in several calls, so
        the pieces are only stored here and are joined in endElement.
        '''
        if self.is_sra or self.attribute_name in self.positive_attributes:
            self._text_chunks.append(content)

    def endElement(self, name):
        '''Process the buffered text of an element, or write out a finished sample.'''
        if name == 'Id':
            if self.is_sra:
                self.sra_id = ''.join(self._text_chunks)
                self.is_sra = False
        elif name == 'Attribute':
            if self.attribute_name in self.positive_attributes:
                self._handle_temperature_attribute(''.join(self._text_chunks))
            self.attribute_name = ''
        elif name == 'BioSample':
            if self.temp_val is not None:
                self.cur_dict = {
                    'biosample_id': self.biosample_id,
                    'sra_id': self.sra_id,
                    'attribute_name': self.temp_attribute,
                    'temperature': self.temp_val,
                    'units': self.temp_units,
                }

                with open(self.output_file, 'a') as f:
                    json.dump(self.cur_dict, f)
                    f.write('\n')

            self.count += 1
            if self.count % 1000000 == 0:
                print('{} samples parsed'.format(self.count))
            # reset variables
            self.biosample_id = ''
            self.attribute_name = ''
            self.temp_attribute = ''
            self.temp_val = None
            self.temp_units = 'C'
            self.sra_id = ''
            self._text_chunks = []

    def endDocument(self):
        print('Finished parsing\n')

    def _handle_temperature_attribute(self, text):
        '''Store the temperature parsed from text, or log the sample if it already has one.'''
        if self.temp_val is not None:
            print('Multiple temp attributes for {}:'.format(self.biosample_id), self.attribute_name)
            with open(self.multiple_temp_file, 'a') as f:
                f.write(self.biosample_id + '\n')
            return

        self.temp_attribute = self.attribute_name
        parsed = self._parse_temperature(text)
        if parsed is not None:
            self.temp_val, self.temp_units = parsed

    def _parse_temperature(self, text):
        '''
        Return (value, units) extracted from an attribute value, or None.

        A finite bare number is returned with units 'C'. Otherwise every regex
        match that carries a recognised unit is evaluated and the last one wins;
        numbers without a unit are ignored.
        '''
        # function-scope import keeps the class independent of the file-level imports
        import math

        try:
            value = float(text)
        except ValueError:
            pass
        else:
            # float() also accepts 'nan' and 'inf', which are not temperatures
            if math.isfinite(value):
                return value, 'C'

        parsed = None
        for first, separator, second, unit in self._TEMPERATURE_PATTERN.findall(text):
            # findall yields '' for groups that did not take part in the match
            if not unit:
                continue
            units = self.units_dict.get(unit.lower())
            if units is None:
                # defensive: skip an unmapped unit rather than abort a long parse
                continue
            if separator == '-' and second:
                # first is the lower bound, second is the upper bound
                value = (float(first) + float(second)) / 2
            else:
                # single value, "value ± uncertainty", or a dangling separator
                value = float(first)
            parsed = (value, units)
        return parsed



In [None]:
# Attribute names whose values are read as temperatures
positive_attributes = set((
    'air_temp',
    'avg_temp',
    'host_body_temp',
    'soil_temp',
    'surf_temp',
    'temp',
    'ww_temp',
))

# Location of the BioSamples XML file to parse
biosamples_path = '/home/ec2-user/workspace/data/biosample_set.xml'

# Build the handler first, then attach it to a fresh SAX parser
handler = BioSamplesHandler(
    positive_attributes,
    output_file='/home/ec2-user/workspace/data/results/temp_map_1.jsonl',
    multiple_temp_file='/home/ec2-user/workspace/data/results/multiple_temps_1.txt',
)
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# The parse itself is left commented out in the notebook:
# parser.parse(biosamples_path)

# The code was run from a python file so that it could run inside screen
# Run completed 2023-08-07 15:20:00 GMT-4

----
## Standardising output for input to Postgres
- want all temperature values in degrees celsius
    - for input to Postgres, want to enter a single numeric value 
- temperature should have 2 decimal places 

In [2]:
import pandas as pd
import numpy as np

# Load the JSON-lines output of pass 1 and make sure the temperature column is float
temp_df = pd.read_json(
    '/home/ec2-user/workspace/data/results/temp_map_1.jsonl',
    lines=True,
).astype({'temperature': float})

In [3]:
# Report how many biosamples were recorded with each unit
for label, unit in (
    ('Number of biosamples using degrees celsius: ', 'C'),
    ('Number of biosamples using degrees fahrenheit: ', 'F'),
    ('Number of biosamples using kelvin: ', 'K'),
):
    print(label, int((temp_df['units'] == unit).sum()))

Number of biosamples using degrees celsius:  171250
Number of biosamples using degrees fahrenheit:  363
Number of biosamples using kelvin:  80


In [20]:
# Show the rows recorded in kelvin
# NOTE(review): the recorded output contains 50 K and 100 K rows, far below any
# plausible sample temperature - the trailing 'k' may not mean kelvin there;
# check the raw attribute values.
temp_df.loc[temp_df['units'] == 'K']

Unnamed: 0,biosample_id,sra_id,attribute_name,temperature,units
6967,SAMN03248457,SRS777281,temp,298.15,K
6968,SAMN03248458,,temp,298.15,K
6969,SAMN03248459,,temp,298.15,K
6970,SAMN03248460,,temp,298.15,K
6971,SAMN03248461,,temp,298.15,K
...,...,...,...,...,...
169103,SAMN35449015,SRS17857065,temp,50.00,K
169104,SAMN35449018,SRS17857068,temp,100.00,K
169105,SAMN35449019,SRS17857070,temp,50.00,K
169106,SAMN35449022,SRS17857073,temp,100.00,K


In [23]:
def standardise_units(row):
    '''
    Return a copy of a temperature record converted to degrees Celsius.

    row must provide 'temperature' and 'units' entries. Units 'F' and 'K' are
    converted and relabelled as 'C'; any other units value is left unchanged.
    The input is not modified, because pandas does not support functions
    passed to DataFrame.apply mutating the object they receive.
    '''
    converted = row.copy()
    units = row['units']
    if units == 'F':
        converted['temperature'] = (row['temperature'] - 32) * 5/9
        converted['units'] = 'C'
    elif units == 'K':
        converted['temperature'] = row['temperature'] - 273.15
        converted['units'] = 'C'
    return converted

    
# Convert every record to degrees Celsius, one row at a time
temp_df = temp_df.apply(standardise_units, axis='columns')

In [26]:
# Build the table for Postgres: one numeric temperature per biosample, rounded
# to the 2 decimal places this notebook asks for (unit conversion leaves
# values such as 36.99999999999999 otherwise).
# NOTE(review): sra_id holds SRS... accessions in the recorded output, which
# look like SRA sample ids rather than run ids - confirm the 'run_id' name.
temp_df_postgres = (
    temp_df[['biosample_id', 'sra_id', 'temperature']]
    .rename(columns={'sra_id': 'run_id'})
    # make sure the column is numeric so that round() is applied to it
    .astype({'temperature': float})
    .round({'temperature': 2})
)
temp_df_postgres.to_csv('biosample_temperature.csv', index=False)