In [1]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os.path
import pickle
import pandas as pd
import yaml
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import DC, RDFS, FOAF, DCTERMS, VOID, RDF, XSD, OWL
import requests
import numpy as np

# Cleaning the spreadsheet

This notebook will no longer be necessary once the final spreadsheet is published as supplementary material.

In [2]:
# Read the settings used to reach the Google spreadsheet (OAuth scopes,
# spreadsheet id and the range to download) from the local YAML file.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

SCOPES = config['SCOPES']
SPREADSHEET_ID = config['SPREADSHEET']
SHEET = config['SHEET']

FileNotFoundError: [Errno 2] No such file or directory: 'config.yaml'

In [None]:
def get_google_sheet(sheet, spreadsheet_id=SPREADSHEET_ID):
    """Download a range of a Google spreadsheet as a DataFrame.

    OAuth credentials are cached in ``token.pickle`` in the working
    directory. When that cache is missing or the cached credentials are not
    valid, they are refreshed if they are expired and carry a refresh token;
    otherwise a local-server login flow built from ``credentials.json`` and
    ``SCOPES`` is run. In both cases the credentials are written back to
    ``token.pickle``.

    Parameters
    ----------
    sheet
        Range passed to the Sheets API ``values().get`` request.
    spreadsheet_id
        Spreadsheet to read from. Defaults to the value ``SPREADSHEET_ID``
        had when this function was defined.

    Returns
    -------
    pandas.DataFrame or None
        The returned values with the first row used as column labels and
        removed from the data, so the first remaining row keeps index 1.
        ``None`` (after printing ``No data found.``) when the response
        contains no values.
    """
    token_path = 'token.pickle'
    credentials = None

    # NOTE(security): pickle.load can execute arbitrary code, so only a
    # token file produced by this function should ever be placed here.
    if os.path.exists(token_path):
        with open(token_path, 'rb') as token_file:
            credentials = pickle.load(token_file)

    if not (credentials and credentials.valid):
        can_refresh = credentials and credentials.expired and credentials.refresh_token
        if can_refresh:
            credentials.refresh(Request())
        else:
            login_flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            credentials = login_flow.run_local_server()
        # Persist the new or refreshed credentials for the next call.
        with open(token_path, 'wb') as token_file:
            pickle.dump(credentials, token_file)

    sheets_api = build('sheets', 'v4', credentials=credentials)
    request = sheets_api.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=sheet)
    rows = request.execute().get('values', [])

    if not rows:
        print('No data found.')
        return None

    # The first row provides the column labels; tail(-1) then drops it from
    # the data while leaving the remaining index values untouched.
    header = rows[0]
    return pd.DataFrame(rows, columns=header).tail(-1)

In [None]:
# Download the configured range; `data` is None if the range came back empty.
data = get_google_sheet(sheet=SHEET)

In [None]:
# Spot-check four randomly chosen rows.
data.sample(n=4)

Unnamed: 0,reference_id,material_description,source,erm,material_type,placeholder,material_iri,material_synthesis_characterization_quote,s,p,...,pah_impurities_instrument,endotoxin_impurities,endotoxin_impurities_unit,endotoxin_impurities_instrument,endotoxins_impurities_instrument,crystal_structure,crystal_structure_instrument,polidispersity_index_(%),doping,polidispersity_index_instrument
128,27,GO,https://doi.org/10.1021/tx400385x,GO27,GO,http://purl.enanomapper.org/onto/,http://purl.enanomapper.org/onto/GO,,exposure,positively_correlates,...,,,,,,,,,,
224,38,GO,https://doi.org/10.3390%2Fnano9081180,GO38,GO,http://purl.enanomapper.org/onto/,http://purl.enanomapper.org/onto/GO,,Hydrophilicity,correlates,...,,,,,,,,,,
119,26,rGO,https://doi.org/10.1021/tx400385x,GO26,GO,http://purl.enanomapper.org/onto/,http://purl.enanomapper.org/onto/GO,,exposure,positively_correlates,...,,,,,,,,,,
368,40,,unspecific,,,,,,Hydrophilicity,correlates,...,,,,,,,,,,


In [None]:
# Show the first two rows.
data.head(n=2)

Unnamed: 0,reference_id,material_description,source,erm,material_type,placeholder,material_iri,material_synthesis_characterization_quote,s,p,...,pah_impurities_instrument,endotoxin_impurities,endotoxin_impurities_unit,endotoxin_impurities_instrument,endotoxins_impurities_instrument,crystal_structure,crystal_structure_instrument,polidispersity_index_(%),doping,polidispersity_index_instrument
1,2,GO,https://doi.org/10.1002/smll.201201546,GO2,GO,http://purl.enanomapper.org/onto/,http://purl.enanomapper.org/onto/GO,,exposure,negatively_correlates,...,,,,,,,,,,
2,1,Au-GO,https://doi.org/10.1002.smll.201102743,GO1,GO,http://purl.enanomapper.org/onto/,http://purl.enanomapper.org/onto/GO,,,,...,,,,,,,,,,


In [None]:
# Keep only rows that have a usable source (neither empty nor 'unspecific')
# and non-empty subject/object labels and IRIs.
keep = (data['source'] != 'unspecific') & (data['source'] != '')
for column in ('s', 'o', 's_iri', 'o_iri'):
    keep &= data[column] != ""
data = data[keep]

# Mark empty or whitespace-only cells with the "#N/A" placeholder.
data = data.replace(r'^\s*$', "#N/A", regex=True)
# NOTE(review): after the replacement above no empty strings should remain,
# so this call looks like a no-op. It is kept to preserve behaviour; confirm
# whether "#N/A" or NaN is the intended missing-value marker.
data = data.replace('', np.nan)

In [None]:
# Columns whose name contains "_iri".
nodes = [column for column in data.columns if "_iri" in column]
nodes

['material_iri',
 's_iri',
 'o_iri',
 'model_cell_iri',
 'organism_iri',
 'exposure_time_units_iri',
 'dose_units_iri',
 'endpoint_iri',
 'assay_iri',
 'primary_size_descriptor_iri',
 'hydrodymanic_diameter_method_iri']

In [None]:
# Columns whose name contains "_units" (this also matches "*_units_iri").
units = [column for column in data.columns if "_units" in column]
units

['exposure_time_units',
 'exposure_time_units_iri',
 'dose_units',
 'dose_units_iri',
 'endpoint_units']

In [None]:
# Columns whose name contains "quote".
quotes = [column for column in data.columns if "quote" in column]
quotes

['material_synthesis_characterization_quote',
 'quote',
 'primary_size_quote',
 'bundle_diameter_quote']

In [None]:
# Columns whose name contains "qualifier" (includes the "*_qualifier_value" ones).
qualifiers = [column for column in data.columns if "qualifier" in column]
qualifiers

['qualifier',
 'qualifier_value',
 'primary_size_qualifier',
 'primary_size_qualifier_value',
 'bundle_diameter_qualifier',
 'bundle_diameter_qualifier_value',
 'nominal_size_qualifier',
 'nominal_size_qualifier_value',
 'average_grain_size_qualifier',
 'average_grain_size_qualifier_value',
 'agglomerate_size_qualifier',
 'agglomerate_size_qualifier_value',
 'aggregate_size_qualifier',
 'aggregate_size_qualifier_value',
 'diameter_qualifier',
 'diameter_qualifier_value',
 'hydrodynamic_diameter_qualifier',
 'hydrodynamic_diameter_qualifier_value',
 'surface_area_qualifier',
 'surface_area_qualifier_value',
 'zeta_potential_qualifier',
 'zeta_potential_qualifier_value',
 'porosity_qualifier',
 'porosity_qualifier_value']

In [None]:
# Every remaining column, i.e. those not already listed in one of the
# quote / unit / node / qualifier groups. Column order is preserved.
grouped = {*quotes, *units, *nodes, *qualifiers}
others = [column for column in data.columns if column not in grouped]
others

['reference_id',
 'material_description',
 'source',
 'erm',
 'material_type',
 'placeholder',
 's',
 'p',
 'o',
 'original_study',
 'citesAsSourceDocument',
 'model_cell',
 'organism',
 'organism_age',
 'manufacturer_cell',
 'exposure_time',
 'dose',
 'exposure_route',
 'endpoint',
 'endpoint_measurement',
 'assay_equipment',
 'assay',
 'primary_size_descriptor__(ecd,_feret_diameter_,_length)',
 'primary_size',
 'primary_size',
 'primary_size_unit',
 'normalized_primary_size',
 'primary_size_method',
 'primary_size_instrument',
 'bundle_diameter',
 'bundle_diameter_unit',
 'normalized_bundle_diameter',
 'bundle_diameter_method',
 'bundle_diameter_instrument',
 'nominal_size',
 'nominal_size_unit',
 'average_grain_size',
 'average_grain_size_unit',
 'agglomerate_size',
 'agglomerate_size_unit',
 'agglomerate_size_method',
 'aggregate_size',
 'aggregate_size_unit',
 'aggregate_size_medium',
 'aggregate_size_instrument',
 'aggregate_size_method',
 'diameter_unit',
 'normalized_diameter',

In [None]:
# Write the cleaned table to disk; the DataFrame index is stored in a
# column labelled 'row_id'.
data.to_csv(
    '../data/causal_network.csv',
    index=True,
    index_label='row_id',
)