<a href="https://colab.research.google.com/github/dr-richard-barker/Space_Biology_and_AstroBotany.io/blob/main/OSDR_API_demo_GLDS-37.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Documentation for the OSDR API: https://genelab.nasa.gov/genelabAPIs

Useful tutorial on API json response parsing in Python: https://dataquest.io/blog/python-api-tutorial/


In [1]:
import requests
import json
import pandas as pd
# Remove pandas' cap on displayed rows so frames shown below are not truncated.
pd.options.display.max_rows = None

In [2]:
# Get all files for OSD-37
# The timeout keeps the cell from hanging indefinitely if the server stalls, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as a JSON-decoding failure or a KeyError in a downstream cell.
files_reply = requests.get('https://osdr.nasa.gov/genelab/data/glds/files/37', timeout=60)
files_reply.raise_for_status()
response = files_reply.json()  # parsed JSON body (a dict) used by the following cells

In [3]:
# Show the parsed JSON reply: paging fields plus a 'studies' entry whose
# 'study_files' list holds one record per file.
response

{'hits': 1,
 'input': '37',
 'page_number': 1,
 'page_size': 25,
 'page_total': 1,
 'studies': {'OSD-37': {'file_count': 1044,
   'study_files': [{'category': 'Study Metadata Files',
     'date_created': 1652504575.493,
     'date_updated': 1652504575.493,
     'file_name': 'GLDS-37_metadata_GLDS-37-ISA.zip',
     'file_size': 231237,
     'organization': 'genelab',
     'remote_url': '/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_metadata_GLDS-37-ISA.zip',
     'restricted': False,
     'subcategory': '',
     'subdirectory': '',
     'visible': True},
    {'category': 'GeneLab Processed RNA-Seq Files',
     'date_created': 1633654070.937,
     'date_updated': 1633654070.937,
     'file_name': 'GLDS-37_rna_seq_infer_exp_multiqc_report.zip',
     'file_size': 358550,
     'organization': 'genelab',
     'remote_url': '/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_infer_exp_multiqc_report.zip',
     'restricted': False,
     'subcate

In [4]:
# List the download path of every file in the study. The paths are relative:
# prefix each one with https://osdr.nasa.gov to obtain a usable URL.
study_files = response['studies']['OSD-37']['study_files']
for file_record in study_files:
    print(file_record['remote_url'])

/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_metadata_GLDS-37-ISA.zip
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_infer_exp_multiqc_report.zip
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Atha_Col-0_sl-pool_FLT_Rep1_R1-FL-A1.genes.results
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Atha_Col-0_sl-pool_FLT_Rep1_R1-FL-A1.isoforms.results
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Atha_Col-0_sl-pool_FLT_Rep2_R1-FL-A4.genes.results
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Atha_Col-0_sl-pool_FLT_Rep2_R1-FL-A4.isoforms.results
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Atha_Col-0_sl-pool_FLT_Rep3_R1-FL-B1.genes.results
/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Atha_Col-0_sl-pool_FLT_Rep3_R1-FL-B1.isoforms.results
/geode-py/ws/studies/OS

In [5]:
# Use pandas to read in the Normalized Counts CSV file.
# The download path uses study OSD-37, matching the 'remote_url' values the
# file listing reports for this study (the path previously named OSD-255).
# nrows=10 parses only the first ten rows, which is enough for a preview.
pd.read_csv('https://osdr.nasa.gov/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Normalized_Counts.csv', nrows=10)

Unnamed: 0.1,Unnamed: 0,Atha_Col-0_sl-pool_FLT_Rep1_R1-FL-A1,Atha_Col-0_sl-pool_FLT_Rep2_R1-FL-A4,Atha_Col-0_sl-pool_FLT_Rep3_R1-FL-B1,Atha_Col-0_sl-pool_FLT_Rep4_R1-FL-B4,Atha_Col-0_sl-pool_FLT_Rep5_R2-FL-A1,Atha_Col-0_sl-pool_FLT_Rep6_R2-FL-A4,Atha_Col-0_sl-pool_FLT_Rep7_R2-FL-B1,Atha_Col-0_sl-pool_FLT_Rep8_R2-FL-B4,Atha_Col-0_sl-pool_GC_Rep1_R1-GC-A1,...,Atha_Ws-2_sl-pool_FLT_Rep7_R2-FL-D1,Atha_Ws-2_sl-pool_FLT_Rep8_R2-FL-D4,Atha_Ws-2_sl-pool_GC_Rep1_R1-GC-C1,Atha_Ws-2_sl-pool_GC_Rep2_R1-GC-C4,Atha_Ws-2_sl-pool_GC_Rep3_R1-GC-D1,Atha_Ws-2_sl-pool_GC_Rep4_R1-GC-D4,Atha_Ws-2_sl-pool_GC_Rep5_R2-GC-C1,Atha_Ws-2_sl-pool_GC_Rep6_R2-GC-C4,Atha_Ws-2_sl-pool_GC_Rep7_R2-GC-D1,Atha_Ws-2_sl-pool_GC_Rep8_R2-GC-D4
0,AT1G01010,860.359965,766.817977,642.892504,740.410719,698.590796,591.939345,489.853168,425.461984,766.412109,...,351.282861,845.868219,542.423509,574.71744,530.351384,475.606848,440.925727,557.001021,510.889201,453.38519
1,AT1G01020,62.516677,77.775597,70.307423,64.946479,86.373982,85.04318,59.288878,87.263123,58.938755,...,32.714288,14.060577,54.656554,47.606845,47.379252,56.791355,28.5076,41.021755,72.000632,67.677132
2,AT1G01030,36.37971,34.833112,49.252818,61.450811,13.564211,66.046789,21.514897,52.601673,26.265494,...,146.53115,188.283255,85.169128,25.144138,30.96861,31.222407,107.473063,96.846638,48.412666,143.854844
3,AT1G01040,597.928046,614.649008,520.56852,573.440435,601.192961,610.481403,673.572046,838.615962,497.753884,...,884.652702,867.24819,1039.840796,779.927105,769.069106,823.773927,947.078972,933.775598,1000.589103,1069.349182
4,AT1G01046,0.0,0.885578,0.813968,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.870804,0.0,1.600628,3.170742,0.0,0.0
5,AT1G01050,482.699505,474.655499,495.564827,513.542065,546.029031,523.080066,347.922072,391.046316,417.324397,...,535.405804,383.169282,567.150127,596.290481,495.740185,492.913093,585.70982,662.997102,461.515587,646.214029
6,AT1G01060,211.436923,222.562509,308.380078,219.62149,242.639225,317.202919,262.022609,273.682466,259.454743,...,357.306874,429.42884,305.715068,281.420754,382.206022,285.126236,293.896647,233.226845,396.907408,376.72214
7,AT1G01070,52.654086,61.439693,63.323476,73.893281,68.748829,68.752531,104.367747,127.673995,107.388229,...,160.589694,41.53976,129.213138,170.899595,124.524876,226.807157,161.409421,208.381163,137.758683,242.348629
8,AT1G01080,93.856282,93.056373,104.024025,94.097595,83.812553,114.166884,51.72056,100.663997,82.006727,...,109.990053,460.280077,107.935949,107.560722,98.80075,91.119568,83.97721,53.713166,94.129285,102.544668
9,AT1G01090,1389.585021,1602.624192,1663.67016,1410.724849,1449.189663,1676.354074,1947.918644,1731.375653,1524.523477,...,1726.105594,822.768486,1391.069607,858.23522,1000.493503,1181.285206,1496.039996,1160.629122,1071.892661,1030.593689


Principal Component Analysis (PCA) is a dimensionality reduction technique that is commonly used in data analysis and machine learning. It works by transforming the original data into a new set of uncorrelated variables called principal components. These components are ordered by the amount of variance they explain in the data, with the first component explaining the most variance and subsequent components explaining less.

PCA helps in simplifying the data by capturing the most important patterns and relationships among variables. It is often used for visualization, noise reduction, feature extraction, and data compression. The explained variance ratio of each principal component indicates how much of the total variance in the data is accounted for by that component.

In [9]:
# Load the full Normalized Counts table (gene IDs in the first column, one
# numeric column per sample). The download path uses study OSD-37, matching the
# 'remote_url' values the file listing reports (the path previously named OSD-255).
data = pd.read_csv('https://osdr.nasa.gov/geode-py/ws/studies/OSD-37/download?source=datamanager&file=GLDS-37_rna_seq_Normalized_Counts.csv')



ValueError: could not convert string to float: 'AT1G01010'

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# `data` holds gene IDs (e.g. 'AT1G01010') in the 'Unnamed: 0' column and one
# numeric column per sample. Passing it to PCA unchanged raises
# "ValueError: could not convert string to float", so move the gene IDs into
# the index and keep only the numeric count columns.
counts_by_gene = data.set_index('Unnamed: 0').select_dtypes(include='number')

# Transpose so each sample is one observation and each gene one feature.
# NOTE(review): samples-as-observations is assumed to be the intent because a
# later cell joins the components to a per-sample table on 'Sample Name' -- confirm.
counts_by_sample = counts_by_gene.T

# Project every sample onto the first two principal components and keep the
# sample names as row labels so the scores stay traceable.
pca = PCA(n_components=2)
principal_components = pd.DataFrame(
    pca.fit_transform(counts_by_sample),
    index=counts_by_sample.index,
    columns=['PC1', 'PC2'],
)

# Scatter-matrix of the component scores.
sns.pairplot(principal_components)
plt.tight_layout()
plt.show()

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create a matrix of PCA plots: one panel per pair of components.
# Relies on `principal_components` having been computed by an earlier cell.
component_frame = pd.DataFrame(principal_components)
sns.pairplot(component_frame)
plt.tight_layout()
plt.show()

NameError: name 'principal_components' is not defined

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Build a samples x genes matrix from `data` (the table loaded above; the name
# `df` used here previously is never defined in this notebook). Gene IDs move
# from the 'Unnamed: 0' column into the index, only numeric columns are kept,
# and the table is transposed so each sample is one observation.
# NOTE(review): samples-as-observations is assumed to be the intent because a
# later cell joins the components to a per-sample table on 'Sample Name' -- confirm.
counts_by_sample = data.set_index('Unnamed: 0').select_dtypes(include='number').T

# Standardize the data: every gene gets zero mean and unit variance across samples
scaler = StandardScaler()
scaled_data = scaler.fit_transform(counts_by_sample)

# Perform PCA (all components are kept)
pca = PCA()
pca.fit(scaled_data)

# Get the principal components, labelled by sample name so they can be aligned
# with per-sample metadata
principal_components = pd.DataFrame(
    pca.transform(scaled_data),
    index=counts_by_sample.index,
)

# Explained variance ratio: share of total variance captured by each component
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio

I had the OSD-37-samples.csv sample table on my computer, so it was quick to load. In the future this should be replaced by an API call.

In [10]:
# Read the per-sample factors table from a CSV file in the working directory.
sample_table_path = 'OSD-37-samples.csv'
factors_df = pd.read_csv(sample_table_path)

FileNotFoundError: [Errno 2] No such file or directory: 'OSD-37-samples.csv'

In [11]:

# Merge the factors matrix with the principal components.
# NOTE(review): the join pairs each 'Sample Name' with the row label of the
# components frame, so it only finds matches when `principal_components` is
# indexed by sample name; a bare NumPy array gets a 0..n-1 integer index.
# Confirm how `principal_components` is built upstream.
components_df = pd.DataFrame(principal_components)
merged_df = pd.merge(factors_df, components_df, left_on='Sample Name', right_index=True)

# Calculate the correlation between the factors and principal components.
# numeric_only=True restricts the calculation to numeric columns; without it
# pandas >= 2.0 raises on text columns such as 'Sample Name'. Factors stored as
# text are therefore left out and would need numeric encoding to be included.
correlation_matrix = merged_df.corr(numeric_only=True)
correlation_matrix

NameError: name 'factors_df' is not defined

Work in progress: code to plot the data on a pathway graph. Note that the cells below are written in R, not Python.

In [12]:
# NOTE(review): this cell is R, not Python. It was recorded failing with
# "NameError: name 'install' is not defined", i.e. it was run by the Python
# kernel; it needs an R runtime to execute.
# NOTE(review): confirm where these two packages are published before relying
# on install.packages() -- KEGGgraph is normally distributed via Bioconductor.
install.packages(c("ggKEGG", "KEGGgraph"))
# Attach both packages so their functions can be called without a prefix.
library(ggKEGG)
library(KEGGgraph)

NameError: name 'install' is not defined

In [None]:
# Example for Glycolysis pathway
# NOTE(review): template only. `map00010` and `your_data_frame` are not defined
# in the cells shown here and must be supplied before this can run.
pathway_graph <- map_pathway(map00010, data = your_data_frame)

# Add customizations (optional)
pathway_graph +
  # Change node color based on data values
  # NOTE(review): the `...` below is a placeholder for real arguments.
  scale_color_gradientn(..., values = your_data_frame$values) +
  # Highlight specific elements
  theme_classic() +
  # ... more customizations
  # NOTE(review): the `+` after theme_classic() has no right-hand operand yet;
  # add another layer or remove that `+` before running.

In [None]:
# Plot the pathway by evaluating the object assigned in the previous cell.
pathway_graph