# AI/ML methods notebook



# install Python packages
Code cells in a Jupyter notebook can run Linux shell commands: prefix the command with an exclamation point (!). The commands run in a shell that belongs to whichever user is running the notebook. On your own computer (for example, a laptop) that is your normal user login; on Google Colab it is the root user.

In the code cell below, the provided pip commands are employed to install a range of Python libraries essential for the tasks covered in this notebook. It's worth noting that additional Python libraries are automatically installed within our virtual environment.

In [6]:
pip install pybiomart


Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install graphviz

Collecting graphviz
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz
Successfully installed graphviz-0.20.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
!pip install scikit-learn --no-cache
!pip install scanpy --no-cache
!pip install gseapy --no-cache
!pip install pydeseq2 --no-cache
!pip install pybiomart==0.1 --no-cache
!pip install mygene --no-cache
!pip install sklearn_som  --no-cache
!pip install pandas --no-cache
!pip install numpy --no-cache
!pip install matplotlib --no-cache
!pip install sklearn-som --no-cache
!pip install pyDeseq2 --no-cache
!pip install Ensembl_converter --no-cache
!pip install mygene --no-cache

Collecting gseapy
  Downloading gseapy-1.1.2.tar.gz (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.9/106.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gseapy
  Building wheel for gseapy (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gseapy: filename=gseapy-1.1.2-cp312-cp312-linux_x86_64.whl size=551121 sha256=de3af985c6e99b798578282eaf6713b8c94333eb3ad28c63308eef0123d00b44
  Stored in directory: /tmp/pip-ephem-wheel-cache-jl2uy8po/wheels/f4/a3/94/1f3656bd983eda161a9944b3cb35dd9267e65a4bd47b9ab24c
Successfully built gseapy
Installing collected packages: gseapy
Successfully installed gseapy-1.1.2
Collecting pydeseq2
  Downloading pydeseq2-0.4.8-py3-none-any.whl.metadata (7.0 kB)
Downloading pydeseq2-0.4.8-py3-none-any.whl

# import Python modules

This notebook imports a number of Python modules for use in several notebooks.

In [8]:
import requests
import json
import pandas as pd
from urllib.request import urlretrieve
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn_som.som import SOM
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import scanpy as sc
import gseapy as gp
from gseapy.plot import gseaplot
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from gseapy import Msigdb
from pybiomart import Server
import mygene
import seaborn as sns
from sklearn.decomposition import PCA, FastICA
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.linear_model import TweedieRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from math import log
import statsmodels.api as sm
import pylab
import operator
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance
from itertools import islice
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import argparse
import scipy.stats as stats
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')
from Ensembl_converter import EnsemblConverter

# define misc helper methods

In [10]:
def set_maxdisplay(n=None):
  """Apply one display limit to pandas and to the notebook front-end config.

  Args:
    n: Value written to pandas' ``display.max_rows`` option and to the
      ``limit_output`` key of the 'notebook' config section. The default
      ``None`` is forwarded unchanged to both.
  """
  pd.set_option('display.max_rows', n)
  # Imported lazily: the `notebook` package is only needed when this helper runs.
  from notebook.services.config import ConfigManager
  ConfigManager().update('notebook', {'limit_output': n})

# Define data ingestion methods

In [13]:
def read_meta_data(dataset):
  """Download an OSDR study's ISA metadata archive and load its sample table.

  The archive is saved in the current working directory as
  '<dataset>-meta.zip', extracted there (overwriting files left by earlier
  runs), and the tab-separated file 's_OSD-<dataset>.txt' is read.

  Args:
    dataset: OSD study number, e.g. 255 or '255'.

  Returns:
    DataFrame read from 's_OSD-<dataset>.txt'.

  Raises:
    zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
  """
  import zipfile  # local import: only this helper needs it

  # Normalise once so both an int (255) and a str ('255') work in every
  # concatenation below.
  dataset = str(dataset)
  url = ('https://osdr.nasa.gov/geode-py/ws/studies/OSD-' + dataset
         + '/download?source=datamanager&file=OSD-' + dataset
         + '_metadata_OSD-' + dataset + '-ISA.zip')
  filename = dataset + '-meta.zip'
  urlretrieve(url, filename)

  # Pure-Python extraction: no external `unzip` binary or IPython shell
  # escape required, so this also runs outside a notebook.
  with zipfile.ZipFile(filename) as archive:
    archive.extractall()

  return pd.read_csv('s_OSD-' + dataset + '.txt', sep='\t', header=0)

In [11]:
def read_rnaseq_data(data):
  """Load one OSDR RNA-seq table directly from its download URL.

  Args:
    data: File stem such as '255_rna_seq_Normalized_Counts'. The text before
      the first underscore is used as the OSD study number; the full stem,
      prefixed with 'GLDS-' and suffixed with '.csv', names the file requested.

  Returns:
    Whatever ``pd.read_csv`` returns for that URL.
  """
  study_id = data.partition('_')[0]
  url = f'https://osdr.nasa.gov/geode-py/ws/studies/OSD-{study_id}/download?source=datamanager&file=GLDS-{data}.csv'
  return pd.read_csv(url)

In [12]:
def read_phenotype_data(dataset, data):
  """Load one OSDR phenotype results table directly from its download URL.

  Args:
    dataset: OSD study number, e.g. '557' or 557.
    data: File stem without the '.csv' suffix, e.g.
      'LSDS-1_immunostaining_microscopy_PNAtr_Transformed_Reusable_Results'.

  Returns:
    Whatever ``pd.read_csv`` returns for that URL.
  """
  # Single slash after the host, matching the other OSDR loaders in this notebook.
  url = ('https://osdr.nasa.gov/geode-py/ws/studies/OSD-' + str(dataset)
         + '/download?source=datamanager&file=' + data + '.csv')
  return pd.read_csv(url)

# define data filtering methods

In [14]:
def filter_cvs(df, thresh=0.5):
  """Keep the rows whose coefficient of variation (CV) exceeds ``thresh``.

  The CV of a row is the population standard deviation (ddof=0) divided by
  the mean, both taken over every column except the first. A 20-bin
  histogram of the CVs is drawn on a new figure as a side effect.

  Args:
    df: DataFrame whose first column is an identifier and whose remaining
      columns are numeric.
    thresh: Rows are kept only when their CV is strictly greater than this.

  Returns:
    The rows of ``df`` (selected by position) with CV > ``thresh``. Rows whose
    CV is undefined (zero mean, or no usable values) are never kept.
  """
  values = df.iloc[:, 1:].astype(float)
  means = values.mean(axis=1)
  sds = values.std(axis=1, ddof=0)

  # A zero mean makes the CV undefined; mask it to NaN instead of dividing by
  # zero so all-zero rows neither raise nor put inf/NaN into the histogram.
  cvs = (sds / means.where(means != 0)).to_numpy(dtype=float)
  defined = np.isfinite(cvs)

  # plot the distribution of the CVs that could be computed
  fig, axs = plt.subplots()
  axs.hist(cvs[defined], bins=20)

  # NaN > thresh is False, but make the exclusion explicit
  keep = np.flatnonzero(defined & (cvs > thresh))
  return df.iloc[keep]


In [15]:
def drop_nans(df):
  """Remove, in place, every row of ``df`` that contains a missing value.

  Args:
    df: DataFrame to clean. It is modified directly.

  Returns:
    The same ``df`` object, after the rows were dropped.
  """
  df.dropna(axis=0, how='any', inplace=True)
  return df


In [16]:
def drop_lowcount(df, threshold=10):
  """Drop rows whose summed counts fall below ``threshold``.

  The identifier column is 'transcript' if present, otherwise 'Unnamed: 0';
  it is left out of the row sum. When the identifier column was 'Unnamed: 0'
  it is renamed to 'transcript' in the returned frame. The number of input
  rows is printed before filtering.

  Args:
    df: Count table with one identifier column and numeric sample columns.
    threshold: Minimum row sum (inclusive) for a row to be kept.

  Returns:
    A new DataFrame holding only the rows whose sum is >= ``threshold``.

  Raises:
    Exception: if neither 'transcript' nor 'Unnamed: 0' is a column.
  """
  print(len(df))

  if 'transcript' in df.columns:
    id_col = 'transcript'
  elif 'Unnamed: 0' in df.columns:
    id_col = 'Unnamed: 0'
  else:
    raise Exception("check file format")

  row_totals = df.drop(columns=[id_col]).sum(axis=1)
  kept = df[row_totals >= threshold]
  if id_col == 'Unnamed: 0':
    kept = kept.rename(columns={"Unnamed: 0": "transcript"})
  return kept


In [17]:
def filter_genes(df, drop='non-coding', id_col=None, dataset_name='mmusculus_gene_ensembl'):
  """Keep only protein-coding (or only non-protein-coding) genes.

  Gene biotypes are fetched from the Ensembl BioMart server and matched
  against the gene-ID column of ``df``.

  Args:
    df: Table with one column of Ensembl gene IDs.
    drop: 'non-coding' keeps protein-coding genes, 'coding' keeps everything
      else. ``None`` or any other value returns ``df`` unchanged.
    id_col: Name of the gene-ID column. When ``None`` the column is detected:
      'Unnamed: 0' (the name of the ID column in freshly loaded tables) is
      tried first, then 'transcript' (the name ``drop_lowcount`` gives it).
    dataset_name: BioMart dataset to query; defaults to mouse.

  Returns:
    The rows of ``df`` whose gene ID belongs to the requested group.

  Raises:
    KeyError: if ``id_col`` is ``None`` and no gene-ID column can be found.
  """
  # Nothing to do: return before the (slow) BioMart round-trip.
  if drop not in ('non-coding', 'coding'):
    return df

  if id_col is None:
    for candidate in ('Unnamed: 0', 'transcript'):
      if candidate in df.columns:
        id_col = candidate
        break
    else:
      raise KeyError("no gene ID column found; expected 'Unnamed: 0' or 'transcript'")

  server = Server(host='http://www.ensembl.org')
  mart = server.marts['ENSEMBL_MART_ENSEMBL'].datasets[dataset_name]
  # pybiomart labels the result columns with display names
  # ('Gene stable ID', 'Gene type'), not with the attribute names requested.
  gene_info = mart.query(attributes=['ensembl_gene_id', 'gene_biotype'])

  is_coding = gene_info['Gene type'] == 'protein_coding'
  wanted = is_coding if drop == 'non-coding' else ~is_coding
  keep_ids = gene_info.loc[wanted, 'Gene stable ID']
  return df[df[id_col].isin(keep_ids)]

In [18]:
def filter_data(df, dropnans=False, dropgenes='non-coding', droplowcvs=0):
  """Run the enabled filtering steps over ``df`` in a fixed order.

  Order: NaN removal, gene-biotype filtering, coefficient-of-variation
  filtering. Each step receives the output of the previous one.

  Args:
    df: Table to filter.
    dropnans: When truthy, pass the table through ``drop_nans``.
    dropgenes: Forwarded as ``drop`` to ``filter_genes`` unless it is ``None``.
    droplowcvs: CV threshold forwarded to ``filter_cvs``; 0 skips the step.

  Returns:
    The table after all enabled steps.
  """
  steps = (
    (bool(dropnans), lambda frame: drop_nans(frame)),
    (dropgenes is not None, lambda frame: filter_genes(frame, drop=dropgenes)),
    (droplowcvs != 0, lambda frame: filter_cvs(frame, droplowcvs)),
  )
  for enabled, step in steps:
    if enabled:
      df = step(df)
  return df

In [19]:
def exclude_samples_by_prefix(df, prefix="V", colname="Source Name"):
  """List the sample names that start with ``prefix``.

  Args:
    df: Table holding the sample names.
    prefix: Leading text that marks a sample for exclusion.
    colname: Column of ``df`` that holds the sample names (strings).

  Returns:
    The matching names, in their original order (duplicates are kept).
  """
  return [name for name in list(df[colname].values) if name.startswith(prefix)]

# data transformation methods

In [20]:
def transpose_df(df, cur_index_col, new_index_col):
  """Transpose a table around one of its columns.

  The values of ``cur_index_col`` become the column headers; the former
  column headers become the values of a new first column named
  ``new_index_col``.

  Args:
    df: Table to transpose; it is not modified.
    cur_index_col: Column whose values become the new headers.
    new_index_col: Name given to the first column of the result.

  Returns:
    The transposed DataFrame with a fresh 0..n-1 index.
  """
  flipped = df.set_index(cur_index_col).transpose().reset_index(level=0)
  flipped.columns = [new_index_col, *list(flipped.columns)[1:]]
  return flipped

In [21]:
def reduce_dims(df, current_key, new_key, n):
  """Select the ``|n|`` rows with the highest (or lowest) standard deviation.

  The standard deviation of each row is taken across its numeric columns.

  Args:
    df: Table with one row per feature; non-numeric columns (such as an ID
      column) are ignored when computing the standard deviation.
    current_key: Unused; kept so existing calls keep working.
    new_key: Unused; kept so existing calls keep working.
    n: Number of rows to keep. A positive value keeps the most variable rows,
      a negative value keeps the least variable ones.

  Returns:
    The selected rows of ``df``, ordered from most to least variable when
    ``n`` >= 0 and from least to most variable when ``n`` < 0.
  """
  # Re-index by position so the ranking can be handed straight to .iloc.
  sd = pd.Series(df.std(axis=1, numeric_only=True).to_numpy())
  print('len of sdlist: ', str(len(sd)))

  # Rank positions by value. Sorting the Series (rather than a dict keyed by
  # the std value) keeps rows that share the same std, and a stable sort keeps
  # ties in their original order. Rows whose std is NaN are ranked last.
  ranked = sd.sort_values(ascending=(n < 0), kind='mergesort', na_position='last')
  print('n: ', n)

  indices = list(ranked.index[:abs(n)])
  return df.iloc[indices]

In [22]:
def convert_pd_to_np(df):
  """Turn the data columns of ``df`` into a NumPy array, one row per column.

  Args:
    df: Table whose first column is skipped; every later column becomes one
      row of the result.

  Returns:
    ``np.array`` built from the list of column value lists, so row ``i``
    holds the values of the ``(i + 1)``-th column of ``df``.
  """
  per_column_values = [list(df[name]) for name in df.columns[1:]]
  return np.array(per_column_values)

In [25]:
def get_symbol_from_id(gene_id_list):
  """Translate Ensembl gene IDs into gene symbols.

  Args:
    gene_id_list: Ensembl IDs handed to ``EnsemblConverter.convert_ids``.

  Returns:
    A list with the 'Symbol' entry of every row of the converter's result,
    in row order.
  """
  result = EnsemblConverter().convert_ids(gene_id_list)
  # Collect the symbol row by row from the result table.
  return [result.iloc[row]['Symbol'] for row in range(len(result))]

# plotting methods

In [27]:


def plotbox_and_stats(data_, sample_key, field, treatment, space, exclude_samples=None):
  """Compare one measurement between flight and non-flight samples.

  Samples are split into two groups, three two-sample tests are run on the
  groups (Welch t-test, Wilcoxon rank-sum, Kolmogorov-Smirnov), the p-values
  and group sizes are printed, and a box plot of both groups is shown.

  Args:
    data_: Table with one row per sample.
    sample_key: Column holding the sample names.
    field: Column holding the measurement to compare.
    treatment: Column holding the treatment label, or ``None`` to classify by
      name instead (names starting with 'F' count as flight).
    space: Value of ``treatment`` that marks a flight sample.
    exclude_samples: Sample names to leave out; ``None`` (default) excludes
      nothing.

  Returns:
    None. The results are only printed and plotted.
  """
  # Avoid a shared mutable default argument.
  excluded = [] if exclude_samples is None else exclude_samples
  print('field: ', field)
  print('excluding samples: ', excluded)

  flight = str(field) + '_flight'
  nonflight = str(field) + '_nonflight'
  value_dict = {flight: [], nonflight: []}
  results = {field: {}}

  # Read each column once instead of re-materialising every row with .iloc.
  samples = data_[sample_key].tolist()
  values = data_[field].tolist()
  labels = [None] * len(samples) if treatment is None else data_[treatment].tolist()

  for sample, value, label in zip(samples, values, labels):
    if sample in excluded:
      continue
    if treatment is None:
      is_flight = sample.startswith('F')
    else:
      is_flight = label == space
    value_dict[flight if is_flight else nonflight].append(value)

  flight_values = value_dict[flight]
  nonflight_values = value_dict[nonflight]

  # The tests need both groups; with an empty group the result dict stays empty.
  if len(flight_values) != 0 and len(nonflight_values) != 0:
    results[field]['t-test p-value'] = float('%.5f' % (stats.ttest_ind(flight_values, nonflight_values, equal_var=False).pvalue))
    results[field]['wilcoxon p-value'] = float('%.5f' % (stats.ranksums(flight_values, nonflight_values).pvalue))
    results[field]['ks-test p-value'] = float('%.5f' % (stats.kstest(flight_values, nonflight_values).pvalue))

  print(results)
  print('n flight = ', len(flight_values))
  print('n nonflight = ', len(nonflight_values))

  fig, ax = plt.subplots()
  # Hand matplotlib plain lists rather than dict views.
  ax.boxplot([flight_values, nonflight_values])
  ax.set_xticklabels([flight, nonflight])
  ax.set_ylabel(str(field))
  plt.xticks(rotation=30, ha='right')
  plt.show()



# machine learning methods


In [28]:
# define a method to run the k-means algorithm and then print which cluster each sample belongs to
def my_kmeans(df, metadata, k, dataset_key='255', factor='Factor Value[Spaceflight]'):
  """Cluster the samples with k-means and print each sample's cluster.

  Args:
    df: Expression table; the first column is an identifier and every other
      column is one sample.
    metadata: Mapping from dataset key to that study's sample table, which
      must have a 'Sample Name' column and the ``factor`` column.
    k: Number of clusters.
    dataset_key: Key of the study in ``metadata`` (default '255').
    factor: Metadata column printed next to each sample's cluster.

  Returns:
    The fitted ``KMeans`` model.
  """
  # one row per sample
  X = convert_pd_to_np(df)
  kmeans = KMeans(n_clusters=k, random_state=42, init="k-means++").fit(X)

  study_meta = metadata[dataset_key]
  for sample in df.columns[1:]:
    cluster = kmeans.predict([list(df[sample])])
    condition = study_meta[study_meta['Sample Name'] == sample][factor].values[0]
    print('sample: ', sample, ', cluster: ', cluster, condition)
  return kmeans

In [29]:
# define a method that graphs the within-cluster-sum-of-squares metric to determine the optimum value of k (the elbow method)
def find_k_elbow(df, max_k=10):
  """Plot within-cluster sum of squares (WCSS) against k for the elbow method.

  Args:
    df: Expression table; the first column is an identifier and every other
      column is one sample.
    max_k: Largest number of clusters to try (default 10). It is capped at
      the number of samples, because k-means cannot fit more clusters than
      there are samples.

  Returns:
    None. The points are drawn on the current matplotlib axes.
  """
  # one row per sample
  X = convert_pd_to_np(df)
  ks = list(range(1, min(max_k, len(X)) + 1))

  wcss = []
  for k in ks:
    # NOTE(review): n_init grows with k, as in the original code, so larger k
    # gets more restarts. Confirm this is intended rather than a fixed value.
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=k)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

  # plot wcss
  plt.scatter(ks, wcss)
  plt.xlabel('k (number of clusters)')
  plt.ylabel('within-cluster sum of squares')

In [30]:
def my_gmm(df, metadata, k, dataset_key='255', factor='Factor Value[Spaceflight]'):
  """Fit a Gaussian mixture to the samples and print each sample's component.

  Args:
    df: Expression table; the first column is an identifier and every other
      column is one sample.
    metadata: Mapping from dataset key to that study's sample table, which
      must have a 'Sample Name' column and the ``factor`` column.
    k: Number of mixture components.
    dataset_key: Key of the study in ``metadata`` (default '255').
    factor: Metadata column printed next to each sample's component.

  Returns:
    The fitted ``GaussianMixture`` model.
  """
  # one row per sample
  X = convert_pd_to_np(df)
  gm = GaussianMixture(n_components=k, random_state=42).fit(X)

  study_meta = metadata[dataset_key]
  for sample in df.columns[1:]:
    component = gm.predict([list(df[sample])])
    condition = study_meta[study_meta['Sample Name'] == sample][factor].values[0]
    print('sample: ', sample, ', cluster: ', component, condition)
  return gm

In [31]:
def find_gmm_elbow(df, max_components=10):
  """Plot AIC and BIC of Gaussian mixtures with 1..max_components components.

  Args:
    df: Expression table; the first column is an identifier and every other
      column is one sample.
    max_components: Largest number of components to try (default 10). It is
      capped at the number of samples, because a mixture cannot be fitted
      with more components than samples.

  Returns:
    None. A new figure with both curves is shown.
  """
  # one row per sample
  X = convert_pd_to_np(df)
  n_components = list(range(1, min(max_components, len(X)) + 1))

  # random_state makes the curves reproducible between runs.
  # NOTE(review): n_init=42 (42 restarts per model) is kept from the original;
  # it may have been meant as the seed. Confirm before lowering it.
  models = [GaussianMixture(n_components=n, n_init=42, random_state=42).fit(X)
            for n in n_components]
  aics = [model.aic(X) for model in models]
  bics = [model.bic(X) for model in models]

  plt.figure(dpi=100)
  plt.plot(n_components, aics, label='AIC')
  plt.plot(n_components, bics, label='BIC')
  plt.legend(loc='best')
  plt.xlabel('n_components')
  plt.ylabel('AIC or BIC')
  plt.show()

# Differential gene expression analysis methods


In [32]:
def map_samples_to_conditions(dfT, metadata, metadata_condition_param, condition_0, condition_1):
  """Label every sample 0 or 1 according to its metadata condition.

  A sample whose metadata value equals ``condition_0`` is coded 0; every
  other sample is coded 1. The codes are also written to a new 'condition'
  column of ``dfT`` (``dfT`` is modified in place).

  Args:
    dfT: Table with a 'sample' column holding the sample names.
    metadata: Sample table with a 'Sample Name' column and the
      ``metadata_condition_param`` column.
    metadata_condition_param: Metadata column that holds the condition.
    condition_0: Condition value coded as 0.
    condition_1: Not consulted; any value other than ``condition_0`` is
      coded as 1.

  Returns:
    A two-column DataFrame ('sample', 'condition') taken from ``dfT``.
  """
  def encode(sample):
    # first metadata row registered under this sample name
    observed = metadata.loc[metadata['Sample Name'] == sample, metadata_condition_param].values[0]
    return 0 if observed == condition_0 else 1

  condition_dict = {sample: encode(sample) for sample in list(dfT['sample'])}
  dfT["condition"] = dfT["sample"].map(condition_dict)
  return dfT[['sample', 'condition']]

In [33]:
def run_deseq2(df, metadata, condition_param='Factor Value[Spaceflight]',
               condition_0='Ground Control', condition_1='Space Flight'):
  """Fit a DESeq2 model that contrasts two sample conditions.

  Args:
    df: Count table whose first column holds the gene IDs and whose remaining
      columns are one sample each.
    metadata: Sample table with a 'Sample Name' column and the
      ``condition_param`` column.
    condition_param: Metadata column that holds the condition of each sample.
    condition_0: Condition value coded as 0 in the design.
    condition_1: Passed on to ``map_samples_to_conditions`` together with
      ``condition_0``.

  Returns:
    The ``DeseqDataSet`` after ``deseq2()`` has been run on it.
  """
  # transpose df: one row per sample, one column per gene, plus a 'sample' column
  dfT = df.T
  dfT.columns = dfT.iloc[0]
  dfT = dfT.iloc[1:]
  dfT.columns.name = None
  dfT = dfT.reset_index().rename(columns={"index": "sample"})

  # map conditions (this also adds a 'condition' column to dfT)
  conditions = map_samples_to_conditions(dfT, metadata, condition_param, condition_0, condition_1)

  # DESeq2 takes integer counts only, so the values are cast to int here.
  counts = dfT.drop(columns=['sample', 'condition']).reset_index(drop=True)
  counts = counts.astype(int)

  # run DESeq2
  dds = DeseqDataSet(counts=counts, metadata=conditions, design_factors="condition")
  dds.deseq2()

  return dds

In [34]:
def get_results(dds):
  """Run the differential-expression statistics for a fitted DESeq2 data set.

  Args:
    dds: Data set returned by ``run_deseq2``.

  Returns:
    The ``results_df`` attribute of the ``DeseqStats`` object, read after its
    ``summary()`` method has been called.
  """
  # compare level '0' of the 'condition' factor against level '1'
  contrast = ('condition', '0', '1')
  stats_results = DeseqStats(dds, contrast = contrast)
  stats_results.summary()
  return stats_results.results_df

In [36]:
def get_sig_genes(res, pval=0.05, l2fc=0):
  """Select the genes that pass both significance cut-offs.

  Args:
    res: Results table with 'padj' and 'log2FoldChange' columns.
    pval: Upper bound (exclusive) on the adjusted p-value.
    l2fc: Lower bound (exclusive) on the absolute log2 fold change.

  Returns:
    The rows of ``res`` with padj < ``pval`` and |log2FoldChange| > ``l2fc``.
  """
  is_significant = res['padj'] < pval
  is_large_change = res['log2FoldChange'].abs() > l2fc
  return res.loc[is_significant & is_large_change]

In [35]:
def get_dge_ranked_genes(res):
  """Rank genes by their test statistic, highest first.

  Args:
    res: Results table with a 'stat' column, indexed by gene name (strings).

  Returns:
    A one-column DataFrame ('stat') without missing values, sorted in
    descending order of 'stat', with the gene names upper-cased.
  """
  ranking = res.loc[:, ['stat']].dropna().sort_values(by='stat', ascending=False)
  ranking.index = [gene.upper() for gene in list(ranking.index)]
  return ranking

In [37]:
def filter_by_dgea(data, metadata,  pval, l2fc):
  """Keep only the genes that are significantly differentially expressed.

  Runs DESeq2 on ``data``, selects the genes that pass the significance
  cut-offs, and filters ``data`` down to those genes.

  Args:
    data: Count table whose first column holds the gene IDs and whose
      remaining columns are one sample each.
    metadata: Sample table passed on to ``run_deseq2``.
    pval: Adjusted p-value cut-off passed to ``get_sig_genes``.
    l2fc: Absolute log2 fold-change cut-off passed to ``get_sig_genes``.

  Returns:
    The rows of ``data`` whose gene ID is among the significant genes.
  """
  # run DESeq2
  dds = run_deseq2(data, metadata)

  # get results
  res = get_results(dds)

  # get sig genes (indexed by gene ID)
  sig_genes_df = get_sig_genes(res, pval=pval, l2fc=l2fc)

  # run_deseq2 takes the gene IDs from the first column, whatever its name,
  # so filter on that same column instead of a hard-coded header.
  id_col = data.columns[0]
  return data[data[id_col].isin(sig_genes_df.index)]
