# AI/ML methods notebook



# install Python packages
Shell commands in this notebook run in the login shell of the environment where the notebook is executed. If you are running it on your own compute system, such as a laptop, that login is your user login on that system. When the notebook runs on Google Colab, the commands run as the root user. In a Jupyter notebook code cell, a line that begins with an exclamation point (!) is executed as a Linux shell command.

The code cell below uses pip to install the Python libraries needed for the tasks covered in this notebook. Note that additional Python libraries are installed automatically within our virtual environment.

In [1]:
!pip install scikit-learn --no-cache
!pip install scanpy --no-cache
!pip install gseapy --no-cache
!pip install pydeseq2 --no-cache
!pip install pybiomart --no-cache
!pip install mygene --no-cache
!pip install sklearn_som  --no-cache
!pip install pandas --no-cache
!pip install numpy --no-cache
!pip install matplotlib --no-cache
!pip install sklearn-som --no-cache
!pip install pyDeseq2 --no-cache

Collecting scanpy
  Downloading scanpy-1.10.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.10.6-py3-none-any.whl (122 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.1/122.1 kB[0m [31m153.5 MB/s[0m eta [36m0:00:00[0m
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4-py3-none-any.whl (15 kB)
Collecting pynndescent>=0.5 (from scanpy)
  Downloading pynndescent-0.5.12-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m127.3 MB/s[0m eta [36m0:00:00[0m
Collecting session-info (from scanpy)
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn!=0.5.0,>=0.5 (from scanpy)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━

# import Python modules

This notebook imports a number of Python modules for use in several notebooks.

In [2]:
import requests
import json
import pandas as pd
from urllib.request import urlretrieve
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn_som.som import SOM
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import scanpy as sc
import gseapy as gp
from gseapy.plot import gseaplot
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from gseapy import Msigdb
from pybiomart import Server
import mygene
import seaborn as sns
from sklearn.decomposition import PCA, FastICA
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.linear_model import TweedieRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from math import log
import statsmodels.api as sm
import pylab
import operator
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance
from itertools import islice
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# define misc helper methods

In [3]:
def set_maxdisplay(n=None):
  """Set the display limit used for long outputs.

  Sets the pandas ``display.max_rows`` option to ``n`` and, when the
  ``notebook.services.config`` module can be imported, also stores
  ``{'limit_output': n}`` in the ``notebook`` config section.

  Args:
    n: value applied to both settings (default ``None``).
  """
  pd.set_option('display.max_rows', n)
  try:
    from notebook.services.config import ConfigManager
  except ImportError:
    # The notebook config service is not available in every front end;
    # the pandas option above is still applied.
    return
  ConfigManager().update('notebook', {'limit_output': n})

# define data ingestion methods

In [4]:
def read_meta_data(dataset):
  """Download a study's ISA metadata archive and load its sample table.

  The archive ``OSD-<dataset>_metadata_OSD-<dataset>-ISA.zip`` is fetched from
  osdr.nasa.gov, saved as ``<dataset>-meta.zip`` in the current working
  directory and extracted there (existing files are overwritten). The
  tab-separated file ``s_OSD-<dataset>.txt`` from the archive is then read.

  Args:
    dataset: study number, e.g. ``255`` or ``'255'``.

  Returns:
    pandas.DataFrame with the contents of ``s_OSD-<dataset>.txt``.
  """
  import zipfile  # local import: only this helper needs it

  # Normalise once so both int and str identifiers work in every concatenation.
  dataset = str(dataset)
  url = ('https://osdr.nasa.gov/geode-py/ws/studies/OSD-' + dataset +
         '/download?source=datamanager&file=OSD-' + dataset +
         '_metadata_OSD-' + dataset + '-ISA.zip')
  filename = dataset + '-meta.zip'
  urlretrieve(url, filename)
  # Pure-Python extraction instead of shelling out to `unzip -o`.
  with zipfile.ZipFile(filename) as archive:
    archive.extractall()
  return pd.read_csv('s_OSD-' + dataset + '.txt', sep='\t', header=0)

In [5]:
def read_rnaseq_data(data):
  """Load a study's RNA-seq CSV file straight from osdr.nasa.gov.

  Args:
    data: file stem whose text before the first underscore is the study
      number, e.g. ``'255_rna_seq_Normalized_Counts'``.

  Returns:
    pandas.DataFrame read from the remote ``GLDS-<data>.csv`` file.
  """
  study_id = data.partition('_')[0]
  url_parts = [
      'https://osdr.nasa.gov/geode-py/ws/studies/OSD-',
      str(study_id),
      '/download?source=datamanager&file=GLDS-',
      data,
      '.csv',
  ]
  return pd.read_csv(''.join(url_parts))

In [6]:
def read_phenotype_data(dataset, data):
  """Load a phenotype CSV file of a study straight from osdr.nasa.gov.

  Args:
    dataset: study number, e.g. ``'557'`` (converted with ``str``).
    data: file stem without the ``.csv`` extension, e.g.
      ``'LSDS-1_immunostaining_microscopy_PNAtr_Transformed_Reusable_Results'``.

  Returns:
    pandas.DataFrame read from the remote ``<data>.csv`` file.
  """
  # Same endpoint as the other loaders in this notebook (single '/' after the host).
  url = ('https://osdr.nasa.gov/geode-py/ws/studies/OSD-' + str(dataset) +
         '/download?source=datamanager&file=' + data + '.csv')
  return pd.read_csv(url)

# define data filtering methods

In [7]:
def filter_cvs(df, frac=0.5):
  """Keep the rows whose coefficient of variation exceeds ``frac``.

  The first column is skipped; the remaining columns of each row are used to
  compute the coefficient of variation (population standard deviation divided
  by the mean). A 20-bin histogram of the finite coefficients is drawn on a
  new figure.

  Args:
    df: DataFrame whose columns after the first hold numeric values.
    frac: threshold; only rows with coefficient of variation > ``frac`` are kept.

  Returns:
    The selected rows of ``df``, in their original order.
  """
  values = df.iloc[:, 1:].astype(float)
  means = values.mean(axis=1)
  sds = values.std(axis=1, ddof=0)  # ddof=0 matches np.std
  cvs = (sds / means).to_numpy(dtype=float)

  # Rows with a zero mean give NaN/inf, which a histogram cannot bin.
  fig, axs = plt.subplots()
  axs.hist(cvs[np.isfinite(cvs)], bins=20)

  # NaN never compares greater than frac, so those rows are dropped.
  return df.iloc[np.flatnonzero(cvs > frac)]


In [8]:
def drop_nans(df):
  """Remove every row containing a missing value from ``df`` in place.

  Returns:
    The same DataFrame object that was passed in, after modification.
  """
  df.dropna(axis=0, how='any', inplace=True)
  return df


In [9]:
def drop_lowcount(df, threshold=10):
  """Drop rows whose summed counts are below ``threshold``.

  The identifier column (``'transcript'``, or else ``'Unnamed: 0'``) is left
  out of the row sum. When the identifier column is ``'Unnamed: 0'`` it is
  renamed to ``'transcript'`` in the result. The number of input rows is
  printed first.

  Raises:
    Exception: if neither identifier column is present.
  """
  print(len(df))

  if 'transcript' in df.columns:
    id_column = 'transcript'
  elif 'Unnamed: 0' in df.columns:
    id_column = 'Unnamed: 0'
  else:
    raise Exception("check file format")

  row_totals = df.drop(columns=[id_column]).sum(axis=1)
  kept = df[row_totals >= threshold]
  if id_column == 'Unnamed: 0':
    kept = kept.rename(columns={"Unnamed: 0": "transcript"})
  return kept


In [10]:
def filter_protein(df, drop='non-coding', dataset_name='mmusculus_gene_ensembl'):
  """Keep only protein-coding (or only non-protein-coding) genes.

  Gene biotypes are fetched from the Ensembl BioMart server at
  http://www.ensembl.org (mart ``ENSEMBL_MART_ENSEMBL``), and the rows of
  ``df`` are matched against the returned ``'Gene stable ID'`` values.

  Args:
    df: DataFrame with gene IDs in a column named ``'Unnamed: 0'`` or
      ``'transcript'`` (the name produced by ``drop_lowcount``).
    drop: ``'non-coding'`` keeps genes whose ``'Gene type'`` is
      ``'protein_coding'``; ``'coding'`` keeps all the others.
    dataset_name: BioMart dataset to query; defaults to the mouse dataset.

  Returns:
    The rows of ``df`` whose gene ID is in the selected set.

  Raises:
    ValueError: if ``drop`` is not one of the two supported values.
    KeyError: if ``df`` has no recognised gene-ID column.
  """
  # Validate before the (slow) network round-trip.
  if drop not in ('non-coding', 'coding'):
    raise ValueError("drop must be 'non-coding' or 'coding', got %r" % (drop,))
  id_column = next((c for c in ('Unnamed: 0', 'transcript') if c in df.columns), None)
  if id_column is None:
    raise KeyError("expected a gene-ID column named 'Unnamed: 0' or 'transcript'")

  server = Server(host='http://www.ensembl.org')
  dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets[dataset_name]
  gene_info = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name', 'gene_biotype'])

  is_coding = gene_info['Gene type'] == 'protein_coding'
  selected = is_coding if drop == 'non-coding' else ~is_coding
  keep_ids = gene_info.loc[selected, 'Gene stable ID']
  return df[df[id_column].isin(keep_ids)]

In [11]:
def filter_data(df, dropnans=False, dropprotein='non-coding', droplowvar=0):
  """Run the optional filtering steps in a fixed order.

  Steps, in order: ``drop_nans`` (when ``dropnans`` is true), then
  ``filter_protein`` (unless ``dropprotein`` is ``None``), then
  ``filter_cvs`` (when ``droplowvar`` is not 0).

  Args:
    df: input DataFrame. Note that ``drop_nans`` modifies its argument in
      place, so ``dropnans=True`` alters the caller's frame.
    dropnans: whether to drop rows containing missing values.
    dropprotein: value passed to ``filter_protein`` as ``drop``; ``None``
      skips that step (and its Ensembl lookup) entirely.
    droplowvar: coefficient-of-variation threshold passed to ``filter_cvs``;
      0 skips that step.

  Returns:
    The filtered DataFrame.
  """
  if dropnans:
    df = drop_nans(df)
  if dropprotein is not None:
    df = filter_protein(df, drop=dropprotein)
  if droplowvar != 0:
    df = filter_cvs(df, droplowvar)
  return df

In [12]:
def exclude_samples_by_prefix(df, prefix="V", colname="Source Name"):
  """List the values of ``df[colname]`` that start with ``prefix``.

  Returns:
    A list of the matching names, in column order (duplicates included).
  """
  return [name for name in df[colname].values if name.startswith(prefix)]

# data transformation methods

In [13]:
def transpose_df(df, cur_index_col, new_index_col):
  """Transpose ``df`` around its ``cur_index_col`` column.

  The values of ``cur_index_col`` become the column names, and the former
  column names become the first column, which is named ``new_index_col``.
  """
  flipped = df.set_index(cur_index_col).T.reset_index(level=0)
  # The first column holds the old column labels; give it its new name.
  flipped.columns = [new_index_col, *list(flipped.columns)[1:]]
  return flipped

In [14]:
def reduce_dims(df, current_key, new_key, n):
  """Select the ``|n|`` rows of ``df`` with the highest or lowest variability.

  Rows are ranked by the standard deviation of their numeric columns.

  Args:
    df: input DataFrame; non-numeric columns are ignored for ranking.
    current_key: unused; kept for interface compatibility.
    new_key: unused; kept for interface compatibility.
    n: ``n >= 0`` selects the ``n`` rows with the largest standard deviation
      (largest first); ``n < 0`` selects the ``|n|`` rows with the smallest
      (smallest first).

  Returns:
    The selected rows of ``df`` in ranked order. Rows with equal standard
    deviation keep their original relative order; rows whose standard
    deviation is NaN rank last.
  """
  sd = df.std(axis=1, numeric_only=True)
  print('len of sdlist: ', str(len(sd)))
  print('n: ', n)
  # Rank by position rather than building a {std: position} mapping, which
  # would keep only one row for each distinct standard deviation.
  ranked = sd.reset_index(drop=True).sort_values(
      ascending=(n < 0), kind='mergesort', na_position='last')
  positions = ranked.index[:abs(n)].to_numpy()
  return df.iloc[positions]

In [15]:
def convert_pd_to_np(df):
  """Build a NumPy array from every column of ``df`` except the first.

  Each remaining column becomes one row of the result, so a frame with
  ``k`` such columns and ``r`` rows gives an array of shape ``(k, r)``.
  """
  return np.array([list(df[name]) for name in df.columns[1:]])

# plotting methods

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import argparse
import scipy.stats as stats

def plotbox_and_stats(data_, sample_key, field, treatment, space, exclude_samples=None):
  """Compare one measurement between flight and non-flight samples.

  Rows of ``data_`` are split into a flight and a non-flight group, p-values
  from three two-sample tests are printed, and a box plot of the two groups
  is shown.

  Args:
    data_: DataFrame with one row per sample.
    sample_key: name of the column holding the sample names.
    field: name of the column holding the values to compare.
    treatment: name of the column that defines the grouping, or ``None`` to
      group by sample name (names starting with ``'F'`` are flight).
    space: value of the ``treatment`` column that marks a flight sample.
    exclude_samples: sample names to skip (default: none).

  Returns:
    None. The p-values (rounded to 5 decimals) and group sizes are printed;
    the tests are skipped when either group is empty.
  """
  if exclude_samples is None:
    exclude_samples = []
  print('field: ', field)
  print('excluding samples: ', exclude_samples)

  flight = str(field) + '_flight'
  nonflight = str(field) + '_nonflight'
  value_dict = {flight: [], nonflight: []}
  results = {field: {}}

  # Pull the needed columns out once instead of materialising each row.
  samples = data_[sample_key].tolist()
  values = data_[field].tolist()
  if treatment is None:
    groups = [None] * len(samples)
  else:
    groups = data_[treatment].tolist()

  for sample, value, group in zip(samples, values, groups):
    if sample in exclude_samples:
      continue
    if treatment is None:
      in_flight = sample.startswith('F')
    else:
      in_flight = group == space
    value_dict[flight if in_flight else nonflight].append(value)

  flight_values = value_dict[flight]
  nonflight_values = value_dict[nonflight]
  if len(flight_values) != 0 and len(nonflight_values) != 0:
    tests = (
        ('t-test p-value', stats.ttest_ind),
        ('wilcoxon p-value', stats.ranksums),  # Wilcoxon rank-sum test
        ('ks-test p-value', stats.kstest),     # two-sample form
    )
    for label, test in tests:
      results[field][label] = float('%.5f' % (test(flight_values, nonflight_values).pvalue))

  print(results)
  print('n flight = ', len(flight_values))
  print('n nonflight = ', len(nonflight_values))
  fig, ax = plt.subplots()
  # Pass real lists: dict views are not sequences.
  ax.boxplot(list(value_dict.values()))
  ax.set_xticklabels(list(value_dict.keys()))
  ax.set_ylabel(str(field))
  plt.show()