# Demonstration of hscScore model
This notebook demonstrates how to load and normalise count data for input into the hscScore model, and how to use the trained model to predict HSC scores.

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Directory holding the model and example data downloaded from Zenodo.
# Set the HSCSCORE_DATA_DIR environment variable to point at your own copy;
# the original author's local path is kept as the fallback.
# os.path.join(..., '') guarantees a trailing separator, which the cells below
# rely on when building file paths with `data_dir + filename`.
data_dir = os.path.join(
    os.environ.get('HSCSCORE_DATA_DIR',
                   '/Users/fiona/Desktop/Postdoc/HSC_scoring/Paper/Zenodo_model/'),
    '')

Define function for total count normalisation

In [2]:
def total_count_normalise(count_matrix, target_sum=18704.5):
    """Normalise count matrix for input into hscScore model.

    Performs read depth normalisation: each cell (row) is rescaled so that its
    normalised counts sum to ``target_sum``. Cells whose total count is zero
    are returned as all-zero rows instead of triggering a division by zero.

    Parameters
    ----------
    count_matrix : pandas dataframe
        Gene count matrix of dimension cells x genes with column names as genes
        and index as cell names
    target_sum : float, optional
        Value the normalised counts of each cell will sum to. The default,
        18704.5, is the constant the original implementation hard-coded as
        ``wilson_molo_genes_median_counts``; keep the default when preparing
        input for the trained hscScore model.

    Returns
    -------
    norm_matrix : pandas dataframe
        Normalised count matrix of dimension cells x genes, with the same
        index and columns as ``count_matrix``

    Raises
    ------
    ValueError
        If ``target_sum`` is not a positive number.
    """

    # A zero, negative or NaN target would yield inf/NaN or sign-flipped output
    if not target_sum > 0:
        raise ValueError("target_sum must be a positive number")

    # Work on a float copy so the caller's dataframe is never modified
    count_matrix_expression = np.array(count_matrix, dtype='float')
    counts_per_cell = np.sum(count_matrix_expression, axis=1)

    # Cells with no counts: replace the total of 0 with 1 so the division
    # below leaves the row as zeros rather than producing NaN
    counts_per_cell += (counts_per_cell == 0)

    # Per-cell scaling factor; dividing a row by it makes the row sum to target_sum
    counts_per_cell /= target_sum
    norm_matrix_expression = count_matrix_expression / counts_per_cell[:, None]
    norm_matrix = pd.DataFrame(norm_matrix_expression, index=count_matrix.index,
                               columns=count_matrix.columns)

    return norm_matrix

Load the trained model. This can be downloaded from Zenodo.

In [3]:
# Open the pickled model in a context manager so the file handle is closed
# promptly instead of being left open until garbage collection.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# model file obtained from a trusted source.
with open(data_dir + 'hscScore_model.pkl', 'rb') as model_file:
    hsc_score = pickle.load(model_file)

Load some test data. This should be a count matrix of cells x genes. Columns must be labelled with common gene names. Here we are using data from [Nestorowa et al. (2016)](https://doi.org/10.1182/blood-2016-05-716480), which can be downloaded from Zenodo to reproduce this example.

In [4]:
# Read the space-delimited count table and transpose it so that rows are cells
# and columns are genes
count_data = pd.read_csv(data_dir + 'nestorowa_htseq_counts.txt', delimiter=' ').transpose()
count_data.head()

Unnamed: 0,Gnai3,Pbsn,Cdc45,H19,Scml2,Apoh,Narf,Cav2,Klf6,Scmh1,...,RP24-571H12.6,RP24-246N21.1,RP23-301H20.2,RP24-103L16.5,RP23-149H20.3,RP24-343A12.4,RP23-408A1.4,RP23-8L20.8,RP24-351O18.4,RP23-57N5.3
HSPC_001,232,0,2,0,0,0,0,0,56,0,...,0,3179,0,0,0,0,0,16,0,0
HSPC_002,0,0,4,0,0,0,1,0,0,0,...,0,117,0,0,0,0,0,0,0,0
HSPC_003,478,0,693,1,0,0,1,0,2,201,...,0,1593,0,0,0,0,0,7,2,0
HSPC_004,6,0,4,0,1,0,205,0,1,1,...,0,259,0,0,0,0,1,5,0,0
HSPC_005,2,0,0,0,0,0,2,0,2,0,...,0,10,0,0,0,0,0,0,2,0


Subset to the same genes in the same order as used for training the model. This list can be downloaded from Zenodo.

In [5]:
# Gene names are read as a string array, in the order they appear in the file
model_genes = np.genfromtxt(data_dir + 'model_molo_genes.txt', dtype=str)
model_genes

array(['2810417H13Rik', 'Aqp1', 'Aqp9', 'Arhgap27', 'Arhgap5', 'Asf1b',
       'Aspm', 'Birc5', 'Cacybp', 'Ccna2', 'Ccne2', 'Cd82', 'Cdca8',
       'Cdk6', 'Cdkn1c', 'Cenpf', 'Cenph', 'Cenpi', 'Ckap2', 'Ckap2l',
       'Cldn10', 'Csgalnact1', 'Ctsf', 'Dlgap5', 'Dtnbp1', 'Fads3',
       'Fam64a', 'Fgfr3', 'Gata1', 'Gimap1', 'Gimap6', 'Gja1', 'Glipr1',
       'Gp1bb', 'Gp5', 'Gpd2', 'Gstm1', 'Hmmr', 'Hsp90aa1', 'Ifitm1',
       'Ints2', 'Itga2b', 'Kif2c', 'Kif4', 'Knstrn', 'Limd2', 'Ltb',
       'Ly6a', 'Mdm1', 'Mettl7a1', 'Mfsd2b', 'Mis18bp1', 'Mki67', 'Mllt3',
       'Muc13', 'Nasp', 'Ncapg', 'Ndrg1', 'Neil2', 'Nek2', 'Neo1', 'Nkg7',
       'Nuf2', 'Pa2g4', 'Pbk', 'Pde1b', 'Pdzk1ip1', 'Pf4', 'Plk1',
       'Procr', 'Ptpn14', 'Ramp2', 'Rasa2', 'Rasal3', 'Rnf168', 'Rrm1',
       'Sdf2l1', 'Serpinb1a', 'Sgol1', 'Sgpp1', 'Sh2d3c', 'Sh2d5',
       'Shcbp1', 'Ska1', 'Slc22a3', 'Smtnl1', 'Sox18', 'Spag5', 'Spc24',
       'Spred1', 'Sqrdl', 'Sult1a1', 'Syk', 'Top2a', 'Tor1b', 'Trim47',
       

In [6]:
# Keep every cell (row) and select the model genes as columns, in model order
count_data_molo = count_data.loc[:, model_genes]
count_data_molo.head()

Unnamed: 0,2810417H13Rik,Aqp1,Aqp9,Arhgap27,Arhgap5,Asf1b,Aspm,Birc5,Cacybp,Ccna2,...,Top2a,Tor1b,Trim47,Trip13,Ube2c,Ubl3,Uhrf1,Vps18,Vwf,Yme1l1
HSPC_001,1,2,0,0,0,3,4,5,1064,7,...,22,588,0,2,12,1227,2,535,2,1709
HSPC_002,5,6,1,0,1,1,1,4,3,6,...,8,0,150,1,6,5,1,0,61,191
HSPC_003,207,3,0,39,0,69,5,8,118,2,...,217,4,0,1,4,250,658,0,1,423
HSPC_004,7,4,0,0,1,0,7,3,202,3,...,9,2,0,0,4,2,2,1,7,14
HSPC_005,1,5,0,1,0,0,3,5,5,4,...,5,1,0,1,5,0,1,1,1,5


Normalise for model input

In [7]:
# Rescale each cell's counts with the normalisation function defined above
normalised_data_molo = total_count_normalise(count_matrix=count_data_molo)
normalised_data_molo.head()

Unnamed: 0,2810417H13Rik,Aqp1,Aqp9,Arhgap27,Arhgap5,Asf1b,Aspm,Birc5,Cacybp,Ccna2,...,Top2a,Tor1b,Trim47,Trip13,Ube2c,Ubl3,Uhrf1,Vps18,Vwf,Yme1l1
HSPC_001,0.448044,0.896088,0.0,0.0,0.0,1.344133,1.792177,2.240221,476.718998,3.136309,...,9.856972,263.449972,0.0,0.896088,5.37653,549.750198,0.896088,239.703631,0.896088,765.707488
HSPC_002,17.682454,21.218945,3.536491,0.0,3.536491,3.536491,3.536491,14.145963,10.609472,21.218945,...,28.291927,0.0,530.473625,3.536491,21.218945,17.682454,3.536491,0.0,215.725941,675.469749
HSPC_003,183.873842,2.664838,0.0,34.642898,0.0,61.291281,4.441397,7.106235,104.816973,1.776559,...,192.756637,3.553118,0.0,0.888279,3.553118,222.069858,584.487866,0.0,0.888279,375.7422
HSPC_004,24.736728,14.135273,0.0,0.0,3.533818,0.0,24.736728,10.601455,713.831287,10.601455,...,31.804364,7.067637,0.0,0.0,14.135273,7.067637,7.067637,3.533818,24.736728,49.473456
HSPC_005,96.914508,484.572539,0.0,96.914508,0.0,0.0,290.743523,484.572539,484.572539,387.658031,...,484.572539,96.914508,0.0,96.914508,484.572539,0.0,96.914508,96.914508,96.914508,484.572539


Input into hscScore

In [8]:
# Convert the normalised dataframe to a plain array (cells x model genes)
# before passing it to the model
model_input = np.array(normalised_data_molo)
predicted_hsc_scores = hsc_score.predict(model_input)
predicted_hsc_scores

array([ 5.34877261e-01,  4.48499708e-01,  9.83350789e-02, ...,
       -2.42786872e-04,  2.18821752e-01, -2.44614656e-04])

These predicted scores can now be saved, plotted on dimensionality reductions, and so on.