# scRFE Tutorial


Here we present an example of how to use scRFE. We analyze the Kidney Facs data from the Tabula-Muris-Senis dataset that is available on Figshare. 

### Imports 

In [1]:
# Imports 
import numpy as np
import pandas as pd
import scanpy as sc
import random
from anndata import read_h5ad
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# cd /Users/madelinepark/Desktop
# adata = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
# tiss = adata
# # read in data and NORMALIZE
# # adata = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
# # sc.pp.normalize_total(adata, target_sum = 1e5, inplace = True)
# # # log it after
# # sc.pp.log1p(adata)
# # tiss = adata

In [3]:
# FILTER CELLS
np.random.seed(644685)
sc.logging.print_versions()
sc.settings.verbosity = 3      
sc.logging.print_versions()

tiss = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
tiss.obs['n_counts'] = tiss.X.sum(axis=1).A1

#basic filtering
sc.pp.filter_cells(tiss, min_genes=250)
sc.pp.filter_genes(tiss, min_cells=3)
#Because we're doing first exploratory pass let's remove cells that have more than 5000 genes given that those have higher chances of being doublets
#tiss = tiss[tiss.obs['n_genes'] < 5000, :]
tiss = tiss[tiss.obs['n_counts'] > 1500, :]

sc.pp.normalize_per_cell(tiss, counts_per_cell_after=1e5)
sc.pp.log1p(tiss)
tiss.raw = tiss
tiss = tiss[tiss.obs['cell_ontology_class']!='nan']

scanpy==1.4.3 anndata==0.6.21 umap==0.3.9 numpy==1.16.4 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.21.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 
scanpy==1.4.3 anndata==0.6.21 umap==0.3.9 numpy==1.16.4 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.21.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


Trying to set attribute `.obs` of view, making a copy.


filtered out 7032 genes that are detected in less than 3 cells


# run tutorial one vs all classification, 1000 estimators, cv = 5 for celltype

In [21]:
results_celltype_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['cell_ontology_class'])): 
    print(c)
    cell_of_interest = c
    tiss.obs['cell_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['cell_ontology_class'] == cell_of_interest,'cell_type_of_interest'] = cell_of_interest

    clf = RandomForestClassifier(n_estimators = 1000, random_state = 0, n_jobs = -1,
                                 oob_score = True)
    selector = RFECV(clf, step = 0.2, cv = 5, n_jobs = -1) 
    
    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['cell_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)
    resaux = resaux.sort_values(by = [c + '_gini'], ascending = False) #update pandas for this 
    resaux.reset_index(drop=True, inplace=True)

    results_celltype_cv = pd.concat([results_celltype_cv,resaux],axis=1)
    
    tiss.obs['cell_type_of_interest'] = 'rest'
    
results_celltype_cv

Trying to set attribute `.obs` of view, making a copy.


epithelial cell of proximal tubule
training...
result writing
Kidney-new-24m-cluster
training...
result writing
mesangial cell
training...
result writing
T cell
training...
result writing
kidney collecting duct principal cell
training...
result writing
B cell
training...
result writing
macrophage
training...
result writing
kidney loop of Henle ascending limb epithelial cell
training...
result writing
kidney collecting duct epithelial cell
training...
result writing
fenestrated cell
training...
result writing


Unnamed: 0,epithelial cell of proximal tubule,epithelial cell of proximal tubule_gini,Kidney-new-24m-cluster,Kidney-new-24m-cluster_gini,mesangial cell,mesangial cell_gini,T cell,T cell_gini,kidney collecting duct principal cell,kidney collecting duct principal cell_gini,B cell,B cell_gini,macrophage,macrophage_gini,kidney loop of Henle ascending limb epithelial cell,kidney loop of Henle ascending limb epithelial cell_gini,kidney collecting duct epithelial cell,kidney collecting duct epithelial cell_gini,fenestrated cell,fenestrated cell_gini
0,Kap,0.028445,Sec14l3,0.016539,Col1a2,0.016943,Ms4a4b,0.029839,Slc12a3,0.039419,Ly6d,0.030027,Fcer1g,0.025310,Slc12a1,0.047251,Gatm,0.035020,Flt1,0.028633
1,Slc7a13,0.017018,Gm16861,0.013101,Col3a1,0.012792,Cd3d,0.028578,Pgam2,0.032140,Cd79a,0.027970,Ctss,0.020847,Wfdc15b,0.023466,Spp2,0.026847,Emcn,0.022772
2,Cndp2,0.015347,Beta-s,0.010036,Loxl1,0.012265,Il2rb,0.022034,D630042F21Rik,0.030861,Ms4a1,0.027390,Tyrobp,0.019743,Egf,0.021014,Gsta2,0.023385,Gpr116,0.021714
3,Acy3,0.014652,Hba-a1,0.007363,Serping1,0.010327,Nkg7,0.021627,Calb1,0.025810,Faim3,0.023010,Plbd1,0.016563,Umod,0.020091,Slc5a12,0.018280,Ppap2a,0.021164
4,Slc27a2,0.014323,Gltpd1,0.007168,Fxyd1,0.010223,Cxcr6,0.020822,Clu,0.021923,Ccr7,0.019566,Slamf9,0.016549,Ppp1r1b,0.018298,Slc5a2,0.017285,Ly6c1,0.019157
5,Napsa,0.014293,Plin5,0.006829,Pdgfrb,0.009917,Gimap3,0.019339,Sfrp1,0.019462,Cd79b,0.013359,Cd68,0.016376,Ppp1r1a,0.016423,Ly6a,0.014526,Plvap,0.017774
6,Inmt,0.013328,Gm5820,0.006345,Cd248,0.008652,Cd3e,0.018456,Emx1,0.017431,Scd1,0.012910,Gm11428,0.015530,Slc5a3,0.015907,Gpx3,0.012392,Kdr,0.017691
7,Slc22a6,0.013015,Slc2a5,0.006089,Dcn,0.008112,Cd3g,0.015077,Pvalb,0.016285,Spib,0.009980,C1qc,0.015497,Kng2,0.015221,Car2,0.011175,Cd200,0.016700
8,Cyp2e1,0.012416,Xlr3a,0.005779,Col6a2,0.008059,Ccl5,0.014771,Tmem213,0.014829,1810046K07Rik,0.009887,Aif1,0.014816,Cldn10,0.014747,Miox,0.010979,Fabp4,0.015789
9,Ttc36,0.010684,C730036E19Rik,0.005675,C1s,0.007955,Thy1,0.014601,Clcnkb,0.013738,Tnfrsf13c,0.009172,Lst1,0.014326,Ptger3,0.014474,Spp1,0.010825,Slc9a3r2,0.015764


In [22]:
results_celltype_cv.to_csv('kidney1000TutorialReindex.csv')

In [None]:
pwd

### Run tutorial one vs all classification for age

In [9]:
cd /Users/madelinepark/Desktop

/Users/madelinepark/Desktop


In [14]:
# change n_estimators and cv 

In [15]:
results_age_cv = pd.DataFrame() #create results data frame 
for c in list(set(tiss.obs['age'])): 
    print(c)
    age_of_interest = c
    tiss.obs['age_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    clf = RandomForestClassifier(n_estimators = 10, random_state = 0, n_jobs=-1, 
                                 oob_score = True)
    selector = RFECV(clf, step = 0.2, cv = 3, n_jobs=4) # step = % rounded down at each iteration  
    
    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['age_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    print(type(X_train))
    print(type(y_train))
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns = column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = selector.estimator_.feature_importances_
    resaux = resaux.sort_values(by = [c + '_gini'], ascending = False) #update pandas for this 
    
    resaux.reset_index(drop=True, inplace=True)

    results_age_cv = pd.concat([results_age_cv,resaux], axis = 1, sort = False)    
    print(type(results_age_cv))
    tiss.obs['age_type_of_interest'] = 'rest'
    
results_age_cv

Trying to set attribute `.obs` of view, making a copy.


3m
training...
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


result writing
<class 'pandas.core.frame.DataFrame'>
24m
training...
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


result writing
<class 'pandas.core.frame.DataFrame'>


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Unnamed: 0,3m,3m_gini,24m,24m_gini
0,Gsn,0.036301,Nme7,0.044438
1,Plp1,0.022974,Gsn,0.031495
2,Rpl13a,0.019663,Gcg,0.016351
3,Tm9sf2,0.018539,Pam,0.014728
4,Slc7a13,0.012377,Rps29,0.013943
5,Gm6981,0.011061,Ins1,0.011418
6,Csf1r,0.011029,Alb,0.011100
7,Try4,0.010020,Tmsb10,0.009658
8,Cpa1,0.009989,Ctrb1,0.009469
9,Azgp1,0.009853,Cox7b,0.008934


In [13]:
# results_age_cv.to_csv('KidneyTutorialAge1000.csv')

In [17]:
cd scRFE\ kidney\ 1000\ V1\ results\ and\ to\ compare

/Users/madelinepark/Desktop/scRFE kidney 1000 V1 results and to compare


In [18]:
testRez = pd.read_csv('KidneyTutorialAge1000.csv')

In [21]:
testRez.sort_values(by = '24m_gini', ascending = False)
testRez.to_csv('KidneyTutorialAge1000SORTEDv1.csv')

### Run scRFE one vs all classification for cell type 

In [None]:
results_celltype_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['cell_ontology_class'])): 
    print(c)
    cell_of_interest = c
    tiss.obs['cell_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['cell_ontology_class'] == cell_of_interest,'cell_type_of_interest'] = cell_of_interest

    clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=5, n_jobs=4) # step = % rounded down at each iteration  
    
# #     tiss.obs.loc[tiss.obs[tiss.obs['age'] == age_of_interest].index,'age_type_of_interest'] = age_of_interest
#     tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['cell_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)
    
    print(feature_selected)
    print (selector.estimator_.feature_importances_)
    
    results_celltype_cv = pd.concat([results_celltype_cv,resaux],axis=1)
    
    tiss.obs['cell_type_of_interest'] = 'rest'
    
results_celltype_cv

T cell
training...
result writing
Index(['0610007L01Rik', '0610007P08Rik', '0610007P14Rik', '0610007P22Rik',
       '0610009B22Rik', '0610009D07Rik', '0610010K14Rik', '0610010O12Rik',
       '0610031J06Rik', '0610037L13Rik',
       ...
       'Zswim3', 'Zufsp', 'Zwint', 'Zxda', 'Zxdb', 'Zxdc', 'Zyg11b', 'Zyx',
       'Zzz3', 'l7Rn6'],
      dtype='object', name='index', length=4583)
[2.27767960e-05 0.00000000e+00 3.81825681e-05 ... 2.61944884e-04
 0.00000000e+00 1.76973571e-05]
B cell
training...
result writing
Index(['Ccr7', 'Cd79a', 'Ly6d', 'Ms4a1'], dtype='object', name='index')
[0.21272157 0.38505464 0.28920124 0.11302256]
mesangial cell
training...


In [None]:
results_celltype_cv.to_csv('KidneyTutorialCelltype1000.csv')

In [17]:
results_age_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['age'])): 
    print(c)
    age_of_interest = c
    tiss.obs['age_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    clf = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=5, n_jobs=4) # step = % rounded down at each iteration  
    
# #     tiss.obs.loc[tiss.obs[tiss.obs['age'] == age_of_interest].index,'age_type_of_interest'] = age_of_interest
#     tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['age_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    print(type(X_train))
    print(type(y_train))
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)

    resaux = resaux.sort_values(by = [c + '_gini'], ascending = False)
    print(resaux[c + '_gini'])
#     print(feature_selected)
#     print (selector.estimator_.feature_importances_)
    
    results_age_cv.append(resaux)
    print(type(results_age_cv))
#     results_age_cv = pd.concat([results_age_cv,resaux], axis=1, 
#                                sort = False, join_axes=[results_age_cv.index])
    
    tiss.obs['age_type_of_interest'] = 'rest'
    
results_age_cv

# to append df2 at the end of df1 dataframe 

3m
training...
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


result writing
8557    0.029177
6771    0.021032
7084    0.015858
6796    0.013565
3974    0.013249
6750    0.013113
3330    0.012321
5536    0.011450
8061    0.010340
2658    0.009462
6823    0.009207
6014    0.008477
6783    0.008323
6792    0.008072
2561    0.007936
4849    0.007849
2984    0.007827
6683    0.007548
7083    0.007003
6717    0.006826
4524    0.006776
6715    0.006256
6790    0.006179
8343    0.006154
6827    0.005773
2521    0.005757
8348    0.005442
3971    0.005432
8706    0.005286
6641    0.005276
          ...   
3172    0.000000
3171    0.000000
3170    0.000000
3169    0.000000
3168    0.000000
3167    0.000000
3166    0.000000
3165    0.000000
3164    0.000000
3163    0.000000
3162    0.000000
3160    0.000000
3159    0.000000
3157    0.000000
3139    0.000000
3156    0.000000
3155    0.000000
3154    0.000000
3153    0.000000
3152    0.000000
3151    0.000000
3149    0.000000
3148    0.000000
3147    0.000000
3144    0.000000
3143    0.000000
3142    0.000000

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


result writing
4248    0.057356
4281    0.037171
4440    0.023445
4370    0.020246
4269    0.019761
4527    0.015117
4501    0.014916
3034    0.013033
4027    0.012951
4261    0.011577
4318    0.011517
4264    0.010745
4235    0.010405
4129    0.009854
4520    0.009463
4288    0.008331
662     0.007446
2225    0.007270
4012    0.007251
3676    0.007179
2649    0.006834
4130    0.006529
2667    0.006365
4070    0.006190
4276    0.005985
617     0.005955
4277    0.005868
4033    0.005846
1600    0.005788
4271    0.005652
          ...   
1536    0.000000
1537    0.000000
1538    0.000000
1539    0.000000
1540    0.000000
1541    0.000000
1542    0.000000
1543    0.000000
1544    0.000000
1545    0.000000
1546    0.000000
1547    0.000000
1549    0.000000
1551    0.000000
1568    0.000000
1552    0.000000
1553    0.000000
1555    0.000000
1556    0.000000
1557    0.000000
1558    0.000000
1559    0.000000
1560    0.000000
1561    0.000000
1562    0.000000
1563    0.000000
1565    0.000000

In [None]:
# IRRELEVANT

## Now doing exact same code above for kidney 

In [111]:
adata = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
tiss = adata

In [113]:
results_celltype_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['cell_ontology_class'])): 
    print(c)
    cell_of_interest = c
    tiss.obs['cell_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['cell_ontology_class'] == cell_of_interest,'cell_type_of_interest'] = cell_of_interest

    clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=3, n_jobs=4) # step = % rounded down at each iteration  
    
    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['cell_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)
    
    print(feature_selected)
    print (selector.estimator_.feature_importances_)
    
    results_celltype_cv = pd.concat([results_celltype_cv,resaux],axis=1)
    
    tiss.obs['cell_type_of_interest'] = 'rest'
    
results_celltype_cv

Kidney-new-24m-cluster
training...
result writing
Index(['Beta-s', 'Gm16861', 'Plin5', 'Sec14l3'], dtype='object', name='index')
[0.45796539 0.173545   0.10231781 0.26617181]
kidney loop of Henle ascending limb epithelial cell
training...
result writing
Index(['Slc12a1'], dtype='object', name='index')
[1.]
fenestrated cell
training...
result writing
Index(['0610007C21Rik', '0610007L01Rik', '0610007N19Rik', '0610007P14Rik',
       '0610009D07Rik', '0610009O20Rik', '0610010K14Rik', '0610010O12Rik',
       '0610011F06Rik', '0610012G03Rik',
       ...
       'Znfx1', 'Znhit1', 'Znhit6', 'Znrd1', 'Znrf1', 'Zpbp', 'Zranb1',
       'Zswim6', 'Zw10', 'Zxdc'],
      dtype='object', name='index', length=4583)
[0.00000000e+00 0.00000000e+00 5.86593394e-05 ... 2.91909742e-05
 0.00000000e+00 6.30596768e-05]
mesangial cell
training...
result writing
Index(['0610007C21Rik', '0610007L01Rik', '0610007N19Rik', '0610007P08Rik',
       '0610007P14Rik', '0610007P22Rik', '0610008F07Rik', '0610009B14Rik',
  

Unnamed: 0,Kidney-new-24m-cluster,Kidney-new-24m-cluster_gini,kidney loop of Henle ascending limb epithelial cell,kidney loop of Henle ascending limb epithelial cell_gini,fenestrated cell,fenestrated cell_gini,mesangial cell,mesangial cell_gini,B cell,B cell_gini,kidney collecting duct epithelial cell,kidney collecting duct epithelial cell_gini,T cell,T cell_gini,kidney collecting duct principal cell,kidney collecting duct principal cell_gini,macrophage,macrophage_gini,epithelial cell of proximal tubule,epithelial cell of proximal tubule_gini
0,Beta-s,0.457965,Slc12a1,1.0,0610007C21Rik,0.000000,0610007C21Rik,2.071603e-05,Ccr7,0.189172,0610005C13Rik,0.000239,0610007C21Rik,0.000042,Clu,0.120505,0610007N19Rik,0.000000,0610005C13Rik,0.000076
1,Gm16861,0.173545,,,0610007L01Rik,0.000000,0610007L01Rik,8.220964e-07,Cd79a,0.360559,0610007C21Rik,0.000141,0610007L01Rik,0.000100,D630042F21Rik,0.153364,0610007P08Rik,0.000000,0610007C21Rik,0.000065
2,Plin5,0.102318,,,0610007N19Rik,0.000059,0610007N19Rik,0.000000e+00,Ly6d,0.368927,0610007N19Rik,0.000100,0610007N19Rik,0.000000,Pgam2,0.439040,0610007P14Rik,0.000015,0610007L01Rik,0.000068
3,Sec14l3,0.266172,,,0610007P14Rik,0.000000,0610007P08Rik,0.000000e+00,Ms4a1,0.081342,0610007P14Rik,0.000009,0610007P14Rik,0.000054,Slc12a3,0.287090,0610007P22Rik,0.000000,0610007N19Rik,0.000084
4,,,,,0610009D07Rik,0.000000,0610007P14Rik,0.000000e+00,,,0610009D07Rik,0.000067,0610007P22Rik,0.000050,,,0610008F07Rik,0.000000,0610007P14Rik,0.000011
5,,,,,0610009O20Rik,0.000012,0610007P22Rik,0.000000e+00,,,0610010K14Rik,0.000058,0610009D07Rik,0.000049,,,0610009B22Rik,0.000013,0610009B22Rik,0.000019
6,,,,,0610010K14Rik,0.000018,0610008F07Rik,0.000000e+00,,,0610010O12Rik,0.000241,0610010K14Rik,0.000012,,,0610009D07Rik,0.000014,0610009D07Rik,0.000085
7,,,,,0610010O12Rik,0.000061,0610009B14Rik,0.000000e+00,,,0610011F06Rik,0.000118,0610010O12Rik,0.000026,,,0610009L18Rik,0.000000,0610010K14Rik,0.000106
8,,,,,0610011F06Rik,0.000117,0610009B22Rik,0.000000e+00,,,0610012G03Rik,0.000125,0610031J06Rik,0.000095,,,0610009O20Rik,0.000000,0610010O12Rik,0.000095
9,,,,,0610012G03Rik,0.000014,0610009D07Rik,2.743943e-05,,,0610012H03Rik,0.000171,0610037P05Rik,0.000141,,,0610010B08Rik,0.000000,0610011F06Rik,0.000435


In [114]:
results_celltype_cv.to_csv('KidneyFacsCellType1000.csv')

# Now do the same for kidney age 

In [131]:
adata = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
tiss = adata

In [136]:
results_age_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['age'])): 
    print(c)
    age_of_interest = c
    tiss.obs['age_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    clf = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=3, n_jobs=4) # step = % rounded down at each iteration  
    
# #     tiss.obs.loc[tiss.obs[tiss.obs['age'] == age_of_interest].index,'age_type_of_interest'] = age_of_interest
#     tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['age_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)
    
    print(feature_selected)
    print (selector.estimator_.feature_importances_)
    
    results_age_cv = pd.concat([results_age_cv,resaux],axis=1)
    
    #adding this smart cheeky line
    print(list(set(tiss.obs['age_type_of_interest'])))
    
    tiss.obs['age_type_of_interest'] = 'rest'
    
results_age_cv

24m
training...


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


result writing
Index(['0610007C21Rik', '0610007L01Rik', '0610007N19Rik', '0610007P08Rik',
       '0610007P14Rik', '0610007P22Rik', '0610008F07Rik', '0610009B14Rik',
       '0610009B22Rik', '0610009D07Rik',
       ...
       'Zxda', 'Zxdb', 'Zxdc', 'Zyg11a', 'Zyg11b', 'Zyx', 'Zzef1', 'Zzz3', 'a',
       'zsGreen_transgene'],
      dtype='object', name='index', length=18320)
[0. 0. 0. ... 0. 0. 0.]
['rest', '24m']
3m
training...


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


result writing
Index(['0610007P08Rik', '0610007P14Rik', '0610007P22Rik', '0610008F07Rik',
       '0610009B14Rik', '0610009B22Rik', '0610009D07Rik', '0610009L18Rik',
       '0610009O20Rik', '0610010B08Rik',
       ...
       'Zfp157', 'Zfp161', 'Zfp185', 'Zfp187', 'Zfp212', 'Zfp281', 'Zfp422',
       'Zfpl1', 'Zg16', 'zsGreen_transgene'],
      dtype='object', name='index', length=4583)
[0. 0. 0. ... 0. 0. 0.]
['rest', '3m']


Unnamed: 0,24m,24m_gini,3m,3m_gini
0,0610007C21Rik,0.000000,0610007P08Rik,0.000000
1,0610007L01Rik,0.000000,0610007P14Rik,0.000000
2,0610007N19Rik,0.000000,0610007P22Rik,0.000000
3,0610007P08Rik,0.000000,0610008F07Rik,0.000000
4,0610007P14Rik,0.000000,0610009B14Rik,0.000000
5,0610007P22Rik,0.000000,0610009B22Rik,0.000000
6,0610008F07Rik,0.000000,0610009D07Rik,0.000000
7,0610009B14Rik,0.000000,0610009L18Rik,0.000289
8,0610009B22Rik,0.000292,0610009O20Rik,0.000000
9,0610009D07Rik,0.000000,0610010B08Rik,0.000000


In [133]:
results_age_cv.to_csv('KidneyFacsAge1000TissReset.csv')

In [134]:
tiss.obs

Unnamed: 0_level_0,FACS.selection,age,batch,cell,cell_ontology_class,cell_ontology_id,cellid,free_annotation,method,mouse.id,plate,sex,subtissue,tissue,well,n_genes,n_counts,louvain,cluster_names,age_type_of_interest
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A11_B001261_S287_L003.mus-2-0,Viable,24m,0,,T cell,CL:0000084,A11_B001261,T cell,facs,24_59_M,B001261,male,,Kidney,A11,1488,1108365.0,8,8_T cell,rest
A12_B001261_S288_L003.mus-2-0,Viable,24m,0,,epithelial cell of proximal tubule,CL:0002306,A12_B001261,Proximal tube,facs,24_59_M,B001261,male,,Kidney,A12,1855,2755523.0,5,5_epithelial cell of proximal tubule,rest
A15_B001261_S291_L003.mus-2-0,Viable,24m,0,,kidney collecting duct principal cell,CL:1001431,A15_B001261,ductal princeple,facs,24_59_M,B001261,male,,Kidney,A15,718,708214.0,7,7_kidney collecting duct principal cell,rest
A16_B001261_S292_L003.mus-2-0,Viable,24m,0,,T cell,CL:0000084,A16_B001261,T cell,facs,24_59_M,B001261,male,,Kidney,A16,2113,2041432.0,8,8_T cell,rest
A17_B001261_S293_L003.mus-2-0,Viable,24m,0,,epithelial cell of proximal tubule,CL:0002306,A17_B001261,Proximal tube,facs,24_59_M,B001261,male,,Kidney,A17,1926,1743637.0,5,5_epithelial cell of proximal tubule,rest
A18_B001261_S294_L003.mus-2-0,Viable,24m,0,,macrophage,CL:0000235,A18_B001261,macrophage,facs,24_59_M,B001261,male,,Kidney,A18,1722,1671482.0,6,6_macrophage,rest
A20_B001261_S296_L003.mus-2-0,Viable,24m,0,,epithelial cell of proximal tubule,CL:0002306,A20_B001261,Proximal tube,facs,24_59_M,B001261,male,,Kidney,A20,1327,1335461.0,5,5_epithelial cell of proximal tubule,rest
A21_B001261_S297_L003.mus-2-0,Viable,24m,0,,T cell,CL:0000084,A21_B001261,T cell,facs,24_59_M,B001261,male,,Kidney,A21,1991,1374989.0,8,8_T cell,rest
A3_B001261_S279_L003.mus-2-0,Viable,24m,0,,epithelial cell of proximal tubule,CL:0002306,A3_B001261,Proximal tube,facs,24_59_M,B001261,male,,Kidney,A3,1358,953170.0,2,2_epithelial cell of proximal tubule,rest
A4_B001261_S280_L003.mus-2-0,Viable,24m,0,,fenestrated cell,CL:0000666,A4_B001261,fenest. Endo,facs,24_59_M,B001261,male,,Kidney,A4,2042,865225.0,1,1_fenestrated cell,rest


In [35]:
results_cell_cv.to_csv('test3.csv')

# Kidney Facs 1000 trees same code as mammary split for cell type

In [125]:
adata = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
tiss = adata

In [126]:
results_celltype_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['cell_ontology_class'])): 
    print(c)
    cell_of_interest = c
    tiss.obs['cell_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['cell_ontology_class'] == cell_of_interest,'cell_type_of_interest'] = cell_of_interest

    clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=3, n_jobs=4) # step = % rounded down at each iteration  
    
# #     tiss.obs.loc[tiss.obs[tiss.obs['age'] == age_of_interest].index,'age_type_of_interest'] = age_of_interest
#     tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['cell_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)
    
    print(feature_selected)
    print (selector.estimator_.feature_importances_)
    
    results_celltype_cv = pd.concat([results_celltype_cv,resaux],axis=1)
    
    tiss.obs['cell_type_of_interest'] = 'rest'
    
results_celltype_cv

Kidney-new-24m-cluster
training...
result writing
Index(['Beta-s', 'Gm16861', 'Plin5', 'Sec14l3'], dtype='object', name='index')
[0.45796539 0.173545   0.10231781 0.26617181]
kidney loop of Henle ascending limb epithelial cell
training...
result writing
Index(['Slc12a1'], dtype='object', name='index')
[1.]
fenestrated cell
training...
result writing
Index(['0610007C21Rik', '0610007L01Rik', '0610007N19Rik', '0610007P14Rik',
       '0610009D07Rik', '0610009O20Rik', '0610010K14Rik', '0610010O12Rik',
       '0610011F06Rik', '0610012G03Rik',
       ...
       'Znfx1', 'Znhit1', 'Znhit6', 'Znrd1', 'Znrf1', 'Zpbp', 'Zranb1',
       'Zswim6', 'Zw10', 'Zxdc'],
      dtype='object', name='index', length=4583)
[0.00000000e+00 0.00000000e+00 5.86593394e-05 ... 2.91909742e-05
 0.00000000e+00 6.30596768e-05]
mesangial cell
training...
result writing
Index(['0610007C21Rik', '0610007L01Rik', '0610007N19Rik', '0610007P08Rik',
       '0610007P14Rik', '0610007P22Rik', '0610008F07Rik', '0610009B14Rik',
  

Unnamed: 0,Kidney-new-24m-cluster,Kidney-new-24m-cluster_gini,kidney loop of Henle ascending limb epithelial cell,kidney loop of Henle ascending limb epithelial cell_gini,fenestrated cell,fenestrated cell_gini,mesangial cell,mesangial cell_gini,B cell,B cell_gini,kidney collecting duct epithelial cell,kidney collecting duct epithelial cell_gini,T cell,T cell_gini,kidney collecting duct principal cell,kidney collecting duct principal cell_gini,macrophage,macrophage_gini,epithelial cell of proximal tubule,epithelial cell of proximal tubule_gini
0,Beta-s,0.457965,Slc12a1,1.0,0610007C21Rik,0.000000,0610007C21Rik,2.071603e-05,Ccr7,0.189172,0610005C13Rik,0.000239,0610007C21Rik,0.000042,Clu,0.120505,0610007N19Rik,0.000000,0610005C13Rik,0.000076
1,Gm16861,0.173545,,,0610007L01Rik,0.000000,0610007L01Rik,8.220964e-07,Cd79a,0.360559,0610007C21Rik,0.000141,0610007L01Rik,0.000100,D630042F21Rik,0.153364,0610007P08Rik,0.000000,0610007C21Rik,0.000065
2,Plin5,0.102318,,,0610007N19Rik,0.000059,0610007N19Rik,0.000000e+00,Ly6d,0.368927,0610007N19Rik,0.000100,0610007N19Rik,0.000000,Pgam2,0.439040,0610007P14Rik,0.000015,0610007L01Rik,0.000068
3,Sec14l3,0.266172,,,0610007P14Rik,0.000000,0610007P08Rik,0.000000e+00,Ms4a1,0.081342,0610007P14Rik,0.000009,0610007P14Rik,0.000054,Slc12a3,0.287090,0610007P22Rik,0.000000,0610007N19Rik,0.000084
4,,,,,0610009D07Rik,0.000000,0610007P14Rik,0.000000e+00,,,0610009D07Rik,0.000067,0610007P22Rik,0.000050,,,0610008F07Rik,0.000000,0610007P14Rik,0.000011
5,,,,,0610009O20Rik,0.000012,0610007P22Rik,0.000000e+00,,,0610010K14Rik,0.000058,0610009D07Rik,0.000049,,,0610009B22Rik,0.000013,0610009B22Rik,0.000019
6,,,,,0610010K14Rik,0.000018,0610008F07Rik,0.000000e+00,,,0610010O12Rik,0.000241,0610010K14Rik,0.000012,,,0610009D07Rik,0.000014,0610009D07Rik,0.000085
7,,,,,0610010O12Rik,0.000061,0610009B14Rik,0.000000e+00,,,0610011F06Rik,0.000118,0610010O12Rik,0.000026,,,0610009L18Rik,0.000000,0610010K14Rik,0.000106
8,,,,,0610011F06Rik,0.000117,0610009B22Rik,0.000000e+00,,,0610012G03Rik,0.000125,0610031J06Rik,0.000095,,,0610009O20Rik,0.000000,0610010O12Rik,0.000095
9,,,,,0610012G03Rik,0.000014,0610009D07Rik,2.743943e-05,,,0610012H03Rik,0.000171,0610037P05Rik,0.000141,,,0610010B08Rik,0.000000,0610011F06Rik,0.000435


In [127]:
results_celltype_cv.to_csv('KidneyFacsCell1000TissReset.csv')

In [None]:
tiss.obs

# Age, Heart

In [138]:
adata = read_h5ad('/Users/madelinepark/Downloads/Heart_droplet.h5ad')
tiss = adata

In [139]:
results_age_cv = pd.DataFrame() #create results data frame 

for c in list(set(tiss.obs['age'])): 
    print(c)
    age_of_interest = c
    tiss.obs['age_type_of_interest'] = 'rest' #moved into, was outside before
    tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=3, n_jobs=4) # step = % rounded down at each iteration  
    
# #     tiss.obs.loc[tiss.obs[tiss.obs['age'] == age_of_interest].index,'age_type_of_interest'] = age_of_interest
#     tiss.obs.loc[tiss.obs['age'] == age_of_interest,'age_type_of_interest'] = age_of_interest

    feat_labels = tiss.var_names 
    X = tiss.X
    y = tiss.obs['age_type_of_interest']
    
    print('training...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_] 
    
    print('result writing')
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected
    resaux[c + '_gini'] = (selector.estimator_.feature_importances_)
    
    print(feature_selected)
    print (selector.estimator_.feature_importances_)
    
    results_age_cv = pd.concat([results_age_cv,resaux],axis=1)
    
    #adding this smart cheeky line
    print(list(set(tiss.obs['age_type_of_interest'])))
    
    tiss.obs['age_type_of_interest'] = 'rest'
    
results_age_cv

24m
training...
result writing
Index(['Sox17', 'Tcea1', 'Rb1cc1', 'Pcmtd1', 'Snhg6', 'Cops5', 'Tram1', 'Eya1',
       'Rpl7', 'Ube2w',
       ...
       'Rbbp7', 'Bmx', 'Pir', 'Figf', 'Tmsb4x', 'Kdm5d', 'Eif2s3y', 'Uty',
       'Ddx3y', 'Erdr1'],
      dtype='object', name='index', length=3972)
[0.00013201 0.00021865 0.00011146 ... 0.00028555 0.00272545 0.00143913]
['rest', '24m']
30m
training...
result writing
Index(['Sox17', 'Mrpl15', 'Lypla1', 'Tcea1', 'Rb1cc1', 'Pcmtd1', 'Snhg6',
       'Prex2', 'Tram1', 'Gm5523',
       ...
       'Rbbp7', 'Zrsr2', 'Gpm6b', 'Rab9', 'Tmsb4x', 'Mid1', 'Vamp7', 'Eif2s3y',
       'Ddx3y', 'Erdr1'],
      dtype='object', name='index', length=3972)
[1.39486771e-04 8.48414190e-05 8.22038217e-05 ... 1.08456808e-03
 9.17215743e-04 1.33128649e-04]
['rest', '30m']
18m
training...
result writing
Index(['Tcea1', 'Rb1cc1', 'Pcmtd1', 'Sgk3', 'Sulf1', 'Ncoa2', 'Tram1', 'Eya1',
       'Rpl7', 'Tceb1',
       ...
       'Pir', 'Gemin8', 'Rab9', 'Tmsb4x', 'Tlr7', 'P

Unnamed: 0,24m,24m_gini,30m,30m_gini,18m,18m_gini,21m,21m_gini,1m,1m_gini,3m,3m_gini
0,Sox17,0.000132,Sox17,0.000139,Tcea1,0.000192,Sox17,0.000148,Dbp,1.0,Tcea1,0.000329
1,Tcea1,0.000219,Mrpl15,0.000085,Rb1cc1,0.000212,Mrpl15,0.000091,,,Rb1cc1,0.000381
2,Rb1cc1,0.000111,Lypla1,0.000082,Pcmtd1,0.000221,Lypla1,0.000137,,,Pcmtd1,0.000152
3,Pcmtd1,0.000088,Tcea1,0.000103,Sgk3,0.000234,Pcmtd1,0.000140,,,Vcpip1,0.000188
4,Snhg6,0.000224,Rb1cc1,0.000067,Sulf1,0.000155,Rrs1,0.000141,,,Snhg6,0.000092
5,Cops5,0.000122,Pcmtd1,0.000224,Ncoa2,0.000209,Snhg6,0.000115,,,Arfgef1,0.000130
6,Tram1,0.000262,Snhg6,0.000093,Tram1,0.000159,Tram1,0.000199,,,Prex2,0.000533
7,Eya1,0.000153,Prex2,0.000079,Eya1,0.000101,Rpl7,0.000343,,,Sulf1,0.000277
8,Rpl7,0.000373,Tram1,0.000112,Rpl7,0.000339,Tceb1,0.000189,,,Tram1,0.000228
9,Ube2w,0.000139,Gm5523,0.000108,Tceb1,0.000159,Tmem70,0.000120,,,Lactb2,0.000347


In [140]:
results_age_cv.to_csv('HeartDropletAge1000TissReset.csv')