# scRFE source V3 - kidney split by celltype

In [1]:
# cleaning up code after meeting with Sevahn

In [2]:
# all code broken up into separate arguments for scRFE

In [1]:
# Imports 
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import read_h5ad
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

In [2]:
# read in data 
adata = read_h5ad('/Users/madelinepark/Downloads/Kidney_facs.h5ad')
tiss = adata

In [5]:
# tiss.obs['cell_ontology_class']

In [6]:
# split data for training
def loc_split_function(tiss, feature='cell_ontology_class'):
    tiss.obs['feature_type_of_interest'] = 'rest'
    for c in list(set(tiss.obs[feature])): 
        feature_of_interest = c
        tiss.obs.loc[tiss.obs[tiss.obs[feature] == feature_of_interest].index,'feature_type_of_interest'] = feature_of_interest
        feat_labels = tiss.var_names 
        X = tiss.X
        y = tiss.obs['feature_type_of_interest']
    return X, y, feature, feat_labels #this is returning only the last thing in the loop 

In [7]:
# create random forest and selector, then train
def train_function(X, y, test_size, random_state):
    print('training...')
    loc_split = loc_split_function(tiss=tiss, feature='cell_ontology_class')
    X = loc_split[0]
    y = loc_split[1]
    feat_labels = loc_split[3] #this should not be hardcoded #genes be argument
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0) 
    clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True)
    selector = RFECV(clf, step=0.2, cv=3, n_jobs=4)
    clf.fit(X_train, y_train)
    selector.fit(X_train, y_train)
    feature_selected = feat_labels[selector.support_]
    return selector, clf, feat_labels, feature_selected, selector.estimator_.feature_importances_, X_train, X_test, y_train, y_test

In [8]:
cd /Users/madelinepark/src3/scRFE/scRFE-results

/Users/madelinepark/src3/scRFE/scRFE-results


In [9]:
# result write
def result_write (c, feature_selected, tiss, feature='cell_ontology_class',test_size=0.05, random_state=0): 
    results_df = pd.DataFrame()
    print('result writing...')
    loc_split = loc_split_function(tiss=tiss, feature= feature)
    X = loc_split[0]
    y = loc_split[1]
    train = train_function(X, y, test_size, random_state)
    feat_labels = train[0] 
    feature_selected = train[3]
    gini_scores = train[4]
    column_headings = []
    column_headings.append(c)
    column_headings.append(c + '_gini')
    resaux = pd.DataFrame(columns=column_headings)
    resaux[c] = feature_selected 
    resaux[c + '_gini'] = (gini_scores) 
    results_df = pd.concat([results_df,resaux],axis=1)
    tiss.obs['feature_type_of_interest'] = 'rest'
    file_name = feature + c + ".csv"
#     results_df.sort_values(by = 'gini_index', ascending = True, axis = 0)
    print(results_df)
    results_df.to_csv(file_name) #trying to save results as a csv
    return results_df

In [10]:
# gini_3_m = result_write (c='3m', feature_selected, tiss, feature='age',test_size=0.05, random_state=0)
# gini_24_m = result_write (c='24m', feature_selected, tiss, feature='age',test_size=0.05, random_state=0)

In [11]:
# combined functions
def scRFE (tiss=tiss, X=tiss.X, feature='cell_ontology_class', n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True, test_size = 0.05, step=0.2, cv=5) :
    all_trees = []
    for c in list(set(tiss.obs[feature])): 
        print(c)
        loc_split = loc_split_function(tiss=tiss, feature= feature)
        X = loc_split[0] #change age to feature
        y = loc_split[1]
        feature = loc_split[2]
        feat_labels = loc_split[3]
        train = train_function(X, y, test_size, random_state)
        feature_selected = train[1]
        X_train = train[2]
        X_test = train[3]
        y_train = train[4]
        y_test = train[5]        
        all_trees += [result_write(c, feature_selected, tiss, feature=feature,test_size=0.05, random_state=0)]
        #returns data frames from SCRFE in order
        tiss.obs['age_type_of_interest'] = 'rest'
    return all_trees
#         results_df.to_csv('scRFE-results.csv') #trying to save results as a csv - this didnt work

In [None]:
scRFE(tiss=tiss, feature='cell_ontology_class', n_estimators=10, random_state=0, n_jobs=-1, oob_score=True, test_size = 0.05, step=0.2, cv=5)

In [13]:
makeAllTrees = scRFE(tiss=tiss, feature='cell_ontology_class', n_estimators=10, random_state=0, n_jobs=-1, oob_score=True, test_size = 0.05, step=0.2, cv=5)
# age3 = makeAllTrees[1]
# age24 = makeAllTrees[0]

kidney loop of Henle ascending limb epithelial cell
training...
result writing...
training...
     kidney loop of Henle ascending limb epithelial cell  \
0                                         0610005C13Rik    
1                                         0610007C21Rik    
2                                         0610007N19Rik    
3                                         0610007P14Rik    
4                                         0610009D07Rik    
5                                         0610010O12Rik    
6                                         0610011F06Rik    
7                                         0610012G03Rik    
8                                         0610012H03Rik    
9                                         0610031J06Rik    
10                                        0610037L13Rik    
11                                        0610040J01Rik    
12                                        0910001L09Rik    
13                                            100043387    
14    

result writing...
training...
     kidney collecting duct principal cell  \
0                            0610005C13Rik   
1                            0610007C21Rik   
2                            0610007N19Rik   
3                            0610007P14Rik   
4                            0610009D07Rik   
5                            0610010O12Rik   
6                            0610011F06Rik   
7                            0610012G03Rik   
8                            0610012H03Rik   
9                            0610031J06Rik   
10                           0610037L13Rik   
11                           0610040J01Rik   
12                           0910001L09Rik   
13                               100043387   
14                           1110001A16Rik   
15                           1110001J03Rik   
16                           1110002B05Rik   
17                           1110003E01Rik   
18                           1110004F10Rik   
19                           1110005A03Rik   
20  

result writing...
training...
             B cell  B cell_gini
0     0610005C13Rik     0.000071
1     0610007C21Rik     0.000089
2     0610007N19Rik     0.000072
3     0610007P14Rik     0.000040
4     0610009D07Rik     0.000092
5     0610010O12Rik     0.000092
6     0610011F06Rik     0.000169
7     0610012G03Rik     0.000067
8     0610012H03Rik     0.000112
9     0610031J06Rik     0.000039
10    0610037L13Rik     0.000039
11    0610040J01Rik     0.000045
12    0910001L09Rik     0.000058
13        100043387     0.000087
14    1110001A16Rik     0.000051
15    1110001J03Rik     0.000181
16    1110002B05Rik     0.000053
17    1110003E01Rik     0.000210
18    1110004F10Rik     0.000059
19    1110005A03Rik     0.000040
20    1110008F13Rik     0.000084
21    1110008P14Rik     0.000042
22    1110032A03Rik     0.000063
23    1110038B12Rik     0.000008
24    1110038D17Rik     0.000036
25    1110049F12Rik     0.000045
26    1110051M20Rik     0.000061
27    1110057K04Rik     0.000047
28    1110059

result writing...
training...
     mesangial cell  mesangial cell_gini
0     0610005C13Rik             0.000071
1     0610007C21Rik             0.000089
2     0610007N19Rik             0.000072
3     0610007P14Rik             0.000040
4     0610009D07Rik             0.000092
5     0610010O12Rik             0.000092
6     0610011F06Rik             0.000169
7     0610012G03Rik             0.000067
8     0610012H03Rik             0.000112
9     0610031J06Rik             0.000039
10    0610037L13Rik             0.000039
11    0610040J01Rik             0.000045
12    0910001L09Rik             0.000058
13        100043387             0.000087
14    1110001A16Rik             0.000051
15    1110001J03Rik             0.000181
16    1110002B05Rik             0.000053
17    1110003E01Rik             0.000210
18    1110004F10Rik             0.000059
19    1110005A03Rik             0.000040
20    1110008F13Rik             0.000084
21    1110008P14Rik             0.000042
22    1110032A03Rik        

In [None]:
# run function
# scRFE(tiss=tiss, feature='age', n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True, test_size = 0.05, step=0.2, cv=5)

In [51]:
# how do i get it to append after so the last dataframe doesnt completely overwrite the previous one

'/Users/madelinepark/src3/scRFE/scRFE-results'

In [None]:
# # combined functions
# def scRFE_save (tiss=tiss, X=tiss.X, feature='age', n_estimators=1000, random_state=0, n_jobs=-1, oob_score=True, test_size = 0.05, step=0.2, cv=5) :
#     for c in list(set(tiss.obs[feature])): 
#         print(c)
#         X = loc_split(tiss=tiss, feature= feature)[0] #change age to feature
#         y = loc_split(tiss=tiss, feature= feature)[1]
#         feature = loc_split(tiss=tiss, feature= feature)[2]
#         feat_labels = loc_split(tiss=tiss, feature= feature)[3]
#         feature_selected = train(X, y, test_size, random_state)[1]
#         X_train = train(X, y, test_size, random_state)[2]
#         X_test = train(X, y, test_size, random_state)[3]
#         y_train = train(X, y, test_size, random_state)[4]
#         y_test = train(X, y, test_size, random_state)[5]
#         result_write(c, feature_selected, tiss, feature=feature,test_size=0.05, random_state=0)
#         tiss.obs['age_type_of_interest'] = 'rest'
# #         not sorted