In [1]:
import numpy as np
import pandas as pd

import Modules.rnaseqTools as rnaT
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
def read_dataset(dataset):
    """Load a dataset's expression table and label its cells.

    Reads ``Datasets/<dataset>-tpm.tsv`` and ``Datasets/<dataset>-labels.tsv``
    (tab-separated, first row as header, first column as index).

    The columns of the expression table are replaced by a two-level
    ``(Cell, CellType)`` MultiIndex built from the label table's index and
    its ``CellType`` column. The labels are attached by position, so the
    label rows must be in the same order as the expression columns.

    Parameters
    ----------
    dataset : str
        Prefix used to build both file names.

    Returns
    -------
    pandas.DataFrame
        The expression table transposed, i.e. one row per
        ``(Cell, CellType)`` entry and one column per original row.
    """
    read_opts = dict(sep='\t', header=0, index_col=0)

    expression = pd.read_csv('Datasets/%s-tpm.tsv' % dataset, **read_opts)
    labels = pd.read_csv('Datasets/%s-labels.tsv' % dataset, **read_opts)

    # Positional relabelling: column i receives the i-th label row.
    expression.columns = pd.MultiIndex.from_arrays(
        [labels.index, labels.CellType],
        names=['Cell', 'CellType'],
    )

    return expression.T

def get_gini_genes(cutoff=.10):
    """Return the genes whose adjusted p-value passes ``cutoff``.

    Reads ``Gini Monte Carlo/Gene Scores.tsv`` (tab-separated, first row as
    header, first column as index), keeps the rows with ``P_Adj <= cutoff``
    and orders them by ``Up_Down`` and then ``Cutoff_Age``, both descending.

    Parameters
    ----------
    cutoff : float, optional
        Inclusive upper bound on the ``P_Adj`` column. Defaults to 0.10.

    Returns
    -------
    list
        Index labels of the retained rows, in sorted order.
    """
    fname = 'Gini Monte Carlo/Gene Scores.tsv'
    df_scores = pd.read_csv(fname, sep='\t', header=0, index_col=0)

    # Filter and sort in one chain. Sorting a boolean-filtered slice with
    # inplace=True can raise SettingWithCopyWarning; returning a new frame
    # gives the same ordering without mutating a slice.
    df_significant = (
        df_scores
        .loc[df_scores.P_Adj <= cutoff]
        .sort_values(['Up_Down', 'Cutoff_Age'], ascending=False)
    )

    return df_significant.index.tolist()

def get_testing_data():
    """Assemble the training set and the full evaluation matrix.

    Loads the ``Lab_Pvalb`` dataset, keeps the genes returned by
    ``get_gini_genes(cutoff=.1)`` and applies ``log2(1 + x)``. Per-cell
    annotations are read from
    ``Datasets/Lab_Pvalb-transcriptional_labels.tsv`` and reordered to match
    the ``Cell`` level of the expression index.

    Training cells are those whose ``Morph-PV-types`` annotation equals
    ``'vBC'`` and whose ``Age`` is either above 25 ("old") or below 21
    ("young"). Cells with an age in between are not used for training.

    Returns
    -------
    df_train : pandas.DataFrame
        Old cells followed by young cells.
    is_old : numpy.ndarray
        Integer column vector of shape ``(len(df_train), 1)``; 1 where the
        row's index entry is among the old cells, otherwise 0.
    df_lin : pandas.DataFrame
        The log-transformed matrix for every cell in the dataset.
    """
    # Expression restricted to the selected genes, on a log2(1 + x) scale.
    expression = read_dataset('Lab_Pvalb')
    selected_genes = get_gini_genes(cutoff=.1)
    df_lin = np.log2(1+expression.loc[:,selected_genes])

    # Per-cell annotations, put in the same row order as the expression matrix.
    df_labels = pd.read_csv(
        'Datasets/Lab_Pvalb-transcriptional_labels.tsv',
        sep='\t', header=0, index_col=0,
    )
    df_labels = df_labels.loc[df_lin.index.get_level_values('Cell')]

    # Positional boolean masks (plain arrays) for the two age groups of vBC cells.
    is_vbc = df_labels['Morph-PV-types']=='vBC'
    old_mask = np.logical_and(is_vbc, df_labels.Age>25).values
    young_mask = np.logical_and(is_vbc, df_labels.Age<21).values
    df_old = df_lin.loc[old_mask]
    df_young = df_lin.loc[young_mask]

    df_train = pd.concat((df_old, df_young), axis=0)
    # Column vector of 0/1 labels so downstream code can index it as is_old[:,0].
    is_old = df_train.index.isin(df_old.index).astype(int)[:,np.newaxis]

    return df_train, is_old, df_lin

def equalize(df_train, is_old, random_state=None):
    """Balance the two classes by randomly subsampling the larger one.

    Parameters
    ----------
    df_train : pandas.DataFrame
        One row per sample, in the same order as ``is_old``.
    is_old : numpy.ndarray
        2-D array whose first column holds the class label (0 or 1) of each
        row of ``df_train``.
    random_state : None, int, or NumPy random generator, optional
        Controls the subsampling. ``None`` (the default) draws from NumPy's
        global ``np.random`` state, exactly as before this parameter existed.
        An integer seeds a private ``np.random.RandomState``. Any object with
        a compatible ``choice(a, size=..., replace=...)`` method (for example
        a ``RandomState`` or ``Generator`` instance) is used directly.

    Returns
    -------
    df_train : pandas.DataFrame
        The selected rows. When subsampling happens, class-0 rows come first
        and class-1 rows second; when the classes are already the same size
        every row is kept in its original order.
    is_old : numpy.ndarray
        The labels for the selected rows, in the same order.
    """
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    inds = np.arange(is_old.shape[0],dtype=int)
    inds_0 = inds[is_old[:,0]==0]
    inds_1 = inds[is_old[:,0]==1]
    count_0 = inds_0.size
    count_1 = inds_1.size

    # Sample without replacement so no row is duplicated in the balanced set.
    if count_1 > count_0:
        inds_sub_1 = rng.choice(inds_1, size=count_0, replace=False)
        inds = np.hstack((inds_0, inds_sub_1))
    elif count_0 > count_1:
        inds_sub_0 = rng.choice(inds_0, size=count_1, replace=False)
        inds = np.hstack((inds_sub_0, inds_1))
    # Equal counts: `inds` still lists every row, so nothing is dropped.

    df_train = df_train.iloc[inds,:]
    is_old = is_old[inds,:]

    return df_train, is_old

def run_test(df_train, is_old, df_eval, n=150, random_state=None):
    """Fit a random forest on a balanced subsample and classify the other cells.

    Steps:

    1. Balance the two classes with ``equalize``.
    2. Keep a stratified 80% of the balanced rows for fitting (the other 20%
       is set aside by ``train_test_split`` and not used for fitting).
    3. If there are more than ``n`` feature columns, reduce them with
       ``trim_data``.
    4. Fit a 100-tree ``RandomForestClassifier`` and predict every row of
       ``df_eval`` whose index entry was not used for fitting.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Candidate training rows (samples x features).
    is_old : numpy.ndarray
        2-D array whose first column is the 0/1 class of each training row.
    df_eval : pandas.DataFrame
        Rows to classify; must contain the feature columns used for fitting.
    n : int, optional
        Maximum number of feature columns before ``trim_data`` is applied.
    random_state : None, int, or numpy.random.RandomState, optional
        Passed to ``train_test_split`` and ``RandomForestClassifier`` so the
        split and the forest can be reproduced. The default ``None`` keeps
        the previous unseeded behaviour. The balancing step is called without
        a seed here, so it still follows NumPy's global random state.

    Returns
    -------
    pandas.Series
        Predicted class for each row of ``df_eval`` not used for fitting,
        indexed like those rows.
    """
    df_train, is_old = equalize(df_train, is_old)
    is_old, _, train_cells, _ = train_test_split(
        is_old, df_train.index,
        test_size=0.2, stratify=is_old[:,0], random_state=random_state,
    )
    df_train = df_train.loc[train_cells,:]
    if df_train.shape[1] > n:
        # NOTE(review): `trim_data` is not defined or imported in the visible
        # notebook cells; confirm where it comes from, otherwise this branch
        # raises NameError on a fresh kernel.
        df_train = trim_data(df_train, n=n)
    # Everything that was not used for fitting is predicted, restricted to
    # the feature columns the model was trained on.
    df_test = df_eval.loc[~(df_eval.index.isin(df_train.index)),df_train.columns]

    clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    clf = clf.fit(df_train.values, is_old[:,0])

    predictions = clf.predict(df_test.values)

    return pd.Series(predictions, index=df_test.index)

In [3]:
%%time

# Build the inputs once: candidate training rows, their 0/1 "old" labels,
# and the full log-transformed matrix that will be classified.
df_train, is_old, df_lin = get_testing_data()

CPU times: user 319 ms, sys: 8.18 ms, total: 327 ms
Wall time: 327 ms


In [4]:
%%time

# Repeat the balance / split / fit / predict procedure 1000 times; each column
# holds the predictions of one run. Assignment aligns on the index, so cells
# used for fitting in a given run have no prediction and stay NaN in that column.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_predict = pd.DataFrame(np.nan, index=df_lin.index, columns=np.arange(1000,dtype=int))
for col in df_predict.columns:
    df_predict[col] = run_test(df_train, is_old, df_lin, n=150)

CPU times: user 2min 10s, sys: 481 ms, total: 2min 11s
Wall time: 2min 11s


In [5]:
# Persist the per-run predictions (rows: cells, columns: run number) as a
# tab-separated file.
df_predict.to_csv('Datasets/Lab_Pvalb-Age-Predictions.tsv', sep='\t')