In [1]:
import glob
import time
import numpy as np
import pandas as pd
from statsmodels.stats import multitest

In [2]:
def read_data():
    """Load the expression table and the matching label table.

    Both files are tab-separated, with the first row used as the header and
    the first column used as the row index.

    Returns:
        (df, df_labels): the table read from 'Datasets/Lab_Pvalb-tpm.tsv'
        followed by the table read from 'Datasets/Lab_Pvalb-labels.tsv'.
    """
    def load_table(path):
        # Same reader settings for both input files.
        return pd.read_csv(path, sep='\t', header=0, index_col=0)

    expression = load_table('Datasets/Lab_Pvalb-tpm.tsv')
    labels = load_table('Datasets/Lab_Pvalb-labels.tsv')

    return expression, labels

def get_cutoffs(ages, margin=4):
    """Return candidate split points halfway between consecutive distinct ages.

    Args:
        ages: array-like of numeric ages; repeated values are allowed.
        margin: number of midpoints dropped from each end of the sorted list,
            so every returned cutoff has at least ``margin + 1`` distinct ages
            on either side. The default of 4 reproduces the slicing that was
            previously hard-coded.

    Returns:
        1-D float array of midpoints in increasing order; empty when there
        are too few distinct ages to leave anything after trimming.

    Raises:
        ValueError: if ``margin`` is negative.
    """
    if margin < 0:
        raise ValueError('margin must be non-negative')

    vals = np.unique(ages)
    midpoints = (vals[:-1] + vals[1:]) / 2

    # Clamp the stop index so a short input yields an empty slice rather than
    # a negative index that would wrap around from the end.
    stop = max(midpoints.size - margin, margin)

    return midpoints[margin:stop]

def get_candidate_data(min_count=0, celltypes=('vBC', 'hBC', 'hBiC', 'vBiC', 'vAAC')):
    """Build the binary on/off expression matrix used for the split analysis.

    Args:
        min_count: a value counts as "on" when it is strictly above this.
        celltypes: values of the label table's CellType column to keep.

    Returns:
        (df, cutoffs): a 0/1 frame of candidate genes whose columns are the
        cells' Age labels in sorted order, and the candidate age cutoffs
        produced by ``get_cutoffs`` for those columns.
    """
    expression, labels = read_data()

    # Restrict to the requested cell types and line the expression columns up
    # with the remaining label rows.
    labels = labels.loc[labels.CellType.isin(celltypes)]
    expression = expression.loc[:, labels.index]

    # A gene is a candidate when it is on in more than 5 cells and off in
    # more than 10.
    n_on = (expression.values > min_count).sum(axis=1)
    n_off = (expression.values <= min_count).sum(axis=1)
    expression = expression.loc[(n_on > 5) & (n_off > 10)]

    binary = (expression > min_count).astype(int)

    # Relabel the cells by age, then order the columns by that age.
    binary.columns = labels.Age
    binary = binary.sort_index(axis=1)

    return binary, get_cutoffs(binary.columns)

def read_shuffle_scores():
    """Load every saved shuffle batch and sort the scores within each column.

    All files matching 'Gini Monte Carlo/Score *.tsv' are stacked row-wise and
    the column labels are converted to integers. Each column is then sorted
    independently in ascending order, so a row of the result no longer
    corresponds to a single shuffle run even though the stacked row labels
    are kept.

    Returns:
        DataFrame of column-wise sorted scores.
    """
    paths = glob.glob('Gini Monte Carlo/Score *.tsv')

    batches = []
    for path in paths:
        batches.append(pd.read_csv(path, sep='\t', header=0, index_col=0))

    stacked = pd.concat(batches, axis=0)
    stacked.columns = stacked.columns.astype(int)

    # Sorting along axis 0 orders every column on its own.
    ordered = np.sort(stacked.values, axis=0)

    return pd.DataFrame(ordered, index=stacked.index, columns=stacked.columns)

def create_counts_matrix(df):
    """Build one template row per distinct row total found in ``df``.

    For every distinct row sum ``k`` of ``df`` the result has a row labelled
    ``k`` whose first ``k`` columns are 1 and whose remaining columns are 0.

    Args:
        df: frame whose row sums are whole numbers (a 0/1 matrix in this
            notebook).

    Returns:
        Integer DataFrame indexed by the distinct row sums, with the same
        columns as ``df``.
    """
    totals = np.unique(df.values.sum(axis=1))
    positions = np.arange(df.shape[1])

    # Column j of the row for total k is 1 exactly when j < k.
    leading_ones = positions[np.newaxis, :] < totals[:, np.newaxis]

    return pd.DataFrame(leading_ones.astype(np.int64), index=totals, columns=df.columns)

def calc_gini(datalist):
    """Return ``p * (1 - p)`` for every row, where ``p`` is the row mean.

    Args:
        datalist: 2-D array; for 0/1 data ``p`` is the fraction of ones in
            the row.

    Returns:
        1-D array with one value per row.
    """
    frac_on = np.mean(datalist, axis=1)
    frac_off = 1 - frac_on

    return frac_on * frac_off

def calc_gini_split(df, cutoff):
    """Score a split of the columns at ``cutoff``.

    Columns whose label is below ``cutoff`` form one group and the remaining
    columns form the other. ``calc_gini`` is evaluated on each group and the
    two results are averaged, weighted by the number of columns per group.

    Args:
        df: frame whose column labels are comparable with ``cutoff``.
        cutoff: split point applied to the column labels.

    Returns:
        1-D array with one weighted score per row of ``df``.
    """
    values = df.values
    is_below = df.columns < cutoff
    is_rest = ~is_below

    n_below = is_below.sum()
    n_rest = is_rest.sum()

    weighted_sum = (calc_gini(values[:, is_below]) * n_below
                    + calc_gini(values[:, is_rest]) * n_rest)

    return weighted_sum / is_below.size

def get_gini(df, cutoffs):
    """Find, for every row, the cutoff that minimises the split score.

    Args:
        df: frame with one row per gene; column labels are compared against
            each cutoff by ``calc_gini_split``.
        cutoffs: iterable of candidate split points.

    Returns:
        (base, splits, ginis, best_cutoffs):
            base: array from ``calc_gini`` on the unsplit rows.
            splits: Series with the smallest split score per row.
            ginis: Series ``base - splits``.
            best_cutoffs: Series with the cutoff that achieved ``splits``.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_split = pd.DataFrame(np.nan, index=df.index, columns=cutoffs)

    for cutoff in cutoffs:
        df_split[cutoff] = calc_gini_split(df, cutoff)

    base = calc_gini(df.values)
    splits = df_split.min(axis=1)
    ginis = base - splits
    # Separate name so the ``cutoffs`` argument is not silently rebound.
    best_cutoffs = df_split.idxmin(axis=1)

    return base, splits, ginis, best_cutoffs

def get_shuffle_scores(df_counts, cutoffs, n=10, rng=None):
    """Score the count templates under ``n`` random relabellings of the columns.

    On every run the column labels of a private copy of ``df_counts`` are
    permuted (the data stay in place) and the third output of ``get_gini``
    is stored as one row of the result.

    Args:
        df_counts: template frame, e.g. from ``create_counts_matrix``.
        cutoffs: candidate split points passed on to ``get_gini``.
        n: number of shuffles.
        rng: optional object with a ``permutation`` method, such as a
            ``numpy.random.Generator`` or ``RandomState``, for reproducible
            runs. When None the global ``np.random`` state is used, as before.

    Returns:
        DataFrame with one row per run (index named 'Run') and one column per
        row label of ``df_counts`` (columns named 'Counts').
    """
    permute = np.random.permutation if rng is None else rng.permutation

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_scores = pd.DataFrame(np.nan, index=np.arange(n, dtype=int), columns=df_counts.index)
    # Work on a copy so the caller's column labels are left untouched.
    df_counts = df_counts.copy()
    n_columns = df_counts.shape[1]

    for index in range(n):
        df_counts.columns = df_counts.columns[permute(n_columns)]
        df_scores.loc[index] = get_gini(df_counts, cutoffs)[2]

    df_scores.columns.name = 'Counts'
    df_scores.index.name = 'Run'

    return df_scores

def add_p_values(df_scores, df_mont):
    """Fill the P_Value and P_Adj columns of ``df_scores`` in place.

    Each gene is compared with the Monte Carlo scores stored under its own
    Count, and the resulting p-values are adjusted with Benjamini-Hochberg
    ('fdr_bh').

    Args:
        df_scores: frame with 'Count' and 'Gini_Score' columns; modified in
            place.
        df_mont: frame with one column per Count value. Each column must be
            sorted in ascending order because it is searched with
            ``np.searchsorted``.

    Returns:
        None.
    """
    for val in np.unique(df_scores['Count']):
        is_val = df_scores['Count'] == val
        monte_scores = df_mont[val].values

        # Number of null scores strictly below each observed score.
        ranks = np.searchsorted(monte_scores, df_scores.loc[is_val, 'Gini_Score'])
        p_vals = 1 - (ranks - .5) / monte_scores.size

        # A score below every null draw has rank 0, which would give
        # 1 + 0.5 / size; cap it so no p-value exceeds 1.
        df_scores.loc[is_val, 'P_Value'] = np.minimum(p_vals, 1.0)

    # Bracket assignment writes the column even if it did not exist yet;
    # attribute assignment would only set a Python attribute in that case.
    df_scores['P_Adj'] = multitest.multipletests(df_scores['P_Value'].values, method='fdr_bh')[1]

    return

def get_gene_scores(df, cutoffs):
    """Score every gene and attach Monte Carlo p-values read from disk.

    Args:
        df: frame with one row per gene, as accepted by ``get_gini``.
        cutoffs: candidate split points.

    Returns:
        DataFrame indexed like ``df`` with the columns Base_Score,
        Split_Score, Gini_Score, Cutoff_Age, Count, P_Value and P_Adj.
    """
    columns = ['Base_Score', 'Split_Score', 'Gini_Score', 'Cutoff_Age', 'Count', 'P_Value', 'P_Adj']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_scores = pd.DataFrame(np.nan, index=df.index, columns=columns)

    base, splits, ginis, best_cutoffs = get_gini(df, cutoffs)
    df_scores['Base_Score'] = base
    df_scores['Split_Score'] = splits
    df_scores['Gini_Score'] = ginis
    df_scores['Cutoff_Age'] = best_cutoffs
    # Row total; for a 0/1 frame this is the number of cells with the gene on.
    df_scores['Count'] = df.sum(axis=1)

    df_mont = read_shuffle_scores()
    add_p_values(df_scores, df_mont)

    return df_scores

def add_regulation_direction(df_scores, df):
    """Write an 'Up'/'Down' label into ``df_scores['Up_Down']`` in place.

    For every gene the mean of ``df`` over columns below its Cutoff_Age is
    compared with the mean over columns above it. The label is 'Up' when the
    later mean is strictly larger and 'Down' otherwise.

    Args:
        df_scores: frame with a 'Cutoff_Age' column, indexed like ``df``;
            modified in place.
        df: frame whose column labels are comparable with the cutoff ages.

    Returns:
        None.
    """
    cutoff_ages = df_scores['Cutoff_Age']

    # Collect the labels in an object Series. Writing strings straight into a
    # float64 NaN column is a deprecated incompatible-dtype assignment in
    # recent pandas.
    if 'Up_Down' in df_scores.columns:
        direction = df_scores['Up_Down'].astype(object)
    else:
        direction = pd.Series(np.nan, index=df_scores.index, dtype=object)

    for age in np.unique(cutoff_ages.dropna()):
        is_age = cutoff_ages == age
        younger = df.columns < age
        older = df.columns > age

        before = df.loc[is_age, younger].mean(axis=1)
        after = df.loc[is_age, older].mean(axis=1)

        # A NaN mean compares False and therefore yields 'Down'.
        direction.loc[is_age] = np.where(after.values > before.values, 'Up', 'Down')

    df_scores['Up_Down'] = direction

    return

def calc_gene_scores(df, df_mont, cutoffs):
    """Score every gene, attach p-values and the regulation direction.

    Args:
        df: frame with one row per gene, as accepted by ``get_gini``.
        df_mont: Monte Carlo scores with one column per Count value, as
            expected by ``add_p_values``.
        cutoffs: candidate split points.

    Returns:
        DataFrame indexed like ``df`` with the columns Base_Score,
        Split_Score, Gini_Score, Cutoff_Age, Count, P_Value, P_Adj and
        Up_Down.
    """
    columns = ['Base_Score', 'Split_Score', 'Gini_Score', 'Cutoff_Age', 'Count', 'P_Value', 'P_Adj', 'Up_Down']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_scores = pd.DataFrame(np.nan, index=df.index, columns=columns)
    # Up_Down receives string labels, so it must not stay a float64 column.
    df_scores['Up_Down'] = df_scores['Up_Down'].astype(object)

    base, splits, ginis, best_cutoffs = get_gini(df, cutoffs)
    df_scores['Base_Score'] = base
    df_scores['Split_Score'] = splits
    df_scores['Gini_Score'] = ginis
    df_scores['Cutoff_Age'] = best_cutoffs
    # Row total; for a 0/1 frame this is the number of cells with the gene on.
    df_scores['Count'] = df.sum(axis=1)

    add_p_values(df_scores, df_mont)
    add_regulation_direction(df_scores, df)

    return df_scores

def get_regulation(df, cutoff=0.05):
    """Split the significant genes into up- and down-regulated lists.

    Reads 'Gini Monte Carlo/Gene Scores.tsv', keeps genes whose P_Adj is at
    most ``cutoff``, and compares each gene's mean in ``df`` over columns
    below its Cutoff_Age with its mean over columns above it.

    Args:
        df: frame containing every significant gene as a row label; column
            labels are comparable with the stored cutoff ages.
        cutoff: largest adjusted p-value that is still kept.

    Returns:
        (up_genes, down_genes): lists of gene labels in file order. A gene is
        "up" when its later mean is strictly larger. Every other kept gene,
        including ties and undefined means, is reported in ``down_genes``,
        which matches the previous behaviour.
    """
    fname = 'Gini Monte Carlo/Gene Scores.tsv'
    df_scores = pd.read_csv(fname, sep='\t', header=0, index_col=0)
    df_scores = df_scores.loc[df_scores.P_Adj <= cutoff]
    df = df.loc[df_scores.index]

    # Gather the per-gene means in plain lists instead of writing them one
    # cell at a time into a NaN-filled frame.
    before_means = []
    after_means = []
    for gene, age in df_scores['Cutoff_Age'].items():
        before_means.append(df.loc[gene, df.columns < age].mean())
        after_means.append(df.loc[gene, df.columns > age].mean())

    before = np.asarray(before_means, dtype=float)
    after = np.asarray(after_means, dtype=float)

    # Comparisons involving NaN are False, so such genes fall into the
    # "down" list.
    up_regulated = before < after

    genes = df_scores.index
    up_genes = genes[up_regulated].tolist()
    down_genes = genes[~up_regulated].tolist()

    return up_genes, down_genes

In [3]:
%%time

# Build the 0/1 expression matrix for vBC cells only (a value counts as "on"
# when it is above 0.5), together with the candidate age cutoffs.
df, cutoffs = get_candidate_data(min_count=.5, celltypes=('vBC',))
# One template row per distinct number of "on" cells found in df; this is the
# input that gets shuffled in the Monte Carlo cell.
df_counts = create_counts_matrix(df)

CPU times: user 439 ms, sys: 30.9 ms, total: 470 ms
Wall time: 471 ms


In [4]:
# Wall-clock start, used only for the progress printout.
t0 = time.time()

# Each pass runs 10000 column-label shuffles and writes them to its own
# numbered TSV file.
# NOTE(review): the shuffles are not seeded, so re-running this cell overwrites
# the files with different draws. The saved output stops at batch 25, so the
# full 100 batches may not exist on disk - confirm before relying on them.
for i in range(0,100):
    df_scores = get_shuffle_scores(df_counts, cutoffs, n=10000)
    df_scores.to_csv('Gini Monte Carlo/Score %04d.tsv' % i, sep='\t')
    dt = time.time() - t0
    # Prints: batch index, total elapsed seconds, mean seconds per batch.
    print(i, dt, dt / (i+1))

0 81.63007760047913 81.63007760047913
1 158.6629936695099 79.33149683475494
2 236.39791345596313 78.79930448532104
3 314.2865164279938 78.57162910699844
4 390.7887713909149 78.15775427818298
5 467.1668965816498 77.86114943027496
6 543.1972596645355 77.59960852350507
7 615.4953827857971 76.93692284822464
8 679.4735758304596 75.49706398116217
9 765.206228017807 76.5206228017807
10 858.2720658779144 78.02473326162858
11 951.8454103469849 79.32045086224873
12 1047.3000209331512 80.56154007178087
13 1146.8424470424652 81.91731764589038
14 1248.133814573288 83.20892097155253
15 1339.4933545589447 83.71833465993404
16 1420.2861008644104 83.54624122731826
17 1502.330608844757 83.46281160248651
18 1585.2936277389526 83.43650672310277
19 1668.2429156303406 83.41214578151703
20 1751.141770362854 83.3877033506121
21 1834.1603367328644 83.37092439694838
22 1926.349301815033 83.75431747021882
23 2023.6311597824097 84.31796499093373
24 2123.181456565857 84.92725826263428
25 2242.4544949531555 86.2482

In [5]:
%%time

# Load every saved shuffle batch into one table whose columns are each sorted
# in ascending order (one column per count value).
df_mont = read_shuffle_scores()

CPU times: user 6.95 s, sys: 244 ms, total: 7.19 s
Wall time: 7.27 s


In [6]:
%%time

# Score every gene against the Monte Carlo table, order the genes by adjusted
# p-value and save the result. This rebinds df_scores, which the Monte Carlo
# cell used for a shuffle batch.
df_scores = calc_gene_scores(df, df_mont, cutoffs)
df_scores.sort_values('P_Adj', inplace=True)
df_scores.to_csv('Gini Monte Carlo/Gene Scores.tsv', sep='\t')

CPU times: user 325 ms, sys: 0 ns, total: 325 ms
Wall time: 323 ms


In [7]:
# Reload the saved gene scores and keep genes with adjusted p-value below 0.1.
# NOTE(review): this rebinds `df`, which the earlier cells use for the
# expression matrix; re-running those cells after this one requires calling
# get_candidate_data again first.
df = pd.read_csv('Gini Monte Carlo/Gene Scores.tsv', sep='\t', header=0, index_col=0)
df = df.loc[df.P_Adj<0.1]
# Descending sort on the label places 'Up' genes ahead of 'Down' genes.
df.sort_values('Up_Down', inplace=True, ascending=False)
genes = df.index.tolist()
# Write one gene identifier per line.
with open('References/GO/Genes_Gini.txt', 'w') as w:
    w.write('\n'.join(genes))

In [None]:
# NOTE(review): stray scratch cell. The bare name `trpc3` (presumably a gene
# symbol jotted down for later) is not defined anywhere visible and would raise
# NameError under Restart & Run All, so it is kept here as a comment instead.
# trpc3