In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import multiprocessing
from multiprocessing import Pool
multiprocessing.set_start_method('fork')


In [2]:
file_path = "/data/projects/shared_data/collab_data/subsample/all_input.feather"

In [3]:
df= pd.read_feather(file_path)

In [4]:
df.head()

Unnamed: 0,Bladder,Fallopian_Tube,Cervix_Uteri,Kidney,Liver,Ovary,Pituitary,Blood,Nerve,Testis,...,ENST00000640157.1,ENST00000640623.1,ENST00000640630.1,ENST00000638486.1,ENST00000381568.9,ENST00000622217.1,ENST00000408734.1,ENST00000385204.1,ENST00000583027.1,ENST00000638236.1
0,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.0,0.0,0.0,0.0,0.0,0.48,0.0,0.0,0.0,0.0
1,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.0,0.0,0.0,0.0,0.0,1.54,0.0,0.0,0.0,0.0
4,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.1,0.0,0.0,0.0,0.07,0.68,0.0,0.0,0.0,0.0


In [5]:
list(df.columns)

['Bladder',
 'Fallopian_Tube',
 'Cervix_Uteri',
 'Kidney',
 'Liver',
 'Ovary',
 'Pituitary',
 'Blood',
 'Nerve',
 'Testis',
 'Prostate',
 'Small_Intestine',
 'Colon',
 'Stomach',
 'Esophagus',
 'Pancreas',
 'Spleen',
 'Lung',
 'Thyroid',
 'Adrenal_Gland',
 'Brain',
 'Salivary_Gland',
 'Skin',
 'Breast',
 'Vagina',
 'Uterus',
 'Heart',
 'Blood_Vessel',
 'Muscle',
 'Adipose_Tissue',
 'sample',
 'ENST00000373020.8',
 'ENST00000494424.1',
 'ENST00000496771.5',
 'ENST00000612152.4',
 'ENST00000614008.4',
 'ENST00000373031.4',
 'ENST00000485971.1',
 'ENST00000371582.8',
 'ENST00000371584.8',
 'ENST00000371588.9',
 'ENST00000413082.1',
 'ENST00000466152.5',
 'ENST00000494752.1',
 'ENST00000367770.5',
 'ENST00000367771.10',
 'ENST00000367772.8',
 'ENST00000423670.1',
 'ENST00000470238.1',
 'ENST00000286031.10',
 'ENST00000359326.8',
 'ENST00000413811.3',
 'ENST00000459772.5',
 'ENST00000481744.5',
 'ENST00000496973.5',
 'ENST00000374003.7',
 'ENST00000374004.5',
 'ENST00000374005.7',
 'ENST000

In [6]:
tissue_name = "Bladder"
no_of_iteration = 5
sample_size = 0.8

In [7]:
print(df.groupby([tissue_name]).size())
groups_name = list(df.groupby([tissue_name]).groups.keys())
print(groups_name)

Bladder
no_tissue     17361
yes_tissue       21
dtype: int64
['no_tissue', 'yes_tissue']


In [8]:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
    population = len(df)
    #print(population, size)
    size = __smpl_size(population, size)
    tmp = df.loc[:,strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()

    tmp_grpd['samp_size'] = np.round(size/population * tmp_grpd['size']).astype(int)

    # controlling variable to create the dataframe or append to it
    first = True 
    for i in range(len(tmp_grpd)):
        # query generator for each iteration
        qry=''
        for s in range(len(strata)):
            stratum = strata[s]
            value = tmp_grpd.iloc[i][stratum]
            n = tmp_grpd.iloc[i]['samp_size']

            if type(value) == str:
                value = "'" + str(value) + "'"
            
            if s != len(strata)-1:
                qry = qry + stratum + ' == ' + str(value) +' & '
            else:
                qry = qry + stratum + ' == ' + str(value)
        
        # final dataframe
        if first:
            stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            first = False
        else:
            tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            stratified_df = stratified_df.append(tmp_df, ignore_index=True)
    
    return stratified_df

In [9]:
def __smpl_size(population, size):
    '''
    A function to compute the sample size. If not informed, a sampling 
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    '''
    if size is None:
        cochran_n = np.round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
        n = np.round(cochran_n/(1+((cochran_n -1) /population)))
    elif size >= 0 and size < 1:
        n = np.round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n

In [10]:
NUM_WORKERS = 5

def prepare_and_save(i):
    print(i)
    return stratified_sample(df=df, strata=[tissue_name], size=sample_size)

p = Pool(NUM_WORKERS)

In [11]:

output_ = p.map(prepare_and_save, np.arange(5))

23410






  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)


In [17]:
output

['a0', 'b1', 'c2', 'd3', 'e4', 'f5']

In [17]:
no_of_iteration

5

In [12]:
temp_list = []
for i in range(no_of_iteration):
    temp_stratified = stratified_sample(df=df, strata=[tissue_name], size=sample_size)
    temp_list.append(temp_stratified)

  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)


In [14]:
temp_list[0]

Unnamed: 0,index,Bladder,Fallopian_Tube,Cervix_Uteri,Kidney,Liver,Ovary,Pituitary,Blood,Nerve,...,ENST00000640157.1,ENST00000640623.1,ENST00000640630.1,ENST00000638486.1,ENST00000381568.9,ENST00000622217.1,ENST00000408734.1,ENST00000385204.1,ENST00000583027.1,ENST00000638236.1
0,16764,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.00,0.00,0.00,0.00,0.03,0.0,0.0,0.0,1.58
1,13612,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.00
2,15414,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.00,0.00,0.00,0.00,0.37,0.0,0.0,0.0,0.19
3,8275,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.00,0.00,0.00,0.00,1.06,0.0,0.0,0.0,0.00
4,7140,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.01,0.00,0.14,0.00,1.49,0.0,0.0,0.0,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13901,14265,yes_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.07,0.00,0.00,0.00,2.77,0.0,0.0,0.0,0.23
13902,13219,yes_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,1.06
13903,14174,yes_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.00,0.01,0.00,0.00,0.00,0.00,0.0,0.0,0.0,3.59
13904,13915,yes_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,no_tissue,...,0.34,0.34,0.27,0.00,0.23,0.00,0.0,0.0,0.0,0.17
