__First we need to create a dummy dataset. Let's make a set with 5 samples and 2 negatives. There will be 100 OTUs.__

In [4]:
# import random for working with random numbers
import random 
# import pandas for working with dataframes
import pandas as pd
  
def Rand(start, end, number_of_numbers):
    """Return a list of random integers.

    start = lowest value that can be drawn (inclusive)
    end = highest value that can be drawn (inclusive)
    number_of_numbers = how many values to draw
    """
    return [random.randint(start, end) for _ in range(number_of_numbers)]

# Build the dummy abundance table: one column per sample (or negative) and one row per OTU.
# The column names are declared first so that the number of generated lists is derived
# from them instead of being hard-coded a second time (5 samples plus 2 negatives).
columns = ['sample_0', 'sample_1', 'sample_2', 'sample_3', 'sample_4', 'neg_pcr', 'neg_ext']
# number of OTUs, i.e. rows of the table; used for both the data and the index
NUMBER_OF_OTUS = 100
# largest number of reads that can be returned for one OTU (a random number that I just made up)
MAX_READS_PER_OTU = 2468

# fix the seed so that re-running the notebook from a fresh kernel rebuilds the same table
random.seed(42)

# each list represents the sequences returned from one sample (or one negative):
# NUMBER_OF_OTUS numbers between 0 (0 reads returned) and MAX_READS_PER_OTU
two_dim_list_to_create_dataframe_from = [Rand(0, MAX_READS_PER_OTU, NUMBER_OF_OTUS) for _ in columns]

# we then use pandas which is a python module for making dataframes (similar to the R structure dataframes)
# NOTE(review): the 'OUT_' prefix looks like a typo for 'OTU_'; it is kept because the
# recorded outputs in this notebook use these labels - confirm before renaming
index = [f'OUT_{num}' for num in range(NUMBER_OF_OTUS)]
d_for_df = dict(zip(columns, two_dim_list_to_create_dataframe_from))
df = pd.DataFrame(d_for_df, columns=columns, index=index)
print(df)

        sample_0  sample_1  sample_2  sample_3  sample_4  neg_pcr  neg_ext
OUT_0       1646       152      1411      1540      2132     2378     1411
OUT_1       1586      1536       585      2011      1134     1682     1610
OUT_2        442      1424      1900      1326      1290      873     1817
OUT_3       2201      1954       367      1445      2130      931     2262
OUT_4       2077        90      1484      1760      2156     1510      162
OUT_5       2347      2387      2386       562      2455     2271     1003
OUT_6        831       899       637       265      1944      680      500
OUT_7        860       473      1147      2145       131     1122      451
OUT_8        578       160       654      1234      2198     2278     2169
OUT_9       2070      1209       828      1307       357      684     2181
OUT_10      2231       975       396       758      2062      926      452
OUT_11       265      2438      1895      1886       938     2049     1151
OUT_12      1648      159

__Then convert the absolute abundances into proportions (i.e. columns sum to 1)__

In [5]:
# df.sum(axis=0) gives one total per column; dividing the frame by that Series
# matches the totals to the columns, so each column is scaled by its own total
df = df / df.sum(axis=0)
print(df)

        sample_0  sample_1  sample_2  sample_3  sample_4   neg_pcr   neg_ext
OUT_0   0.013383  0.001161  0.011875  0.011547  0.016377  0.018278  0.010455
OUT_1   0.012895  0.011730  0.004923  0.015079  0.008711  0.012928  0.011929
OUT_2   0.003594  0.010875  0.015990  0.009943  0.009909  0.006710  0.013463
OUT_3   0.017895  0.014922  0.003089  0.010835  0.016362  0.007156  0.016760
OUT_4   0.016887  0.000687  0.012489  0.013197  0.016562  0.011606  0.001200
OUT_5   0.019082  0.018229  0.020080  0.004214  0.018859  0.017455  0.007432
OUT_6   0.006756  0.006865  0.005361  0.001987  0.014933  0.005227  0.003705
OUT_7   0.006992  0.003612  0.009653  0.016084  0.001006  0.008624  0.003342
OUT_8   0.004699  0.001222  0.005504  0.009253  0.016884  0.017509  0.016071
OUT_9   0.016830  0.009233  0.006968  0.009800  0.002742  0.005257  0.016160
OUT_10  0.018139  0.007446  0.003333  0.005684  0.015840  0.007117  0.003349
OUT_11  0.002155  0.018618  0.015948  0.014142  0.007205  0.015749  0.008528

In [7]:
# sanity check: the proportions in the first column should add up to 1
# (allowing for floating point rounding)
sum(df.iloc[:, 0].tolist())

1.0000000000000002

__YAY! - it does__

In [10]:
# order the rows by their 'neg_pcr' value, largest first, and keep only the
# row labels (the OTU names) as a plain python list
sorted_list_of_otus_neg_pcr = list(df.sort_values('neg_pcr', ascending=False).index)
print(sorted_list_of_otus_neg_pcr)

['OUT_91', 'OUT_56', 'OUT_51', 'OUT_73', 'OUT_66', 'OUT_0', 'OUT_83', 'OUT_69', 'OUT_87', 'OUT_8', 'OUT_5', 'OUT_32', 'OUT_35', 'OUT_71', 'OUT_50', 'OUT_37', 'OUT_18', 'OUT_67', 'OUT_84', 'OUT_13', 'OUT_60', 'OUT_36', 'OUT_11', 'OUT_25', 'OUT_45', 'OUT_64', 'OUT_29', 'OUT_53', 'OUT_14', 'OUT_30', 'OUT_24', 'OUT_41', 'OUT_17', 'OUT_95', 'OUT_89', 'OUT_79', 'OUT_1', 'OUT_98', 'OUT_68', 'OUT_62', 'OUT_46', 'OUT_4', 'OUT_75', 'OUT_93', 'OUT_43', 'OUT_40', 'OUT_19', 'OUT_76', 'OUT_22', 'OUT_28', 'OUT_55', 'OUT_52', 'OUT_33', 'OUT_97', 'OUT_7', 'OUT_74', 'OUT_38', 'OUT_86', 'OUT_34', 'OUT_63', 'OUT_96', 'OUT_92', 'OUT_85', 'OUT_88', 'OUT_3', 'OUT_10', 'OUT_80', 'OUT_2', 'OUT_12', 'OUT_42', 'OUT_31', 'OUT_44', 'OUT_78', 'OUT_9', 'OUT_6', 'OUT_99', 'OUT_81', 'OUT_27', 'OUT_58', 'OUT_21', 'OUT_77', 'OUT_23', 'OUT_61', 'OUT_90', 'OUT_15', 'OUT_72', 'OUT_47', 'OUT_26', 'OUT_59', 'OUT_82', 'OUT_94', 'OUT_54', 'OUT_70', 'OUT_65', 'OUT_57', 'OUT_39', 'OUT_16', 'OUT_20', 'OUT_48', 'OUT_49']


__Run through the list of OTUs, keeping track of the cumulative abundance that they represent, and stop just before that cumulative abundance would reach 80% (0.8) of the negative's total__

In [11]:
# OTUs are visited from most to least abundant in 'neg_pcr'; an OTU is kept
# only while the running total including it is still below 0.8
otu_names_that_rep_eighty_percent_list = []
cummulative_score = 0
for otu_name in sorted_list_of_otus_neg_pcr:
    # running total if this OTU were to be included
    potential_score = cummulative_score + df.loc[otu_name, 'neg_pcr']
    if not potential_score < 0.8:
        # including this OTU would take us to 0.8 or beyond, so stop here
        break
    otu_names_that_rep_eighty_percent_list.append(otu_name)
    cummulative_score = potential_score

In [12]:
# show the OTUs that were selected by the cumulative abundance loop
print(otu_names_that_rep_eighty_percent_list)

['OUT_91', 'OUT_56', 'OUT_51', 'OUT_73', 'OUT_66', 'OUT_0', 'OUT_83', 'OUT_69', 'OUT_87', 'OUT_8', 'OUT_5', 'OUT_32', 'OUT_35', 'OUT_71', 'OUT_50', 'OUT_37', 'OUT_18', 'OUT_67', 'OUT_84', 'OUT_13', 'OUT_60', 'OUT_36', 'OUT_11', 'OUT_25', 'OUT_45', 'OUT_64', 'OUT_29', 'OUT_53', 'OUT_14', 'OUT_30', 'OUT_24', 'OUT_41', 'OUT_17', 'OUT_95', 'OUT_89', 'OUT_79', 'OUT_1', 'OUT_98', 'OUT_68', 'OUT_62', 'OUT_46', 'OUT_4', 'OUT_75', 'OUT_93', 'OUT_43', 'OUT_40', 'OUT_19', 'OUT_76', 'OUT_22', 'OUT_28', 'OUT_55', 'OUT_52', 'OUT_33', 'OUT_97', 'OUT_7', 'OUT_74']


__Out of interest, let's see what proportion of OTUs we're working with to reach the magical 80%__

In [14]:
# number of selected OTUs divided by the total number of OTUs in the sorted list
print(len(otu_names_that_rep_eighty_percent_list)/len(sorted_list_of_otus_neg_pcr))

0.56


__OK, just over half, that sounds about right given the random distribution we created. Of course for a real world abundance table this value will likely be very different__

__Now for each sample, we calculate the cumulative abundance represented by these OTUs and divide by 0.8 to get our 'effect_of_negative_on_sample' metric (for the 'neg_pcr' negative)__

In [18]:
# the last two entries of `columns` are the negatives, so [:-2] leaves only the real samples
for sample in columns[:-2]:
    # add up, within this sample, the proportions of the OTUs selected from the negative
    sum_abund = sum(df.loc[otu_names_that_rep_eighty_percent_list, sample].tolist())
    # scale by the 0.8 threshold that was used when selecting the OTUs
    metric = sum_abund / 0.8
    print(f'effect of \'neg_pcr\' negative on {sample} is {metric}')

effect of 'neg_pcr' negative on sample_0 is 0.6467234440424405
effect of 'neg_pcr' negative on sample_1 is 0.6679280008553146
effect of 'neg_pcr' negative on sample_2 is 0.7080128259076599
effect of 'neg_pcr' negative on sample_3 is 0.6936467112564111
effect of 'neg_pcr' negative on sample_4 is 0.6532205407896757


__These results are obviously nothing like what we would expect from your dataset, as they are way higher. You should expect to see very small numbers. But... the process used to calculate them will be identical :)__