**How to perform a keyword search through a text file in Python** 

In [1]:
%%capture

# Install the two third-party packages this notebook imports (pandas, numpy).
# The %%capture cell magic on the first line keeps pip's output out of the notebook.
!pip install pandas
!pip install numpy

In [2]:
import pandas as pd
import numpy as np 
import os, sys

In [27]:
def keyword_search(file_name,search_values,indicator_name,save_name): 
    '''Collect per-run integer values for a set of keywords in a text file.

    The file is read line by line and every line is split on ':'. The text
    before the first colon is the keyword; the text between the first and the
    second colon is its value. A line whose keyword equals ``indicator_name``
    opens a new block (one run); every other matching keyword is assigned to
    the block that was opened most recently.

    Parameters
    ----------
        file_name : string
            name of text file with outputs in it 
        search_values : list
            names of methods we are searching for in the file 
            one of the names must be the indicator of the block 
        indicator_name : string
            name of the keyword that denotes a block in the text file 
            in JP's files, this is "Info" 
            we use it to determine which information is part of which run 
        save_name : string
            suffix used for the mean column ('Nan_Mean_<save_name>') and for
            the two output files written to the current working directory:
            'counts_for_methods_<save_name>.txt' and
            'join_mean_failure_<save_name>.txt'

    Returns
    ----------
        store_df : Pandas dataframe
            one row per block and one column per entry of search_values;
            the indicator column holds the raw text after the colon, the
            method columns hold integers, NaN where a method has no line in
            that block
        join_mean_failure : Pandas dataframe
            columns 'Method', 'Nan_Mean_<save_name>' and 'Failure_Count'
            (number of blocks in which the method is missing)
        average_df : Pandas dataframe
            columns 'Method' and 'Nan_Mean_<save_name>' only

    Raises
    ----------
        KeyError
            if indicator_name is not one of search_values
        ValueError
            if the value of a method line cannot be converted with int()

    Notes
    ----------
        * Method lines that appear before the first indicator line do not
          belong to any block and are ignored.
        * If a method occurs more than once inside a block, the first
          occurrence is the one that is kept.
        * A method that is missing from every block gets a NaN mean
          (np.nanmean emits a RuntimeWarning in that case).
    '''
    if indicator_name not in search_values:
        raise KeyError('indicator_name %r must be one of search_values' % (indicator_name,))

    # One {block index: value} mapping per keyword, kept in search_values order.
    values_by_block = {op: {} for op in search_values}
    current_block = -1  # index of the block being read; -1 until the first indicator line

    # The context manager closes the file; no explicit close() is needed.
    with open(file_name) as file:
        for line in file:
            split_line = line.strip('\n').split(':')
            keyword = split_line[0]

            # Lines without a colon carry no value; other keywords are not of interest.
            if len(split_line) < 2 or keyword not in values_by_block:
                continue

            if keyword == indicator_name:
                current_block += 1
                values_by_block[keyword][current_block] = split_line[1]
            elif current_block >= 0:
                # setdefault keeps the first value seen for this method in the block.
                values_by_block[keyword].setdefault(current_block, int(split_line[1]))

    n_blocks = current_block + 1

    # Re-shape into one row per block, filling NaN where a method did not report.
    store_df = pd.DataFrame()
    for op in search_values:
        found = values_by_block[op]
        store_df[op] = [found.get(i, np.nan) for i in range(n_blocks)]

    methods = [m for m in list(store_df) if m != indicator_name]
    averages = {m: np.nanmean(store_df[m]) for m in methods}
    failure_count = {m: store_df[m].isna().sum() for m in methods}

    average_df = pd.DataFrame(list(averages.items()), columns=['Method', 'Nan_Mean_'+save_name])
    failure_df = pd.DataFrame(list(failure_count.items()), columns=['Method', 'Failure_Count'])
    # Both frames list the methods in the same order, so only the count column is appended.
    join_mean_failure = pd.concat([average_df, failure_df[['Failure_Count']]], axis=1)

    store_df.to_csv('counts_for_methods_'+save_name+'.txt',sep=',')
    join_mean_failure.to_csv('join_mean_failure_'+save_name+'.txt',sep=',')

    return store_df, join_mean_failure, average_df 


In [4]:
# Connect to Google Drive to get the data.
# google.colab is imported in this cell rather than with the other imports;
# the Drive is mounted at /content/drive, the root of the data paths used below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Navigate to the folder where the data is stored in the Drive.
# This changes the working directory, so files written with relative paths land here.
%cd /content/drive/MyDrive/jameslab/

# The same folder as a string (note the trailing slash), for building input paths.
dirname = '/content/drive/MyDrive/jameslab/'

/content/drive/MyDrive/jameslab


In [29]:
# Keywords to look up in each outlier report. 'Info' is the indicator that
# opens a new run block; the others are the methods whose counts are read.
keyword_lookup = ['Info','PCADAPT','PCADAPT q-values','PCADAPT Benjamini-Hochberg',
                  'PCADAPT Bonferroni','PCADAPT componentwise','OutFLANK qvalues',
                  'RDAadapt','LEA snfm/fst','LEA adjusted p-values','LEA lfmm/BH',
                  'LEA lfmm/non_adj','LEA lfmm2']
target_file = 'CONCAT_outliers.txt'
jp_dir = os.path.join(dirname, 'JP')

means = {}
# Sorted so the column order of the combined table is the same on every run
# (os.listdir returns entries in arbitrary order).
for fp in sorted(os.listdir(jp_dir)):
    outlier_path = os.path.join(jp_dir, fp, target_file)

    # Skip stray files and sub-folders that do not contain an outlier report.
    if not os.path.isfile(outlier_path):
        continue

    df, jdf, mean_calc = keyword_search(outlier_path, keyword_lookup, 'Info', fp)
    means[fp] = mean_calc

# One 'Nan_Mean_<folder>' column per folder, aligned on the method name.
list_df = [x.set_index('Method') for x in means.values()]
concat_means = pd.concat(list_df, axis=1, join='inner')

# Average across folders while 'Method' is still the index, so only the numeric
# mean columns take part (a string column in the frame makes mean(axis=1) fail
# on pandas >= 2.0).
concat_means['Overall_Mean'] = concat_means.mean(axis=1)
concat_means = concat_means.reset_index()

print(concat_means)

concat_means.to_csv('means_for_methods_JP.txt')

                        Method  Nan_Mean_8520  ...  Nan_Mean_8670  Overall_Mean
0                      PCADAPT      11.060000  ...      10.640000     10.366667
1             PCADAPT q-values      10.021277  ...       9.877551      9.784773
2   PCADAPT Benjamini-Hochberg       9.744681  ...       9.714286      9.647778
3           PCADAPT Bonferroni       5.978723  ...       6.326531      6.894703
4        PCADAPT componentwise      27.042553  ...      23.836735     23.838833
5             OutFLANK qvalues       0.127660  ...       0.166667      0.161525
6                     RDAadapt       2.304348  ...       3.833333      5.877636
7                 LEA snfm/fst       2.909091  ...       2.895833      2.988075
8        LEA adjusted p-values       0.386364  ...       0.354167      0.659031
9                  LEA lfmm/BH       2.590909  ...       1.770833      2.735789
10            LEA lfmm/non_adj       0.681818  ...       0.291667      1.269958
11                   LEA lfmm2       8.9

  from ipykernel import kernelapp as app
