**How to perform a keyword search through a text file in Python** 

In [1]:
%%capture

!pip install pandas
!pip install numpy

In [2]:
import pandas as pd
import numpy as np 

In [13]:
def keyword_search(file_name, search_values, indicator_name):
    '''Scan a "keyword:value" text file and tabulate the values per run.

    The file is read line by line. Each line is split on ':' and the text
    before the first ':' is compared with the requested keywords. A line
    whose keyword is ``indicator_name`` opens a new block (one run); every
    other matching line contributes an integer value to the block that is
    currently open. Lines whose keyword is not in ``search_values`` are
    ignored.

    Parameters
    ----------
        file_name : string
            name of text file with outputs in it
        search_values : list
            names of methods we are searching for in the file
            one of the names must be the indicator of the block
        indicator_name : string
            name of the keyword that denotes a block in the text file
            in JP's files, this is "Info"
            we use it to determine which information is part of which run

    Returns
    ----------
        tuple of two Pandas dataframes
            first : one row per block and one column per entry of
                search_values; a method that does not appear in a block
                is NaN there (if it appears more than once in a block,
                the first value is kept)
            second : one row per method (the indicator is excluded) with
                columns 'Method', 'Nan_Mean' (mean ignoring NaN) and
                'Failure_Count' (number of blocks where the method is NaN)

    Raises
    ----------
        ValueError
            if indicator_name is not one of search_values, or if the text
            after the ':' on a method line is not an integer
        IndexError
            if a method line appears before the first indicator line, or
            a matching keyword line contains no ':'
    '''
    if indicator_name not in search_values:
        raise ValueError(
            'indicator_name must be one of search_values: %r' % (indicator_name,)
        )

    wanted = set(search_values)

    # One dict per block, in file order: {keyword: value}.
    runs = []
    with open(file_name) as file:
        for line in file:
            fields = line.rstrip('\n').split(':')
            keyword = fields[0]
            if keyword not in wanted:
                continue

            if keyword == indicator_name:
                # The indicator value is kept as raw text (no stripping).
                runs.append({indicator_name: fields[1]})
            else:
                # setdefault keeps the first value if a method repeats
                # within the same block.
                runs[-1].setdefault(keyword, int(fields[1]))

    # Column order follows search_values; missing methods become NaN.
    store_df = pd.DataFrame(
        {op: [run.get(op, np.nan) for run in runs] for op in search_values}
    )

    methods = [name for name in store_df.columns if name != indicator_name]
    join_mean_failure = pd.DataFrame({
        'Method': methods,
        # nanmean of an all-NaN column is NaN (numpy emits a RuntimeWarning).
        'Nan_Mean': [np.nanmean(store_df[name].to_numpy(dtype=float))
                     for name in methods],
        'Failure_Count': [store_df[name].isna().sum() for name in methods],
    })

    return store_df, join_mean_failure


In [4]:
# Mount Google Drive inside the Colab runtime so the data files stored
# there can be read from /content/drive. This cell only works on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Make the Drive folder that holds the data the working directory, so a
# relative file name is resolved inside that folder.
%cd /content/drive/MyDrive/jameslab/

# Same folder path kept as a string -- presumably for building file paths
# later in the notebook; confirm it is still needed.
dirname = '/content/drive/MyDrive/jameslab/'

/content/drive/MyDrive/jameslab


In [14]:
# Keywords to pull out of the file; 'Info' is the one that opens each block.
keyword_lookup = [
    'Info',
    'PCADAPT',
    'PCADAPT q-values',
    'PCADAPT Benjamini-Hochberg',
    'PCADAPT Bonferroni',
    'PCADAPT componentwise',
    'OutFLANK qvalues',
    'RDAadapt',
    'LEA snfm/fst',
    'LEA adjusted p-values',
    'LEA lfmm/BH',
    'LEA lfmm/non_adj',
    'LEA lfmm2',
]
target_file = 'CONCAT_outliers.txt'

# Bare last expression so the notebook displays the returned pair of frames.
keyword_search(target_file, keyword_lookup, 'Info')

(            Info  PCADAPT  ...  LEA lfmm/non_adj  LEA lfmm2
 0    g101_s10332        9  ...               2.0       12.0
 1    g103_s10332        5  ...               0.0        8.0
 2    g105_s10332        6  ...               0.0        4.0
 3    g107_s10332        8  ...               1.0        0.0
 4    g109_s10332       10  ...               0.0        6.0
 5    g111_s10332       12  ...               0.0        6.0
 6     g13_s10332       10  ...               NaN        NaN
 7     g15_s10332       12  ...               0.0       11.0
 8     g17_s10332       13  ...               NaN        NaN
 9     g19_s10332        9  ...               NaN        NaN
 10    g21_s10332        7  ...               6.0       14.0
 11    g23_s10332       13  ...               0.0        6.0
 12    g25_s10332       17  ...               NaN        NaN
 13    g27_s10332        6  ...               NaN        NaN
 14    g29_s10332       12  ...               NaN        NaN
 15    g31_s10332       