This notebook computes the Median of Means estimate of the number of clusters K (MedMeanK), following Puechmaille (2016).



In [None]:
import csv
import re
import pandas as pd
import numpy as np
import glob

In [None]:
def import_df(filename, threshold=0.5):
    """Summarise one clustering result file as a one-row DataFrame.

    The file is scanned for a line starting with
    ``"Inferred ancestry of individuals"``; the line after it is treated as
    the table header and skipped, and every following non-blank line is
    parsed as one individual.  (The layout matches STRUCTURE ``*_f`` output
    with individual labels and population ids -- TODO confirm for other
    output options.)

    For every cluster column the membership coefficients are averaged within
    each predefined population, and the largest of those population means is
    kept.  A cluster counts towards ``K_Actual`` when that largest mean
    exceeds ``threshold``.

    Parameters
    ----------
    filename : str
        Path of the result file to parse.
    threshold : float, optional
        Minimum mean membership a cluster must exceed in at least one
        population to be counted.  Defaults to 0.5.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``pop0 .. pop<K-1>`` (largest population
        mean per cluster), ``K_tested`` (number of cluster columns with a
        value), ``K_Actual`` (number of clusters above ``threshold``) and
        ``Filename``.

    Raises
    ------
    ValueError
        If no individual rows are found in the file.
    """
    rows = []
    with open(filename) as data:
        # Advance to the ancestry table, then skip its column-header line.
        # The default on next() avoids a stray StopIteration when the title
        # happens to be the last line of the file.
        for line in data:
            if line.startswith("Inferred ancestry of individuals"):
                next(data, None)
                break

        # Number of cluster columns; worked out from the first data row.
        k = -1
        for line in data:
            # A blank line marks the end of the table.
            if not line.strip():
                break
            words = line.split()
            if k < 0:
                # Everything after the ':' separator is a membership value.
                k = len(words) - words.index(':') - 1

            d = dict(ind=words[1], pop=int(words[3]))
            for i in range(k):
                d['pop' + str(i)] = float(words[5 + i])
            rows.append(d)

    if not rows:
        raise ValueError(
            "No 'Inferred ancestry of individuals' table found in "
            + repr(filename)
        )

    # One row per individual: label, population id, one column per cluster.
    df = pd.DataFrame(rows)
    cluster_cols = ['pop' + str(i) for i in range(k)]

    # Mean membership per population.  Selecting the cluster columns keeps
    # the string-valued 'ind' column out of the mean, which newer pandas
    # versions refuse to silently drop.
    df1 = df.groupby('pop')[cluster_cols].mean()

    # Largest population mean for every cluster.
    Kmeanmax = df1.max(axis=0)

    # Single-row frame built without DataFrame.append (removed in pandas 2.0).
    df2 = Kmeanmax.to_frame().T.reset_index(drop=True)

    # Number of clusters tested in this file.
    df2['K_tested'] = df2[cluster_cols].count(axis=1)

    # Number of clusters whose largest population mean exceeds the threshold.
    df2['K_Actual'] = (df2[cluster_cols].values > threshold).sum(axis=1)

    df2['Filename'] = filename
    return df2

In [None]:
# Parse every result file in the results folder into its one-row summary
# frame; the frames are combined into one table in a later step.
allres = [
    import_df(ff)
    for ff in glob.glob("../data/Results.copy.NoK1/*_f")
]

In [None]:
# Every result file has been reduced to a one-row summary (its clusters tested
# against the membership threshold), so stack those rows into a single table.
ar_concat = pd.concat(allres)

# Order the rows by the number of clusters each file tested.
# NOTE(review): files with different K contribute different sets of 'popN'
# columns. The original author observed pd.concat ordering those columns
# lexicographically (e.g. pop10 before pop2) rather than numerically, which
# makes inspecting the per-cluster proportions for each K non-trivial.
# Whether this still happens depends on the pandas version -- confirm before
# relying on column order. It does not affect the estimate computed below.
ar_concat = ar_concat.sort_values(by='K_tested')

# For each tested K, take the median of 'K_Actual' over all files that tested
# that K; this is the K estimate for that tested value.
groupby_K_tested = ar_concat.groupby('K_tested')['K_Actual'].median()

# The overall estimate is the median of those per-K medians.
MedMeanK = groupby_K_tested.median()


# Output for the user -- Median of Means K estimate
MedMeanK