In [1]:
import pandas as pd
import numpy
import itertools
import csv
import glob

# Test that glob is working

The following is just a check step to make a list of file names we're going to iterate through using glob.
The `stats.txt` files are the ones we're interested in.

In [3]:
# Sanity check: list the Rana stats files glob can see from the current
# working directory before running the extraction cells below.
ranafilenames = glob.glob('sampling_depth/ranaddrad*outfiles/*stats.txt')
# Single-argument print() behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(ranafilenames)

['ddrad/sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_4_s1_demultiplex_stats.txt', 'ddrad/sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_4_s2_rawedit_stats.txt', 'ddrad/sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_stats.txt', 'ddrad/sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_4_s5_consens_stats.txt', 'ddrad/sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_4_s6_cluster_stats.txt', 'ddrad/sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_4_s3_cluster_stats.txt', 'ddrad/sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_stats.txt', 'ddrad/sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_4_s5_consens_stats.txt', 'ddrad/sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_4_s1_demultiplex_stats.txt', 'ddrad/sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_4_s6_cluster_stats.txt', 'ddrad/sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_4_s3_cluster_stats.txt', 'ddrad/sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_4_s2_rawedit_stats.txt', 'ddrad/sa

# Define directory

Set the glob pattern that selects the `*stats.txt` files to process. Each output CSV is written next to the stats file it was extracted from.

In [14]:
# Glob pattern (despite the name, not a plain directory) selecting the stats
# files to process; it is read by main() in the extraction cells below.
# Keep exactly one of the two assignments active.
#directory='sampling_depth/*outfiles/*stats.txt' # for sampling depths t1, t2, t3, total
directory='clust_threshold/*outfiles/*stats.txt' # for clustering threshold iterations 80-95

# SNP distributions

The following pulls the distribution of variable and parsimony-informative sites (as ipyrad calculates them) out of each stats file and writes a `_snpdist.csv` file next to it.

In [15]:
def get_file_list(directory):
    """Return the list of paths matching the glob pattern ``directory``.

    The argument is passed straight to ``glob.glob``, so it may contain
    wildcards. The result is in whatever order glob yields.
    """
    matched_paths = glob.glob(directory)
    return matched_paths

def get_line_numbers(filename):
    """Find the line numbers that bracket the SNP-distribution section.

    Scans ``filename`` once and remembers the 1-based number of the last line
    containing ``'var  sum_var'`` and of the last line containing
    ``'## Final'``. A line is tested for the first marker before the second.

    Returns:
        list: ``[varline, endline]``.

    Raises:
        ValueError: if either marker never occurs in the file (previously
            this surfaced as an UnboundLocalError at the return statement).
    """
    varline = None
    endline = None
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(filename, 'rt') as infile:
        for counter, line in enumerate(infile, 1):
            if 'var  sum_var' in line:
                varline = counter
            elif '## Final' in line:
                endline = counter
    if varline is None or endline is None:
        raise ValueError(
            "%s: could not find both the 'var  sum_var' and '## Final' markers"
            % filename
        )
    return [varline, endline]

def slice_data(start,end,filename):
    """Cut the SNP-distribution rows out of a stats file.

    ``start`` and ``end`` are used as ``itertools.islice`` bounds on the file's
    lines: reading begins at 0-based line ``start`` and stops before 0-based
    line ``end - 3``. When they are the 1-based numbers returned by
    get_line_numbers, this starts on the line right after the first marker
    and leaves out the end-marker line plus the two lines above it.

    Returns:
        list: one list per line holding its whitespace-separated tokens (as
        strings) followed by ``filename``.
    """
    snp_dist = []
    # The 'U' flag is dropped from the mode: Python 3 rejects it from 3.11 on,
    # and strip() below discards any trailing '\r' regardless.
    # `with` closes the handle even if reading fails.
    with open(filename, 'r') as infile:
        for raw_line in itertools.islice(infile, start, end - 3):
            fields = raw_line.strip().split()
            fields.append(filename)  # tag every row with its source file
            snp_dist.append(fields)
    return snp_dist
        
def pd_conversion(filename):
    """Build the SNP-distribution DataFrame for one stats file.

    Looks up the section boundaries with get_line_numbers, extracts the rows
    with slice_data, and labels the resulting columns so the table can be
    written out and read into R.
    """
    first, last = get_line_numbers(filename)
    rows = slice_data(first, last, filename)
    column_names = ['number','variable','sum_var','pis', 'sum_pis', 'file_name']
    return pd.DataFrame.from_records(rows, columns=column_names)

def main():
    """Extract the SNP distribution from every stats file matched by ``directory``.

    For each path matched by the module-level ``directory`` glob pattern, the
    path is printed, converted with pd_conversion, and the table is written to
    ``<path>_snpdist.csv``.

    Returns:
        list: the DataFrames, in the order the files were processed.
    """
    dfs = []
    for filename in get_file_list(directory):
        # Single-argument print() is valid on both Python 2 and Python 3.
        print(filename)
        pd_df = pd_conversion(filename)
        dfs.append(pd_df)
        pd_df.to_csv(filename+"_snpdist.csv")
    return dfs

# Run the SNP-distribution extraction and show the resulting DataFrames.
# print(...) with one argument works on both Python 2 and Python 3.
print(main())

ddrad/clust_threshold/ranaddrad_clust_81_outfiles/ranaddrad_clust_81_stats.txt
ddrad/clust_threshold/ranaddrad_clust_80_outfiles/ranaddrad_clust_80_stats.txt
ddrad/clust_threshold/epiddrad_clust_86_outfiles/epiddrad_clust_86_stats.txt
ddrad/clust_threshold/epiddrad_clust_87_outfiles/epiddrad_clust_87_stats.txt
ddrad/clust_threshold/epiddrad_clust_91_outfiles/epiddrad_clust_91_stats.txt
ddrad/clust_threshold/epiddrad_clust_90_outfiles/epiddrad_clust_90_stats.txt
ddrad/clust_threshold/ranaddrad_clust_91_outfiles/ranaddrad_clust_91_stats.txt
ddrad/clust_threshold/ranaddrad_clust_90_outfiles/ranaddrad_clust_90_stats.txt
ddrad/clust_threshold/ranaddrad_clust_86_outfiles/ranaddrad_clust_86_stats.txt
ddrad/clust_threshold/ranaddrad_clust_87_outfiles/ranaddrad_clust_87_stats.txt
ddrad/clust_threshold/epiddrad_clust_81_outfiles/epiddrad_clust_81_stats.txt
ddrad/clust_threshold/epiddrad_clust_80_outfiles/epiddrad_clust_80_stats.txt
ddrad/clust_threshold/epiddrad_clust_92_outfiles/epiddrad_clust_

# Summary statistics

The following pulls the number of clusters, consensus reads, and loci in the assembly for each sample, and writes a `_sumstats.csv` file next to each stats file.

In [16]:
def get_file_list(directory):
    """Expand the glob pattern ``directory`` into a list of matching paths."""
    return list(glob.glob(directory))

def get_line_numbers(filename):
    """Find where the per-sample summary table starts in a stats file.

    Scans ``filename`` once and remembers the 1-based number of the last line
    containing ``'state  reads_raw'``.

    Returns:
        list: ``[varline, varline]`` -- the same number twice, so the result
        has the two-element shape that slice_data's ``start``/``end`` expect.

    Raises:
        ValueError: if the marker never occurs in the file (previously this
            surfaced as an UnboundLocalError at the return statement).
    """
    varline = None
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(filename, 'rt') as infile:
        for counter, line in enumerate(infile, 1):
            if 'state  reads_raw' in line:
                varline = counter
    if varline is None:
        raise ValueError(
            "%s: could not find the 'state  reads_raw' marker" % filename
        )
    return [varline, varline]

def slice_data(start, end, filename, n_rows=12):
    """Cut the per-sample summary rows out of a stats file.

    ``start`` and ``end`` are used as ``itertools.islice`` bounds on the file's
    lines: reading begins at 0-based line ``start`` and stops before 0-based
    line ``end + n_rows``. Each extracted row is printed as it is read.

    Args:
        start: 0-based index of the first line to read.
        end: base for the stop index.
        filename: path of the stats file; also appended to every row.
        n_rows: how many lines past ``end`` to read. Defaults to 12, the value
            that was previously hard-coded.
            NOTE(review): 12 presumably equals the number of samples per
            assembly -- confirm it holds for every dataset processed.

    Returns:
        list: one list per line holding its whitespace-separated tokens (as
        strings) followed by ``filename``.
    """
    sum_stats = []
    # The 'U' flag is dropped from the mode: Python 3 rejects it from 3.11 on,
    # and strip() below discards any trailing '\r' regardless.
    # `with` closes the handle even if reading fails.
    with open(filename, 'r') as infile:
        for raw_line in itertools.islice(infile, start, end + n_rows):
            fields = raw_line.strip().split()
            print(fields)  # single-argument print(): same on Python 2 and 3
            fields.append(filename)  # tag every row with its source file
            sum_stats.append(fields)
    return sum_stats
        

def pd_conversion(filename):
    """Build the per-sample summary-statistics DataFrame for one stats file.

    Looks up the table position with get_line_numbers, extracts the rows with
    slice_data, and labels the resulting columns.
    """
    first, last = get_line_numbers(filename)
    rows = slice_data(first, last, filename)
    column_names = ['sample', 'state', 'reads_raw', 'reads_passed', 'clust_total', 'clust_hidepth','hetero_est','error_est','reads_consens','loci_assembly','file_name']
    return pd.DataFrame.from_records(rows, columns=column_names)

def main():
    """Extract the summary statistics from every stats file matched by ``directory``.

    For each path matched by the module-level ``directory`` glob pattern, the
    path is printed, converted with pd_conversion, and the table is written to
    ``./<path>_sumstats.csv``.

    Returns:
        list: the DataFrames, in the order the files were processed.
    """
    dfs = []
    for filename in get_file_list(directory):
        # Single-argument print() is valid on both Python 2 and Python 3.
        print(filename)
        pd_df = pd_conversion(filename)
        dfs.append(pd_df)
        pd_df.to_csv("./" + filename + "_sumstats.csv")
    return dfs

# Run the summary-statistics extraction and show the resulting DataFrames.
# print(...) with one argument works on both Python 2 and Python 3.
print(main())

ddrad/clust_threshold/ranaddrad_clust_81_outfiles/ranaddrad_clust_81_stats.txt
['Rber_T1113a', '7', '8159746', '8156623', '549972', '137437', '0.031612', '0.005291', '85208', '36505']
['Rber_T1113b', '7', '6091513', '6089311', '476295', '114232', '0.028926', '0.005986', '71894', '34284']
['Rber_T1114', '7', '3232832', '3231715', '256756', '74113', '0.026591', '0.005631', '50399', '28490']
['Rbla_D2864', '7', '6239552', '6237161', '360870', '95529', '0.026875', '0.004498', '66441', '27226']
['Rbla_D2865', '7', '4801774', '4799939', '313133', '87195', '0.025336', '0.004694', '61021', '26234']
['Rchi_T2034a', '7', '4598334', '4596360', '379772', '87642', '0.028708', '0.006277', '56492', '18118']
['Rchi_T2034b', '7', '5507763', '5505276', '417392', '98421', '0.028500', '0.006200', '64036', '19389']
['Rchi_T2049', '7', '4418342', '4416408', '370221', '85688', '0.028207', '0.006148', '55569', '17965']
['Rneo_T480', '7', '6657867', '6655296', '513846', '119397', '0.028702', '0.005041', '78488

['Rchi_T2034a', '7', '4598334', '4596360', '536519', '115983', '0.012805', '0.004117', '81986', '23740']
['Rchi_T2034b', '7', '5507763', '5505276', '591567', '130989', '0.012760', '0.003980', '93336', '25406']
['Rchi_T2049', '7', '4418342', '4416408', '520647', '113175', '0.012621', '0.004130', '80349', '23533']
['Rneo_T480', '7', '6657867', '6655296', '722831', '157618', '0.013120', '0.003240', '113295', '50410']
['Rneo_T527', '7', '6653066', '6650421', '605126', '143238', '0.013061', '0.003019', '106013', '51033']
['Rsph_T25870', '7', '6761860', '6759059', '820907', '182201', '0.015218', '0.003336', '125993', '38178']
['Rsph_T26064', '7', '6801065', '6798121', '843798', '186490', '0.014984', '0.003384', '129220', '38113']
ddrad/clust_threshold/ranaddrad_clust_90_outfiles/ranaddrad_clust_90_stats.txt
['Rber_T1113a', '7', '8159746', '8156623', '770163', '180101', '0.018089', '0.003873', '119598', '52104']
['Rber_T1113b', '7', '6091513', '6089311', '660487', '148747', '0.017891', '0.004

# Coverage information

The following pulls the number of loci for which N taxa have data, and writes a `_coverage.csv` file next to each stats file.

In [17]:
def get_file_list(directory):
    """Collect every path that matches ``directory``, which is treated as a glob pattern."""
    pattern = directory
    stats_files = glob.glob(pattern)
    return stats_files

def get_line_numbers(filename):
    """Find the line numbers that bracket the locus-coverage section.

    Scans ``filename`` once and remembers the 1-based number of the last line
    containing ``'locus_coverage'`` and of the last line containing
    ``'## The distribution'``. A line is tested for the first marker before
    the second. Each time the end marker is seen, the file name and line
    number are printed.

    Returns:
        list: ``[varline, endline]``.

    Raises:
        ValueError: if either marker never occurs in the file (previously
            this surfaced as an UnboundLocalError at the return statement).
    """
    varline = None
    endline = None
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(filename, 'rt') as infile:
        for counter, line in enumerate(infile, 1):
            if 'locus_coverage' in line:
                varline = counter
            elif '## The distribution' in line:
                endline = counter
                # Formats to the same "name number" text as the Python 2
                # statement `print filename,endline`, and runs on Python 3.
                print("%s %s" % (filename, endline))
    if varline is None or endline is None:
        raise ValueError(
            "%s: could not find both the 'locus_coverage' and "
            "'## The distribution' markers" % filename
        )
    return [varline, endline]

def slice_data(start,end,filename):
    """Cut the locus-coverage rows out of a stats file.

    ``start`` and ``end`` are used as ``itertools.islice`` bounds on the file's
    lines: reading begins at 0-based line ``start`` and stops before 0-based
    line ``end - 3``. When they are the 1-based numbers returned by
    get_line_numbers, this starts on the line right after the first marker
    and leaves out the end-marker line plus the two lines above it.
    Each extracted row is printed as it is read.

    Returns:
        list: one list per line holding its whitespace-separated tokens (as
        strings) followed by ``filename``.
    """
    coverage = []
    # The 'U' flag is dropped from the mode: Python 3 rejects it from 3.11 on,
    # and strip() below discards any trailing '\r' regardless.
    # `with` closes the handle even if reading fails.
    with open(filename, 'r') as infile:
        for raw_line in itertools.islice(infile, start, end - 3):
            fields = raw_line.strip().split()
            print(fields)  # single-argument print(): same on Python 2 and 3
            fields.append(filename)  # tag every row with its source file
            coverage.append(fields)
    return coverage

def pd_conversion(filename):
    """Build the locus-coverage DataFrame for one stats file.

    Looks up the section boundaries with get_line_numbers, extracts the rows
    with slice_data, and labels the resulting columns.
    """
    first, last = get_line_numbers(filename)
    rows = slice_data(first, last, filename)
    column_names = ['number','locus_coverage', 'sum_coverage','file_name']
    return pd.DataFrame.from_records(rows, columns=column_names)

def main():
    """Extract the locus coverage from every stats file matched by ``directory``.

    For each path matched by the module-level ``directory`` glob pattern, the
    path is printed, converted with pd_conversion, and the table is written to
    ``./<path>_coverage.csv``.

    Returns:
        list: the DataFrames, in the order the files were processed.
    """
    dfs = []
    for filename in get_file_list(directory):
        # Single-argument print() is valid on both Python 2 and Python 3.
        print(filename)
        pd_df = pd_conversion(filename)
        dfs.append(pd_df)
        pd_df.to_csv("./" + filename + "_coverage.csv")
    return dfs

# Run the coverage extraction and show the resulting DataFrames.
# print(...) with one argument works on both Python 2 and Python 3.
print(main())

ddrad/clust_threshold/ranaddrad_clust_81_outfiles/ranaddrad_clust_81_stats.txt
ddrad/clust_threshold/ranaddrad_clust_81_outfiles/ranaddrad_clust_81_stats.txt 53
['1', '0', '0']
['2', '0', '0']
['3', '0', '0']
['4', '13697', '13697']
['5', '10901', '24598']
['6', '6072', '30670']
['7', '6346', '37016']
['8', '4168', '41184']
['9', '4324', '45508']
['10', '2343', '47851']
['11', '1583', '49434']
['12', '2377', '51811']
ddrad/clust_threshold/ranaddrad_clust_80_outfiles/ranaddrad_clust_80_stats.txt
ddrad/clust_threshold/ranaddrad_clust_80_outfiles/ranaddrad_clust_80_stats.txt 53
['1', '0', '0']
['2', '0', '0']
['3', '0', '0']
['4', '12954', '12954']
['5', '10325', '23279']
['6', '5785', '29064']
['7', '5974', '35038']
['8', '3955', '38993']
['9', '4145', '43138']
['10', '2253', '45391']
['11', '1514', '46905']
['12', '2299', '49204']
ddrad/clust_threshold/epiddrad_clust_86_outfiles/epiddrad_clust_86_stats.txt
ddrad/clust_threshold/epiddrad_clust_86_outfiles/epiddrad_clust_86_stats.txt 53
[

# Merge dataframes into a single file

All above code will produce separate files for each of the sampling depths (or each of the clustering thresholds). We want these all to be merged into a single file for subsequent analysis.

## Sampling depths

### Summary statistics

In [11]:
# For Epipedobates
df1 = pd.read_csv("./sampling_depth/epiddrad_t1_4_outfiles/epiddrad_t1_stats.txt_sumstats.csv")
df2 = pd.read_csv("./sampling_depth/epiddrad_t2_4_outfiles/epiddrad_t2_stats.txt_sumstats.csv")
df3 = pd.read_csv("./sampling_depth/epiddrad_t3_4_outfiles/epiddrad_t3_stats.txt_sumstats.csv")
df4 = pd.read_csv("./sampling_depth/epiddrad_total_4_outfiles/epiddrad_clust_91_stats.txt_sumstats.csv")

# For Rana
df5 = pd.read_csv("./sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_stats.txt_sumstats.csv")
df6 = pd.read_csv("./sampling_depth/ranaddrad_t2_4_outfiles/ranaddrad_t2_stats.txt_sumstats.csv")
df7 = pd.read_csv("./sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_stats.txt_sumstats.csv")
df8 = pd.read_csv("./sampling_depth/ranaddrad_total_4_outfiles/ranaddrad_clust_91_stats.txt_sumstats.csv")

# Concatenate everything together into single file
sumstats_concat = df1.append([df2,df3,df4,df5,df6,df7,df8])
sumstats_concat.to_csv("samplingdepth_ddrad_sumstats.csv")

### SNP distributions

In [10]:
# For Epipedobates
df1 = pd.read_csv("./sampling_depth/epiddrad_t1_4_outfiles/epiddrad_t1_stats.txt_snpdist.csv")
df2 = pd.read_csv("./sampling_depth/epiddrad_t2_4_outfiles/epiddrad_t2_stats.txt_snpdist.csv")
df3 = pd.read_csv("./sampling_depth/epiddrad_t3_4_outfiles/epiddrad_t3_stats.txt_snpdist.csv")
df4 = pd.read_csv("./sampling_depth/epiddrad_total_4_outfiles/epiddrad_clust_91_stats.txt_snpdist.csv")

# For Rana
df5 = pd.read_csv("./sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_stats.txt_snpdist.csv")
df6 = pd.read_csv("./sampling_depth/ranaddrad_t2_4_outfiles/ranaddrad_t2_stats.txt_snpdist.csv")
df7 = pd.read_csv("./sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_stats.txt_snpdist.csv")
df8 = pd.read_csv("./sampling_depth/ranaddrad_total_4_outfiles/ranaddrad_clust_91_stats.txt_snpdist.csv")

# Concatenate everything together into single file
sumstats_concat = df1.append([df2,df3,df4,df5,df6,df7,df8])
sumstats_concat.to_csv("samplingdepth_ddrad_snpdist.csv")

### Coverage

In [12]:
# For Epipedobates
df1 = pd.read_csv("./sampling_depth/epiddrad_t1_4_outfiles/epiddrad_t1_stats.txt_coverage.csv")
df2 = pd.read_csv("./sampling_depth/epiddrad_t2_4_outfiles/epiddrad_t2_stats.txt_coverage.csv")
df3 = pd.read_csv("./sampling_depth/epiddrad_t3_4_outfiles/epiddrad_t3_stats.txt_coverage.csv")
df4 = pd.read_csv("./sampling_depth/epiddrad_total_4_outfiles/epiddrad_clust_91_stats.txt_coverage.csv")

# For Rana
df5 = pd.read_csv("./sampling_depth/ranaddrad_t1_4_outfiles/ranaddrad_t1_stats.txt_coverage.csv")
df6 = pd.read_csv("./sampling_depth/ranaddrad_t2_4_outfiles/ranaddrad_t2_stats.txt_coverage.csv")
df7 = pd.read_csv("./sampling_depth/ranaddrad_t3_4_outfiles/ranaddrad_t3_stats.txt_coverage.csv")
df8 = pd.read_csv("./sampling_depth/ranaddrad_total_4_outfiles/ranaddrad_clust_91_stats.txt_coverage.csv")

# Concatenate everything together into single file
sumstats_concat = df1.append([df2,df3,df4,df5,df6,df7,df8])
sumstats_concat.to_csv("samplingdepth_ddrad_coverage.csv")

## Clustering thresholds

### Summary statistics

In [19]:
# For Epipedobates
df1 = pd.read_csv("./clust_threshold/epiddrad_clust_80_outfiles/epiddrad_clust_80_stats.txt_sumstats.csv")
df2 = pd.read_csv("./clust_threshold/epiddrad_clust_81_outfiles/epiddrad_clust_81_stats.txt_sumstats.csv")
df3 = pd.read_csv("./clust_threshold/epiddrad_clust_82_outfiles/epiddrad_clust_82_stats.txt_sumstats.csv")
df4 = pd.read_csv("./clust_threshold/epiddrad_clust_83_outfiles/epiddrad_clust_83_stats.txt_sumstats.csv")
df5 = pd.read_csv("./clust_threshold/epiddrad_clust_84_outfiles/epiddrad_clust_84_stats.txt_sumstats.csv")
df6 = pd.read_csv("./clust_threshold/epiddrad_clust_85_outfiles/epiddrad_clust_85_stats.txt_sumstats.csv")
df7 = pd.read_csv("./clust_threshold/epiddrad_clust_86_outfiles/epiddrad_clust_86_stats.txt_sumstats.csv")
df8 = pd.read_csv("./clust_threshold/epiddrad_clust_87_outfiles/epiddrad_clust_87_stats.txt_sumstats.csv")
df9 = pd.read_csv("./clust_threshold/epiddrad_clust_88_outfiles/epiddrad_clust_88_stats.txt_sumstats.csv")
df10 = pd.read_csv("./clust_threshold/epiddrad_clust_89_outfiles/epiddrad_clust_89_stats.txt_sumstats.csv")
df11 = pd.read_csv("./clust_threshold/epiddrad_clust_90_outfiles/epiddrad_clust_90_stats.txt_sumstats.csv")
df12 = pd.read_csv("./clust_threshold/epiddrad_clust_91_outfiles/epiddrad_clust_91_stats.txt_sumstats.csv")
df13 = pd.read_csv("./clust_threshold/epiddrad_clust_92_outfiles/epiddrad_clust_92_stats.txt_sumstats.csv")
df14 = pd.read_csv("./clust_threshold/epiddrad_clust_93_outfiles/epiddrad_clust_93_stats.txt_sumstats.csv")
df15 = pd.read_csv("./clust_threshold/epiddrad_clust_94_outfiles/epiddrad_clust_94_stats.txt_sumstats.csv")
df16 = pd.read_csv("./clust_threshold/epiddrad_clust_95_outfiles/epiddrad_clust_95_stats.txt_sumstats.csv")

# For Rana
df17 = pd.read_csv("./clust_threshold/ranaddrad_clust_80_outfiles/ranaddrad_clust_80_stats.txt_sumstats.csv")
df18 = pd.read_csv("./clust_threshold/ranaddrad_clust_81_outfiles/ranaddrad_clust_81_stats.txt_sumstats.csv")
df19 = pd.read_csv("./clust_threshold/ranaddrad_clust_82_outfiles/ranaddrad_clust_82_stats.txt_sumstats.csv")
df20 = pd.read_csv("./clust_threshold/ranaddrad_clust_83_outfiles/ranaddrad_clust_83_stats.txt_sumstats.csv")
df21 = pd.read_csv("./clust_threshold/ranaddrad_clust_84_outfiles/ranaddrad_clust_84_stats.txt_sumstats.csv")
df22 = pd.read_csv("./clust_threshold/ranaddrad_clust_85_outfiles/ranaddrad_clust_85_stats.txt_sumstats.csv")
df23 = pd.read_csv("./clust_threshold/ranaddrad_clust_86_outfiles/ranaddrad_clust_86_stats.txt_sumstats.csv")
df24 = pd.read_csv("./clust_threshold/ranaddrad_clust_87_outfiles/ranaddrad_clust_87_stats.txt_sumstats.csv")
df25 = pd.read_csv("./clust_threshold/ranaddrad_clust_88_outfiles/ranaddrad_clust_88_stats.txt_sumstats.csv")
df26 = pd.read_csv("./clust_threshold/ranaddrad_clust_89_outfiles/ranaddrad_clust_89_stats.txt_sumstats.csv")
df27 = pd.read_csv("./clust_threshold/ranaddrad_clust_90_outfiles/ranaddrad_clust_90_stats.txt_sumstats.csv")
df28 = pd.read_csv("./clust_threshold/ranaddrad_clust_91_outfiles/ranaddrad_clust_91_stats.txt_sumstats.csv")
df29 = pd.read_csv("./clust_threshold/ranaddrad_clust_92_outfiles/ranaddrad_clust_92_stats.txt_sumstats.csv")
df30 = pd.read_csv("./clust_threshold/ranaddrad_clust_93_outfiles/ranaddrad_clust_93_stats.txt_sumstats.csv")
df31 = pd.read_csv("./clust_threshold/ranaddrad_clust_94_outfiles/ranaddrad_clust_94_stats.txt_sumstats.csv")
df32 = pd.read_csv("./clust_threshold/ranaddrad_clust_95_outfiles/ranaddrad_clust_95_stats.txt_sumstats.csv")

sumstats_concat = df1.append([df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13,df14,df15,df16,
                             df17,df18,df19,df20,df21,df22,df23,df24,df25,df26,df27,df28,df29,df30,df31,df32])
sumstats_concat.to_csv("clustthreshold_ddrad_sumstats.csv")

### SNP distributions

In [21]:
# Per-run SNP-distribution CSVs for clustering thresholds 80-95:
# all Epipedobates (epiddrad) runs first, then all Rana (ranaddrad) runs,
# each in increasing threshold order.
snpdist_files = [
    "./clust_threshold/%s_clust_%d_outfiles/%s_clust_%d_stats.txt_snpdist.csv"
    % (dataset, threshold, dataset, threshold)
    for dataset in ("epiddrad", "ranaddrad")
    for threshold in range(80, 96)  # 96 is exclusive, so this covers 80..95
]

# pd.concat stacks the frames exactly as DataFrame.append did (each frame
# keeps its own row index); DataFrame.append was removed in pandas 2.0.
snpdist_concat = pd.concat([pd.read_csv(path) for path in snpdist_files])
snpdist_concat.to_csv("clustthreshold_ddrad_snpdist.csv")

### Coverage

In [20]:
# Per-run coverage CSVs for clustering thresholds 80-95:
# all Epipedobates (epiddrad) runs first, then all Rana (ranaddrad) runs,
# each in increasing threshold order.
coverage_files = [
    "./clust_threshold/%s_clust_%d_outfiles/%s_clust_%d_stats.txt_coverage.csv"
    % (dataset, threshold, dataset, threshold)
    for dataset in ("epiddrad", "ranaddrad")
    for threshold in range(80, 96)  # 96 is exclusive, so this covers 80..95
]

# pd.concat stacks the frames exactly as DataFrame.append did (each frame
# keeps its own row index); DataFrame.append was removed in pandas 2.0.
coverage_concat = pd.concat([pd.read_csv(path) for path in coverage_files])
coverage_concat.to_csv("clustthreshold_ddrad_coverage.csv")