## Notebook 0: Sample information

In this notebook we load the master sample-information spreadsheet, which includes the barcodes, and write new barcode files that match each barcode sequence to the cleaned-up sample name that we want appended to each demultiplexed fastq file.

In [1]:
import pandas as pd

### Load full sample info table


In [2]:
# read the master sample-information sheet into a DataFrame
df = pd.read_csv(
    "../data_sample_info/mastersheetSamples_RADstrep_20191017.csv"
)

# list every column together with its non-null count and dtype
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 15 columns):
n                           106 non-null int64
sampleID_NickJ              106 non-null object
barcode                     106 non-null object
run                         106 non-null object
tip_label_ivalu             106 non-null object
species                     106 non-null object
4balanced                   106 non-null int64
4BPP                        106 non-null int64
sample_uid                  106 non-null object
HebrariumAccession_NickJ    106 non-null object
locality                    69 non-null object
whoseSample                 106 non-null object
collection_N                106 non-null object
sample_name_NickJ           106 non-null object
notes                       1 non-null object
dtypes: int64(3), object(12)
memory usage: 12.5+ KB


### Extract barcodes map file for library 1

In [3]:
# select the first library only
subdf = df[df.run == "run_1"].reset_index()

# strip the cutsite (TGCAG) from the end of the barcode sequence. The pattern
# is anchored with `$` so that only a trailing cutsite is removed; a TGCAG
# that happens to occur inside a barcode is left untouched. (The previous
# `str.rsplit("TGCAG", 1, expand=True)` call cut at the last TGCAG anywhere
# in the string and passed `n`/`expand` positionally, which recent pandas
# releases no longer accept.)
subdf.loc[:, "barcode"] = subdf.barcode.str.replace(r"TGCAG$", "", regex=True)

In [4]:
# sanity check: 66 samples are expected, every one with a 10bp barcode
barcode_lengths = subdf["barcode"].map(len)
barcode_lengths.describe()

count    66.0
mean     10.0
std       0.0
min      10.0
25%      10.0
50%      10.0
75%      10.0
max      10.0
Name: barcode, dtype: float64

In [9]:
# keep just the two columns that go into the barcodes file: name, then barcode
final = subdf[["tip_label_ivalu", "barcode"]]

# save as a tab-delimited table with neither a header row nor the index
final.to_csv(
    "../data_rad_and_barcodes/barcodes_UO_C601_1.csv",
    index=False,
    header=False,
    sep="\t",
)

# render the complete table (up to 150 rows) here for posterity
with pd.option_context("display.max_rows", 150):
    display(final)

Unnamed: 0,tip_label_ivalu,barcode
0,C_amp_barbarae_e020,TTGATGGTGC
1,C_anceps_nic053,CTTCGACATA
2,C_flavescens_e003,TCATAGGCTA
3,S_hesperidis_e043,ATCCGTCTAC
4,S_howellii_nic152,TAGTGCGGTC
5,S_tortuosus_e040,GCCGGTGATT
6,The_crispum_e084,TCACTGAGAA
7,C_amp_amplexicaulis_e068,CAACATGAAG
8,C_coulteri_e015,GCGTCCTGCC
9,C_hallii_e078,ATGAAGGCAG


### Extract barcodes map file for library 2

In [32]:
# select the second library only
subdf = df.loc[df["run"] == "run_2"].reset_index()

# no trimming step is needed for this run: the cutsite (TGCAG) has already
# been stripped from these barcode sequences

In [33]:
# sanity check: 40 samples are expected, with barcodes between 7 and 10bp long
barcode_lengths = subdf["barcode"].map(len)
barcode_lengths.describe()

count    40.000000
mean      8.725000
std       1.109111
min       7.000000
25%       8.000000
50%       9.000000
75%      10.000000
max      10.000000
Name: barcode, dtype: float64

In [34]:
# select the two columns for the barcodes file
final = subdf.loc[:, ["tip_label_ivalu", "barcode"]]

# write this data to a file, using the same layout as the library 1 barcodes
# file: tab-delimited, no header row and no index column. (Calling to_csv
# with its defaults would instead produce a comma-separated file with a
# header line and a leading index column.)
final.to_csv(
    "../data_rad_and_barcodes/barcodes_Undet_lib2.csv",
    sep="\t",
    header=False,
    index=False,
)

# show the whole thing here for posterity
with pd.option_context('display.max_rows', 150):
    display(final)

Unnamed: 0,tip_label_ivalu,barcode
0,S_glandulosus_nic202,GCCGTAGGT
1,C_californicus_NJ_3881,ACCATATAT
2,C_glaucus_Porter_15587,AGACGGTTC
3,C_major_NJ_4561,TCCGTCGAA
4,S_barbatus_NJ_4393,CTCGATT
5,S_bernardinus_NJ_4775,TCAGAAC
6,S_glandulosus_albidus_d108A,AGACGTTAGG
7,S_juneae_NJ_4772,ACCAGGCCG
8,Thly_aurea_Porter_15531,AACCATGGA
9,S_barbatus_NJ_4388,GCGTCCAGG
