# Parse Phenobooth data

Description

## Initial boilerplate

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find_dotenv() walks up the directory tree looking for a .env file and
# returns an empty string when none is found.
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# now you can get the variables using their names

# Check whether a network drive has been specified.
# NOTE(review): both branches are placeholders; nothing is mounted yet.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE == 'None':
    pass
else:
    pass
    #mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
# Project root directory. Without a .env file dirname('') would be '', which
# silently turns every path below into one relative to the working directory;
# fall back to the working directory explicitly so the paths stay absolute.
PROJ = dirname(dotenv_path) if dotenv_path else CURRENT_DIR

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = ''
if folder_name != '':
    # exist_ok=True creates the folder only when it is missing, without a
    # separate exists() check that could race with another process.
    for parent_dir in (RAW_EXTERNAL, INTERMEDIATE, FINAL):
        os.makedirs(join(parent_dir, folder_name), exist_ok=True)

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


## Load Data 

In [2]:
import pandas as pd

# Tab-separated colony measurements exported for all plates.
colony_file = join(RAW_INTERNAL, "Betax05.12.19_ColonyData_AllPlates.txt")
data = pd.read_csv(colony_file, sep="\t")
print(data)

       Run  Plate     Type Row  Col  Size  Circularity  Brightness  Redness  \
0        2     14  Control   A    1     0       0.0000    0.000000      0.0   
1        2     14  Control   A    2     0       0.0000    0.000000      0.0   
2        2     14  Control   A    3   528       0.9261   20.986111    -52.2   
3        2     14  Control   A    4   411       0.9294   19.626115    -48.9   
4        2     14  Control   A    5   509       0.9489   19.151277    -47.7   
...    ...    ...      ...  ..  ...   ...          ...         ...      ...   
39931    3     13  Control  AF   44     0       0.0000    0.000000      0.0   
39932    3     13  Control  AF   45   537       0.9572  137.873371      8.8   
39933    3     13  Control  AF   46   513       0.9532  130.765432     10.9   
39934    3     13  Control  AF   47   431       0.9629  129.105955     11.3   
39935    3     13  Control  AF   48   551       0.9673  129.872958     10.9   

       Avg. Red  Avg. Blue  Avg. Green  Multi  Gene

In [3]:
# Tab-separated table describing which gene sits at each plate position.
gene_ids_file = join(RAW_EXTERNAL, "geneIDs.txt")
gene_ids = pd.read_csv(gene_ids_file, sep="\t")
print(gene_ids)

      Plate # Row  Column        ORF       Gene  \
0           1   A       1      Blank      Blank   
1           1   A       2    YLL040C      VPS13   
2           1   A       3    YAL068C       PAU8   
3           1   A       4      Blank      Blank   
4           1   A       5    YAL067C       SEO1   
...       ...  ..     ...        ...        ...   
4987       13   P      20      Blank      Blank   
4988       13   P      21    YPR072W       NOT5   
4989       13   P      22      Blank      Blank   
4990       13   P      23  YOR008C-A  YOR008C-A   
4991       13   P      24      Blank      Blank   

                                             Decription  \
0                                                 Blank   
1     Protein of unknown function; heterooligomeric ...   
2     Protein of unknown function, member of the ser...   
3                                                 Blank   
4     Putative permease, member of the allantoate tr...   
...                              

## Parsing data location

In [4]:
# Row labels indexed by row number: a "-" placeholder at index 0, then
# "A".."Z", then "AA".."AF" (33 entries in total).
_single_letters = [chr(ord("A") + offset) for offset in range(26)]
_double_letters = ["A" + chr(ord("A") + offset) for offset in range(6)]
letters = ["-"] + _single_letters + _double_letters

In [5]:
import math

# Row number of every row label ("A" -> 1, "B" -> 2, ..., "AF" -> 32), taken
# from its position in `letters`. Looking the label up directly removes the
# dependence on the rows being sorted and on the first row being "A".
row_number_of = {label: number for number, label in enumerate(letters)}


def to_mapped_pos(row_label, col):
    """Return the position label that a (row, column) pair collapses onto.

    Every two consecutive rows share one letter and every two consecutive
    columns share one number, e.g. ("A", 1), ("A", 2), ("B", 1) and ("B", 2)
    all map to "A1".
    Presumably this maps each group of four colonies onto the position used
    in the gene id table - TODO confirm against the plate layout.

    Raises KeyError when `row_label` is not listed in `letters`.
    """
    mapped_row = letters[math.ceil(row_number_of[row_label] / 2)]
    mapped_col = math.ceil(col / 2)
    return mapped_row + str(mapped_col)


# Assign by column name (not by position) so the cell stays correct when it
# is re-run after further columns have been appended to `data`.
data["mapped.pos"] = [
    to_mapped_pos(row_label, col)
    for row_label, col in zip(data["Row"], data["Col"])
]

In [6]:
# Show the table with the newly added "mapped.pos" column.
print(data)

       Run  Plate     Type Row  Col  Size  Circularity  Brightness  Redness  \
0        2     14  Control   A    1     0       0.0000    0.000000      0.0   
1        2     14  Control   A    2     0       0.0000    0.000000      0.0   
2        2     14  Control   A    3   528       0.9261   20.986111    -52.2   
3        2     14  Control   A    4   411       0.9294   19.626115    -48.9   
4        2     14  Control   A    5   509       0.9489   19.151277    -47.7   
...    ...    ...      ...  ..  ...   ...          ...         ...      ...   
39931    3     13  Control  AF   44     0       0.0000    0.000000      0.0   
39932    3     13  Control  AF   45   537       0.9572  137.873371      8.8   
39933    3     13  Control  AF   46   513       0.9532  130.765432     10.9   
39934    3     13  Control  AF   47   431       0.9629  129.105955     11.3   
39935    3     13  Control  AF   48   551       0.9673  129.872958     10.9   

       Avg. Red  Avg. Blue  Avg. Green  Multi  Gene

In [7]:
# Run 2 numbers its plates with an offset of 13 (plate 14 becomes plate 1),
# so shift them back to share the plate numbering of the gene id table.
PLATE_OFFSET = 13

# Select rows by boolean mask instead of looping with iloc: the loop treated
# index labels as positions and was slow on a table of this size.
# The `Plate > PLATE_OFFSET` guard keeps the cell idempotent, so re-running it
# does not subtract the offset a second time.
# NOTE(review): assumes run-2 plates are always numbered above 13 in the raw
# file - confirm.
needs_shift = (data["Run"] == 2) & (data["Plate"] > PLATE_OFFSET)
data.loc[needs_shift, "Plate"] -= PLATE_OFFSET
print(data)

       Run  Plate     Type Row  Col  Size  Circularity  Brightness  Redness  \
0        2      1  Control   A    1     0       0.0000    0.000000      0.0   
1        2      1  Control   A    2     0       0.0000    0.000000      0.0   
2        2      1  Control   A    3   528       0.9261   20.986111    -52.2   
3        2      1  Control   A    4   411       0.9294   19.626115    -48.9   
4        2      1  Control   A    5   509       0.9489   19.151277    -47.7   
...    ...    ...      ...  ..  ...   ...          ...         ...      ...   
39931    3     13  Control  AF   44     0       0.0000    0.000000      0.0   
39932    3     13  Control  AF   45   537       0.9572  137.873371      8.8   
39933    3     13  Control  AF   46   513       0.9532  130.765432     10.9   
39934    3     13  Control  AF   47   431       0.9629  129.105955     11.3   
39935    3     13  Control  AF   48   551       0.9673  129.872958     10.9   

       Avg. Red  Avg. Blue  Avg. Green  Multi  Gene

## Add gene information

In [8]:
# Build the join key (row letter followed by column number) and give the
# plate column the same name as in the colony table.
gene_ids = (
    gene_ids
    .assign(**{"mapped.pos": lambda frame: frame["Row"] + frame["Column"].astype(str)})
    .rename(columns={'Plate #': 'Plate'})
)
print(gene_ids)

      Plate Row  Column        ORF       Gene  \
0         1   A       1      Blank      Blank   
1         1   A       2    YLL040C      VPS13   
2         1   A       3    YAL068C       PAU8   
3         1   A       4      Blank      Blank   
4         1   A       5    YAL067C       SEO1   
...     ...  ..     ...        ...        ...   
4987     13   P      20      Blank      Blank   
4988     13   P      21    YPR072W       NOT5   
4989     13   P      22      Blank      Blank   
4990     13   P      23  YOR008C-A  YOR008C-A   
4991     13   P      24      Blank      Blank   

                                             Decription  \
0                                                 Blank   
1     Protein of unknown function; heterooligomeric ...   
2     Protein of unknown function, member of the ser...   
3                                                 Blank   
4     Putative permease, member of the allantoate tr...   
...                                                 ...  

In [9]:
# Left join: every colony row is kept and receives the gene annotation of its
# plate and mapped position. Columns present in both tables get _x / _y suffixes.
merge_keys = ['Plate', 'mapped.pos']
data = data.merge(gene_ids, how='left', on=merge_keys)
print(data)

       Run  Plate     Type Row_x  Col  Size  Circularity  Brightness  Redness  \
0        2      1  Control     A    1     0       0.0000    0.000000      0.0   
1        2      1  Control     A    2     0       0.0000    0.000000      0.0   
2        2      1  Control     A    3   528       0.9261   20.986111    -52.2   
3        2      1  Control     A    4   411       0.9294   19.626115    -48.9   
4        2      1  Control     A    5   509       0.9489   19.151277    -47.7   
...    ...    ...      ...   ...  ...   ...          ...         ...      ...   
39931    3     13  Control    AF   44     0       0.0000    0.000000      0.0   
39932    3     13  Control    AF   45   537       0.9572  137.873371      8.8   
39933    3     13  Control    AF   46   513       0.9532  130.765432     10.9   
39934    3     13  Control    AF   47   431       0.9629  129.105955     11.3   
39935    3     13  Control    AF   48   551       0.9673  129.872958     10.9   

       Avg. Red  ...  mappe

## Add normalized columns

In [10]:
# Divide each colour measurement by the colony size; the result is stored in
# a new column prefixed with "Norm.".
for measurement in ("Brightness", "Avg. Red", "Redness"):
    data["Norm." + measurement] = data[measurement] / data["Size"]

# The merge suffixed the gene name coming from the gene id table with "_y";
# restore its plain name.
data = data.rename(columns={"Gene_y":"Gene"})
print(data)

       Run  Plate     Type Row_x  Col  Size  Circularity  Brightness  Redness  \
0        2      1  Control     A    1     0       0.0000    0.000000      0.0   
1        2      1  Control     A    2     0       0.0000    0.000000      0.0   
2        2      1  Control     A    3   528       0.9261   20.986111    -52.2   
3        2      1  Control     A    4   411       0.9294   19.626115    -48.9   
4        2      1  Control     A    5   509       0.9489   19.151277    -47.7   
...    ...    ...      ...   ...  ...   ...          ...         ...      ...   
39931    3     13  Control    AF   44     0       0.0000    0.000000      0.0   
39932    3     13  Control    AF   45   537       0.9572  137.873371      8.8   
39933    3     13  Control    AF   46   513       0.9532  130.765432     10.9   
39934    3     13  Control    AF   47   431       0.9629  129.105955     11.3   
39935    3     13  Control    AF   48   551       0.9673  129.872958     10.9   

       Avg. Red  ...       

In [11]:
# Persist the merged per-colony table in the intermediate data directory.
merged_csv_path = join(INTERMEDIATE, "Betax05.12.19_ColonyData_merged.csv")
data.to_csv(merged_csv_path)

In [12]:
# Filter relevant info:
group_keys = ["Run", "Plate", "mapped.pos", "Gene"]
measurements = ["Size", "Brightness", "Avg. Red", "Redness",
                "Norm.Brightness", "Norm.Avg. Red", "Norm.Redness"]
# .copy() makes the subset an independent frame, so the in-place assignment
# below no longer writes into a slice of the merged table
# (SettingWithCopyWarning).
data = data[group_keys + measurements].copy()

# Replace any value=0 (no growth) with NaN for proper computations:
# NOTE(review): this also blanks a measurement that is legitimately 0 on a
# colony that did grow - confirm that this is intended.
data[data == 0] = math.nan

# Compute operations:
# Build the grouping once and reuse it for all three statistics.
grouped = data.groupby(group_keys)
mean_data = grouped.mean()
std_data = grouped.std()
count_data = grouped.count()

# Rename columns:
def change_col_names(data, names, operation):
    """Return a copy of `data` with "." + `operation` appended to the given column names.

    Parameters:
        data: DataFrame whose columns are renamed; it is not modified.
        names: iterable of column names to rename. Names that are not
            columns of `data` are ignored.
        operation: suffix text, e.g. "mean" turns "Size" into "Size.mean".

    Returns:
        A new DataFrame with the renamed columns.

    All names are renamed in a single step, so a freshly produced name that
    happens to equal a later entry of `names` is not renamed a second time.
    """
    suffix = "." + operation
    return data.rename(columns={name: name + suffix for name in names})

# Tag every aggregated column with the statistic it holds.
names = ["Size", "Brightness", "Avg. Red", "Redness", "Norm.Brightness", "Norm.Avg. Red", "Norm.Redness"]
mean_data = change_col_names(mean_data, names, "mean")
std_data = change_col_names(std_data, names, "std")

# From the count table keep only the number of non-missing sizes per group.
count_data = count_data.rename(columns={"Size":"Count"})[["Count"]]

# Merge both dataframes:
group_keys = ["Run", "Plate", "mapped.pos", "Gene"]
combined_data = mean_data.merge(std_data, how='left', on=group_keys)
combined_data = count_data.merge(combined_data, how='left', on=group_keys)
print(combined_data)

                             Count  Size.mean  Brightness.mean  Avg. Red.mean  \
Run Plate mapped.pos Gene                                                       
2   1     A1         Blank       0        NaN              NaN            NaN   
          A10        FPS1        4     318.75        16.602205            NaN   
          A11        GDH3        4     367.00        17.454330            NaN   
          A12        RPL8B       4     332.00        16.041620            NaN   
          A13        BDH2        4     371.75        16.680479            NaN   
...                            ...        ...              ...            ...   
3   13    P5         VPS15       4     403.25       126.947385         141.50   
          P6         Blank       0        NaN              NaN            NaN   
          P7         FES1        4     474.25       120.776146         137.25   
          P8         Blank       0        NaN              NaN            NaN   
          P9         MRPL36 

In [13]:
# Persist the per-group statistics in the final data directory.
means_csv_path = join(FINAL, "Betax05.12.19_ColonyData_means.csv")
combined_data.to_csv(means_csv_path)