# Process Hazraty-Kari *et al.*, Persian Gulf Disease Data

This manuscript dataset has total numbers of corals for each genus, and the percentage of those taht had each of 6 categories of disease. This script converts the raw data from the manuscript into Tidy format (e.g. filling in genus name in each row), and combines the total counts with the percentage disease to infer counts of healthy vs. diseased corals.

## Import data as a pandas DataFrame

In [1]:
import pandas as pd
Persian_gulf = pd.read_csv("../Raw_Data/Sanaz_Hazraty-Kari_et_al.csv", keep_default_na=False, skiprows=[0])
Persian_gulf

Unnamed: 0,Genus,year,n,Rel. Abundance (%),BBD,SGA,WMD,AYBD,WS,PPS
0,Acropora,2014,486,21.86,2.71 ± 0.7,0,0,0,0.95 ± 0.6,0
1,,2015,379,12.61,0.78 ± 0.78,0.45 ± 0.27,0,0,4.23 ± 1.33,0
2,Porites,2014,659,29.64,0,0,3.53 ± 2.23,0.52 ± 0.52,0,4.96 ± 2.91
3,,2015,405,13.47,0,0.23 ± 0.17,1.3 ± 0.59,0,0,0.62 ± 0.39
4,Cyphastrea,2014,56,2.52,0,0,0,0,0,0
5,,2015,101,3.36,0,0,4.68 ± 3.44,0,0,0
6,Platygyra,2014,308,13.86,0,0,0,0,0,0
7,,2015,379,12.61,0,0.78 ± 0.78,0,0,0,0
8,Favites,2014,312,14.04,0,0,2.48 ± 0.9,0,0,0
9,,2015,344,11.44,0,0,5.12 ± 1.85,0,0,0


## Infer missing genus labels

Next, we'll infer missing genus labels from those in previous rows, and add them to the DataFrame

In [2]:
for index, row in Persian_gulf.iterrows():
    if not row['Genus']:
        row['Genus'] = current_genus
        Persian_gulf.at[index,'Genus'] = current_genus
    current_genus = row['Genus']
    print(current_genus)

Acropora
Acropora
Porites
Porites
Cyphastrea
Cyphastrea
Platygyra
Platygyra
Favites
Favites
Dipsastrea
Dipsastrea
Conscinaraea
Conscinaraea
Leptastrea
Leptastrea


In [3]:
Persian_gulf

Unnamed: 0,Genus,year,n,Rel. Abundance (%),BBD,SGA,WMD,AYBD,WS,PPS
0,Acropora,2014,486,21.86,2.71 ± 0.7,0,0,0,0.95 ± 0.6,0
1,Acropora,2015,379,12.61,0.78 ± 0.78,0.45 ± 0.27,0,0,4.23 ± 1.33,0
2,Porites,2014,659,29.64,0,0,3.53 ± 2.23,0.52 ± 0.52,0,4.96 ± 2.91
3,Porites,2015,405,13.47,0,0.23 ± 0.17,1.3 ± 0.59,0,0,0.62 ± 0.39
4,Cyphastrea,2014,56,2.52,0,0,0,0,0,0
5,Cyphastrea,2015,101,3.36,0,0,4.68 ± 3.44,0,0,0
6,Platygyra,2014,308,13.86,0,0,0,0,0,0
7,Platygyra,2015,379,12.61,0,0.78 ± 0.78,0,0,0,0
8,Favites,2014,312,14.04,0,0,2.48 ± 0.9,0,0,0
9,Favites,2015,344,11.44,0,0,5.12 ± 1.85,0,0,0


## Split columns that have ± values

We want mean and standard deviation values in separate columns so we can multiply the percent disease value by the number of corals. So we'll next separate out those values.

In [15]:
import pandas as pd
Persian_gulf = pd.read_csv("../Raw_Data/Hazraty-Kari_et_al.csv", keep_default_na=False, skiprows=[0])

#Adding in missing genus names 
for index, row in Persian_gulf.iterrows():
    if not row['Genus']:
        row['Genus'] = current_genus
        Persian_gulf.at[index,'Genus'] = current_genus
    current_genus = row['Genus']
    
#Dealing with columns that have ± and are 0s 
print("Columns in the DataFrame:",Persian_gulf.columns)
columns_to_split = ['BBD', 'SGA', 'WMD', 'AYBD', 'WS', 'PPS']

def split_value(value):
    return pd.Series(value.split("±", 1))

def split_column_of_values(col):
    col = list(col)
    number_col_data = []
    SD_col_data = []
    for entry in col:
        print(f"Entry:",entry)
        if entry == "0":
            entry = "0 ± 0"
        print("BEFORE SPLIT:",entry)
        try:
            number, SD = entry.split("±")
        except ValueError:
            raise ValueError(f"entry {entry} doesn't split neatly. Does it not have a ±?")
        number_col_data.append(number)
        SD_col_data.append(SD)
        print("AFTER SPLIT:",number,SD)
    return number_col_data,SD_col_data

for col in Persian_gulf.columns:
    print(col)
    if col not in columns_to_split:
        continue
        
    percent_col_data, sd_col_data = split_column_of_values(Persian_gulf[col])
    #print(col)
    #new_column_name = col + "_split"
    #Persian_gulf[Persian_gulf[col]== "0"] = "0 ± 0"
    #Persian_gulf[col] = Persian_gulf[col].str.replace("0", "0±0", regex=False)
    #print(Persian_gulf[col])
    #percent_col_data, sd_col_data = Persian_gulf[col].apply(split_value)
    
    #print(f"Percent column data {percent_col_data}")
    #rint(f"Standard Deviation column data {sd_col_data}")
    
    Persian_gulf[f"{col}_percent"] = percent_col_data
    Persian_gulf[f"{col}_SD"] = sd_col_data
    #Persian_gulf[f"{col}_percent"], Persian_gulf[f"{col}_SD"] = Persian_gulf[col].str.split('±', n=1, expand=False)
    #print(f"After splitting {col}:")
    #print(Persian_gulf[[col, new_column_name]])


Columns in the DataFrame: Index(['Genus', 'year', 'n ', 'Rel. Abundance (%)', 'BBD', 'SGA', 'WMD',
       'AYBD', 'WS', 'PPS'],
      dtype='object')
Genus
year
n 
Rel. Abundance (%)
BBD
Entry: 2.71 ± 0.7
BEFORE SPLIT: 2.71 ± 0.7
AFTER SPLIT: 2.71   0.7
Entry: 0.78 ± 0.78
BEFORE SPLIT: 0.78 ± 0.78
AFTER SPLIT: 0.78   0.78
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
Entry: 0
BEFORE SPLIT: 0 ± 0
AFTER SPLIT: 0   0
SGA


In [16]:
Persian_gulf

Unnamed: 0,Genus,year,n,Rel. Abundance (%),BBD,SGA,WMD,AYBD,WS,PPS,...,SGA_percent,SGA_SD,WMD_percent,WMD_SD,AYBD_percent,AYBD_SD,WS_percent,WS_SD,PPS_percent,PPS_SD
0,Acropora,2014,486,21.86,2.71 ± 0.7,0,0,0,0.95 ± 0.6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.95,0.6,0.0,0.0
1,Acropora,2015,379,12.61,0.78 ± 0.78,0.45 ± 0.27,0,0,4.23 ± 1.33,0,...,0.45,0.27,0.0,0.0,0.0,0.0,4.23,1.33,0.0,0.0
2,Porites,2014,659,29.64,0,0,3.53 ± 2.23,0.52 ± 0.52,0,4.96 ± 2.91,...,0.0,0.0,3.53,2.23,0.52,0.52,0.0,0.0,4.96,2.91
3,Porites,2015,405,13.47,0,0.23 ± 0.17,1.3 ± 0.59,0,0,0.62 ± 0.39,...,0.23,0.17,1.3,0.59,0.0,0.0,0.0,0.0,0.62,0.39
4,Cyphastrea,2014,56,2.52,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Cyphastrea,2015,101,3.36,0,0,4.68 ± 3.44,0,0,0,...,0.0,0.0,4.68,3.44,0.0,0.0,0.0,0.0,0.0,0.0
6,Platygyra,2014,308,13.86,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Platygyra,2015,379,12.61,0,0.78 ± 0.78,0,0,0,0,...,0.78,0.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Favites,2014,312,14.04,0,0,2.48 ± 0.9,0,0,0,...,0.0,0.0,2.48,0.9,0.0,0.0,0.0,0.0,0.0,0.0
9,Favites,2015,344,11.44,0,0,5.12 ± 1.85,0,0,0,...,0.0,0.0,5.12,1.85,0.0,0.0,0.0,0.0,0.0,0.0


GLTMP_corals_2010-2019.csv  Hazraty-Kari_et_al.csv
