# Process NCRMP Florida Disease Data

This manuscript dataset has a total number of corals for each species, and counts of disease presence. The scipt coneverts raw data from the manuscript into Tidy format(e.g. filling in genus name in each row). Using the diseased and healthy counts to infer the total disease percentages.  

## Import data as a pandas DataFrame

In [1]:
import pandas as pd
data= pd.read_csv("../Raw_Data/NCRMP_Florida_raw.csv")
print(data.columns)

Index(['time', 'latitude', 'longitude', 'REGION', 'PRIMARY_SAMPLE_UNIT',
       'STATION_NR', 'YEAR', 'MONTH', 'DAY', 'Date_UTC', 'HABITAT_CD',
       'HABITAT_TYPE', 'STRAT', 'Description', 'RUGOSITY_CD', 'WTD_RUG',
       'MAPGRID_NR', 'SUB_REGION_NAME', 'SUB_REGION_NR', 'ZONE_NAME',
       'ZONE_NR', 'MPA_NAME', 'MPA_NR', 'PROT', 'ADMIN', 'DEPTH_STRAT',
       'MIN_DEPTH', 'MAX_DEPTH', 'METERS_COMPLETED', 'SPECIES_CD',
       'SPECIES_NAME', 'N', 'JUV', 'MAX_DIAMETER', 'PERP_DIAMETER', 'HEIGHT',
       'OLD_MORT', 'RECENT_MORT', 'BLEACH_CONDITION', 'DISEASE',
       'accession_url'],
      dtype='object')


  data= pd.read_csv("../Raw_Data/NCRMP_Florida_raw.csv")


## Isolate desired columns

In [2]:
data = data[["SPECIES_CD","SPECIES_NAME","DISEASE"]]
data = data.dropna(how='all').reset_index(drop=True)
data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,ACR CERV,Acropora cervicornis,
1,ACR CERV,Acropora cervicornis,
2,ACR CERV,Acropora cervicornis,
3,ACR CERV,Acropora cervicornis,
4,ACR CERV,Acropora cervicornis,
...,...,...,...
69403,STE INTE,Stephanocoenia intersepta,A
69404,STE INTE,Stephanocoenia intersepta,A
69405,STE INTE,Stephanocoenia intersepta,
69406,STE INTE,Stephanocoenia intersepta,


## Selecting Disease column and setting empty cells to healthy

In [3]:
data['DISEASE'][data['DISEASE'].isnull()] = "Unknown"
data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,ACR CERV,Acropora cervicornis,Unknown
1,ACR CERV,Acropora cervicornis,Unknown
2,ACR CERV,Acropora cervicornis,Unknown
3,ACR CERV,Acropora cervicornis,Unknown
4,ACR CERV,Acropora cervicornis,Unknown
...,...,...,...
69403,STE INTE,Stephanocoenia intersepta,A
69404,STE INTE,Stephanocoenia intersepta,A
69405,STE INTE,Stephanocoenia intersepta,Unknown
69406,STE INTE,Stephanocoenia intersepta,Unknown


## Count corals by genus and Disease column

In [4]:
Grouped_data = data.groupby(['SPECIES_NAME','DISEASE']).agg(Disease_count = ("DISEASE", "count")).reset_index()

Grouped_data

Unnamed: 0,SPECIES_NAME,DISEASE,Disease_count
0,Acropora cervicornis,A,233
1,Acropora cervicornis,F,2
2,Acropora cervicornis,P,7
3,Acropora cervicornis,Unknown,1050
4,Acropora palmata,A,14
...,...,...,...
169,Stephanocoenia intersepta,P,64
170,Stephanocoenia intersepta,S,5
171,Stephanocoenia intersepta,Unknown,469
172,Undaria spp,A,2


## Create Disease table 

We need to organise the data so we are able to calculate total disease percent and include a genus column. 

In [5]:
from numpy import isfinite
from math import isnan
disease_df = data
unique_diseases = set(disease_df["DISEASE"].unique())

df = data
unique_species = set(df["SPECIES_NAME"].unique())
unique_species = list(unique_species)
print(unique_species)
print(unique_diseases)

disease_table = pd.DataFrame(0, index=unique_species,columns=unique_diseases)
for species_name in unique_species:
    for disease in unique_diseases: 
        disease_subtable = df.loc[(df['SPECIES_NAME'] == species_name) & (df['DISEASE'] == disease)]
        empty = bool(disease_subtable.empty)
        if not empty:
            count = disease_subtable['DISEASE'].count()
        
            disease_table.loc[species_name, disease] = count
disease_table.sort_index()

disease_table = disease_table.rename(columns ={'A':'healthy_count', 'P':'total_diseased_count', 'Unknown':'unknown_count'})
disease_table

disease_table["total_diseased_percent"] = disease_table["total_diseased_count"]/(disease_table["healthy_count"]+ disease_table["total_diseased_count"])

disease_table = disease_table.drop(columns=['F','S'])
disease_table = disease_table.rename_axis('species')

disease_table['genus'] = disease_table.index.str.split().str[0] 
#move total disease column to right hand side of genus 
disease_table = disease_table[['genus','total_diseased_percent', 'healthy_count','total_diseased_count','unknown_count']]


disease_table

['Mycetophyllia spp', 'Scolymia lacera', 'Madracis auretenra', 'Scolymia cubensis', 'Favia fragum', 'Meandrina jacksoni', 'Scolymia spp', 'Agaricia lamarcki', 'Agaricia agaricites', 'Pseudodiploria strigosa', 'Agaricia grahamae', 'Siderastrea radians', 'Mycetophyllia danaana', 'Madracis formosa', 'Other coral', 'Acropora palmata', 'Siderastrea siderea', 'Madracis pharensis', 'Agaricia spp', 'Orbicella spp', 'Orbicella franksi', 'Diploria labyrinthiformis', 'Pseudodiploria clivosa', 'Oculina diffusa', 'Meandrina danae', 'Meandrina meandrites', 'Porites colonensis', 'Dendrogyra cylindrus', 'Porites branneri', 'Solenastrea bournoni', 'Orbicella annularis species complex', 'Mycetophyllia ferox', 'Solenastrea hyades', 'Madracis decactis', 'Acropora cervicornis', 'Madracis spp', 'Manicina areolata', 'Meandrina spp', 'Eusmilia fastigiata', 'Porites divaricata', 'Stephanocoenia intersepta', 'Madracis senaria', 'Colpophyllia natans', 'Siderastrea spp', 'Porites spp', 'Dichocoenia stokesii', 'Mu

Unnamed: 0_level_0,genus,total_diseased_percent,healthy_count,total_diseased_count,unknown_count
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mycetophyllia spp,Mycetophyllia,0.000000,5,0,560
Scolymia lacera,Scolymia,0.000000,7,0,448
Madracis auretenra,Madracis,0.000000,7,0,410
Scolymia cubensis,Scolymia,0.000000,32,0,1029
Favia fragum,Favia,0.000000,11,0,1085
...,...,...,...,...,...
Agaricia fragilis,Agaricia,0.000000,142,0,984
Solenastrea spp,Solenastrea,0.000000,2,0,178
Isophyllastrea rigida,Isophyllastrea,0.000000,1,0,94
Montastraea cavernosa,Montastraea,0.016135,2622,43,568


In [6]:
disease_table.to_csv("../Processed_Data/NCRMP_Florida_corals.csv",index_label="species")