# Process NCRMP Puerto Rico Disease Data

This manuscript dataset has a total number of corals for each species, and counts of disease presence. The scipt coneverts raw data from the manuscript into Tidy format(e.g. filling in genus name in each row). Using the diseased and healthy counts to infer the total disease percentages.  

## Import data as a pandas DataFrame

In [4]:
import pandas as pd
Puerto_Rico_data= pd.read_csv("../Raw_Data/NCRMP_Puerto_Rico_raw.csv")
print(Puerto_Rico_data.columns)

Index(['time', 'latitude', 'longitude', 'REGION', 'REGION_DESCRIPTION',
       'PRIMARY_SAMPLE_UNIT', 'STATION_NR', 'YEAR', 'MONTH', 'DAY', 'Date_UTC',
       'HABITAT_CD', 'HABITAT_TYPE', 'STRAT', 'STRAT_Description',
       'RUGOSITY_CD', 'WTD_RUG', 'MEAN_RUG', 'MAPGRID_NR', 'SUB_REGION_NAME',
       'SUB_REGION_NAME_DESCRIPTION', 'SUB_REGION_NR', 'ZONE_NAME', 'ZONE_NR',
       'MPA_NAME', 'MPA_NR', 'ADMIN', 'Administration_Description', 'PROT',
       'DEPTH_STRAT', 'DEPTH_STRAT_DESCRIPTION', 'MIN_DEPTH', 'MAX_DEPTH',
       'METERS_COMPLETED', 'SPECIES_CD', 'SPECIES_NAME', 'N', 'JUV',
       'MAX_DIAMETER', 'PERP_DIAMETER', 'HEIGHT', 'OLD_MORT', 'RECENT_MORT',
       'BLEACH_CONDITION', 'DISEASE', 'accession_url'],
      dtype='object')


  Puerto_Rico_data= pd.read_csv("../Raw_Data/NCRMP_Puerto_Rico_raw.csv")


## Isolate desired columns

In [5]:
Puerto_Rico_data = Puerto_Rico_data[["SPECIES_CD","SPECIES_NAME","DISEASE"]]
Puerto_Rico_data = Puerto_Rico_data.dropna(how='all').reset_index(drop=True)
Puerto_Rico_data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,ACR CERV,Acropora cervicornis,
1,ACR CERV,Acropora cervicornis,
2,ACR CERV,Acropora cervicornis,
3,ACR CERV,Acropora cervicornis,
4,ACR CERV,Acropora cervicornis,
...,...,...,...
41450,ACR PALM,Acropora palmata,
41451,ACR CERV,Acropora cervicornis,
41452,ACR CERV,Acropora cervicornis,
41453,ACR CERV,Acropora cervicornis,


## Selecting Disease column and setting empty cells to healthy

In [6]:
Puerto_Rico_data['DISEASE'][Puerto_Rico_data['DISEASE'].isnull()] = "Unknown"
Puerto_Rico_data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,ACR CERV,Acropora cervicornis,Unknown
1,ACR CERV,Acropora cervicornis,Unknown
2,ACR CERV,Acropora cervicornis,Unknown
3,ACR CERV,Acropora cervicornis,Unknown
4,ACR CERV,Acropora cervicornis,Unknown
...,...,...,...
41450,ACR PALM,Acropora palmata,Unknown
41451,ACR CERV,Acropora cervicornis,Unknown
41452,ACR CERV,Acropora cervicornis,Unknown
41453,ACR CERV,Acropora cervicornis,Unknown


## Count corals by genus and Disease column

In [7]:
Grouped_Puerto_Rico_data = Puerto_Rico_data.groupby(['SPECIES_NAME','DISEASE']).agg(Disease_count = ("DISEASE", "count")).reset_index()

Grouped_Puerto_Rico_data

Unnamed: 0,SPECIES_NAME,DISEASE,Disease_count
0,Acropora cervicornis,A,76
1,Acropora cervicornis,P,7
2,Acropora cervicornis,Unknown,572
3,Acropora palmata,A,27
4,Acropora palmata,P,2
...,...,...,...
159,Tubastraea coccinea,A,4
160,Tubastraea coccinea,Unknown,269
161,Undaria spp,A,23
162,Undaria spp,P,2


## Create Disease table 

We need to organise the data so we are able to calculate total disease percent and include a genus column. 

In [8]:
from numpy import isfinite
from math import isnan
disease_df = Puerto_Rico_data
unique_diseases = set(disease_df["DISEASE"].unique())

df = Puerto_Rico_data
unique_species = set(df["SPECIES_NAME"].unique())
unique_species = list(unique_species)
print(unique_species)
print(unique_diseases)

disease_table = pd.DataFrame(0, index=unique_species,columns=unique_diseases)
for species_name in unique_species:
    for disease in unique_diseases: 
        disease_subtable = df.loc[(df['SPECIES_NAME'] == species_name) & (df['DISEASE'] == disease)]
        empty = bool(disease_subtable.empty)
        if not empty:
            count = disease_subtable['DISEASE'].count()
        
            disease_table.loc[species_name, disease] = count
disease_table.sort_index()

disease_table = disease_table.rename(columns ={'A':'healthy_count', 'P':'total_diseased_count', 'Unknown':'unknown_count'})
disease_table

disease_table["total_diseased_percent"] = disease_table["total_diseased_count"]/(disease_table["healthy_count"]+ disease_table["total_diseased_count"])

disease_table = disease_table.drop(columns=['F','S'])
disease_table = disease_table.rename_axis('species')


disease_table['genus'] = disease_table.index.str.split().str[0] #add to other similar sets 
#move total disease column to right hand side of genus 
disease_table = disease_table[['genus','total_diseased_percent', 'healthy_count','total_diseased_count','unknown_count']]

disease_table

['Meandrina danae', 'Mycetophyllia spp', 'Orbicella faveolata', 'Madracis auretenra', 'Porites furcata', 'Manicina areolata', 'Oculina diffusa', 'Dichocoenia stokesii', 'Solenastrea bournoni', 'Montastraea cavernosa', 'Agaricia fragilis', 'Madracis pharensis', 'Acropora palmata', 'Agaricia humilis', 'Colpophyllia natans', 'Porites porites', 'Meandrina spp', 'Acropora cervicornis', 'Madracis formosa', 'Agaricia agaricites', 'Mycetophyllia aliciae', 'Agaricia spp', 'Orbicella franksi', 'Mycetophyllia ferox', 'Porites spp', 'Siderastrea spp', 'Stephanocoenia intersepta', 'Porites astreoides', 'Acropora prolifera', 'Madracis senaria', 'Porites divaricata', 'Isophyllastrea rigida', 'Madracis carmabi', 'Mycetophyllia danaana', 'Siderastrea siderea', 'Agaricia lamarcki', 'Pseudodiploria clivosa', 'Siderastrea radians', 'Madracis spp', 'Madracis decactis', 'Mussa angulosa', 'Orbicella annularis species complex', 'Favia fragum', 'Orbicella spp', 'Pseudodiploria strigosa', 'Helioceris cucullata'

Unnamed: 0_level_0,genus,total_diseased_percent,healthy_count,total_diseased_count,unknown_count
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Meandrina danae,Meandrina,0.000000,41,0,474
Mycetophyllia spp,Mycetophyllia,0.000000,4,0,317
Orbicella faveolata,Orbicella,0.050091,1043,55,384
Madracis auretenra,Madracis,0.000000,9,0,438
Porites furcata,Porites,0.004926,202,1,527
...,...,...,...,...,...
Orbicella annularis,Orbicella,0.033520,346,12,493
Mycetophyllia lamarckiana,Mycetophyllia,0.000000,2,0,268
Dendrogyra cylindrus,Dendrogyra,0.047619,20,1,585
Eusmilia fastigiata,Eusmilia,0.000000,48,0,562


In [9]:
disease_table.to_csv("../Processed_Data/NCRMP_Puerto_Rico_corals.csv",index_label="species")