# Process NCRMP US Virgin Islands Disease Data

This manuscript dataset has a total number of corals for each species, and counts of disease presence. The scipt coneverts raw data from the manuscript into Tidy format(e.g. filling in genus name in each row). Using the diseased and healthy counts to infer the total disease percentages.  

## Import data as a pandas DataFrame

In [1]:
import pandas as pd
data= pd.read_csv("../Raw_Data/NCRMP_USVI_raw.csv")
print(data.columns)

Index(['time', 'latitude', 'longitude', 'REGION', 'REGION_DESCRIPTION',
       'PRIMARY_SAMPLE_UNIT', 'STATION_NR', 'YEAR', 'MONTH', 'DAY', 'Date_UTC',
       'HABITAT_CD', 'HABITAT_TYPE', 'STRAT', 'STRAT_Description',
       'RUGOSITY_CD', 'WTD_RUG', 'MEAN_RUG', 'MAPGRID_NR', 'SUB_REGION_NAME',
       'SUB_REGION_NAME_DESCRIPTION', 'SUB_REGION_NR', 'ZONE_NAME', 'ZONE_NR',
       'MPA_NAME', 'MPA_NR', 'ADMIN', 'Administration_Description', 'PROT',
       'DEPTH_STRAT', 'DEPTH_STRAT_DESCRIPTION', 'MIN_DEPTH', 'MAX_DEPTH',
       'METERS_COMPLETED', 'SPECIES_CD', 'SPECIES_NAME', 'N', 'JUV',
       'MAX_DIAMETER', 'PERP_DIAMETER', 'HEIGHT', 'OLD_MORT', 'RECENT_MORT',
       'BLEACH_CONDITION', 'DISEASE', 'accession_url'],
      dtype='object')


  data= pd.read_csv("../Raw_Data/NCRMP_USVI_raw.csv")


## Isolate desired columns

In [2]:
data = data[["SPECIES_CD","SPECIES_NAME","DISEASE"]]
data = data.dropna(how='all').reset_index(drop=True)
data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,AGA GRAH,Agaricia grahamae,
1,AGA HUMI,Agaricia humilis,
2,AGA LAMA,Agaricia lamarcki,
3,AGA SPE.,Agaricia spp,
4,COL NATA,Colpophyllia natans,
...,...,...,...
175500,SID SIDE,Siderastrea siderea,A
175501,SID SIDE,Siderastrea siderea,A
175502,SID SPE.,Siderastrea spp,
175503,SOL BOUR,Solenastrea bournoni,


## Selecting Disease column and setting empty cells to healthy

In [3]:
data['DISEASE'][data['DISEASE'].isnull()] = "Unknown"
data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,AGA GRAH,Agaricia grahamae,Unknown
1,AGA HUMI,Agaricia humilis,Unknown
2,AGA LAMA,Agaricia lamarcki,Unknown
3,AGA SPE.,Agaricia spp,Unknown
4,COL NATA,Colpophyllia natans,Unknown
...,...,...,...
175500,SID SIDE,Siderastrea siderea,A
175501,SID SIDE,Siderastrea siderea,A
175502,SID SPE.,Siderastrea spp,Unknown
175503,SOL BOUR,Solenastrea bournoni,Unknown


## Count corals by genus and Disease column

In [4]:
Grouped_data = data.groupby(['SPECIES_NAME','DISEASE']).agg(Disease_count = ("DISEASE", "count")).reset_index()

Grouped_data

Unnamed: 0,SPECIES_NAME,DISEASE,Disease_count
0,Acropora cervicornis,A,70
1,Acropora cervicornis,P,3
2,Acropora cervicornis,S,2
3,Acropora cervicornis,Unknown,2089
4,Acropora palmata,A,106
...,...,...,...
193,Stephanocoenia intersepta,F,1
194,Stephanocoenia intersepta,P,16
195,Stephanocoenia intersepta,Unknown,1462
196,Tubastraea coccinea,A,43


## Create Disease table 

We need to organise the data so we are able to calculate total disease percent and include a genus column. 

In [5]:
from numpy import isfinite
from math import isnan
disease_df = data
unique_diseases = set(disease_df["DISEASE"].unique())

df = data
unique_species = set(df["SPECIES_NAME"].unique())
unique_species = list(unique_species)
print(unique_species)
print(unique_diseases)

disease_table = pd.DataFrame(0, index=unique_species,columns=unique_diseases)
for species_name in unique_species:
    for disease in unique_diseases: 
        disease_subtable = df.loc[(df['SPECIES_NAME'] == species_name) & (df['DISEASE'] == disease)]
        empty = bool(disease_subtable.empty)
        if not empty:
            count = disease_subtable['DISEASE'].count()
        
            disease_table.loc[species_name, disease] = count
disease_table.sort_index()

disease_table = disease_table.rename(columns ={'A':'healthy_count', 'P':'total_diseased_count', 'Unknown':'unknown_count'})
disease_table

disease_table["total_diseased_percent"] = disease_table["total_diseased_count"]/(disease_table["healthy_count"]+ disease_table["total_diseased_count"])

disease_table = disease_table.drop(columns=['F','S'])
disease_table = disease_table.rename_axis('species')

disease_table['genus'] = disease_table.index.str.split().str[0] #add to other similar sets 
#move total disease column to right hand side of genus 
disease_table = disease_table[['genus','total_diseased_percent', 'healthy_count','total_diseased_count','unknown_count']]

disease_table

['Isophyllia sinuosa', 'Scolymia spp', 'Acropora palmata', 'Scolymia lacera', 'Madracis formosa', 'Diploria labyrinthiformis', 'Montastraea cavernosa', 'Siderastrea spp', 'Madracis pharensis', 'Orbicella franksi', 'Siderastrea siderea', 'Porites divaricata', 'Agaricia agaricites', 'Tubastraea coccinea', 'Other coral', 'Madracis auretenra', 'Mycetophyllia ferox', 'Porites porites', 'Porites spp', 'Isophyllastrea rigida', 'Porites astreoides', 'Mycetophyllia spp', 'Madracis spp', 'Agaricia grahamae', 'Mycetophyllia danaana', 'Mycetophyllia aliciae', 'Scolymia cubensis', 'Agaricia lamarcki', 'Agaricia spp', 'Stephanocoenia intersepta', 'Pseudodiploria strigosa', 'Porites branneri', 'Helioceris cucullata', 'Porites colonensis', 'Dichocoenia stokesii', 'Scleractinia spp', 'Solenastrea spp', 'Cladocora arbuscula', 'Meandrina danae', 'Colpophyllia natans', 'Mycetophyllia lamarckiana', 'Acropora cervicornis', 'Madracis carmabi', 'Meandrina spp', 'Mussa angulosa', 'Agaricia fragilis', 'Orbicell

Unnamed: 0_level_0,genus,total_diseased_percent,healthy_count,total_diseased_count,unknown_count
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Isophyllia sinuosa,Isophyllia,0.000000,19,0,2219
Scolymia spp,Scolymia,0.000000,12,0,2228
Acropora palmata,Acropora,0.045045,106,5,2191
Scolymia lacera,Scolymia,0.000000,3,0,573
Madracis formosa,Madracis,0.000000,32,0,1858
...,...,...,...,...,...
Madracis decactis,Madracis,0.011331,1047,12,1832
Favia fragum,Favia,0.000000,37,0,2212
Pseudodiploria clivosa,Pseudodiploria,0.009804,303,3,2122
Manicina areolata,Manicina,0.000000,51,0,2206


In [6]:
disease_table.to_csv("../Processed_Data/NCRMP_USVI_corals.csv",index_label="species")