# Process NCRMP US Virgin Islands Disease Data

This manuscript dataset has a total number of corals for each species, and counts of disease presence. The scipt coneverts raw data from the manuscript into Tidy format(e.g. filling in genus name in each row). Using the diseased and healthy counts to infer the total disease percentages.  

## Import data as a pandas DataFrame

In [1]:
import pandas as pd
data= pd.read_csv("../Raw_Data/NCRMP_USVI_raw.csv")
print(data.columns)

Index(['time', 'latitude', 'longitude', 'REGION', 'REGION_DESCRIPTION',
       'PRIMARY_SAMPLE_UNIT', 'STATION_NR', 'YEAR', 'MONTH', 'DAY', 'Date_UTC',
       'HABITAT_CD', 'HABITAT_TYPE', 'STRAT', 'STRAT_Description',
       'RUGOSITY_CD', 'WTD_RUG', 'MEAN_RUG', 'MAPGRID_NR', 'SUB_REGION_NAME',
       'SUB_REGION_NAME_DESCRIPTION', 'SUB_REGION_NR', 'ZONE_NAME', 'ZONE_NR',
       'MPA_NAME', 'MPA_NR', 'ADMIN', 'Administration_Description', 'PROT',
       'DEPTH_STRAT', 'DEPTH_STRAT_DESCRIPTION', 'MIN_DEPTH', 'MAX_DEPTH',
       'METERS_COMPLETED', 'SPECIES_CD', 'SPECIES_NAME', 'N', 'JUV',
       'MAX_DIAMETER', 'PERP_DIAMETER', 'HEIGHT', 'OLD_MORT', 'RECENT_MORT',
       'BLEACH_CONDITION', 'DISEASE', 'accession_url'],
      dtype='object')


  data= pd.read_csv("../Raw_Data/NCRMP_USVI_raw.csv")


## Isolate desired columns

In [2]:
data = data[["SPECIES_CD","SPECIES_NAME","DISEASE"]]
data = data.dropna(how='all').reset_index(drop=True)
data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,AGA GRAH,Agaricia grahamae,
1,AGA HUMI,Agaricia humilis,
2,AGA LAMA,Agaricia lamarcki,
3,AGA SPE.,Agaricia spp,
4,COL NATA,Colpophyllia natans,
...,...,...,...
175500,SID SIDE,Siderastrea siderea,A
175501,SID SIDE,Siderastrea siderea,A
175502,SID SPE.,Siderastrea spp,
175503,SOL BOUR,Solenastrea bournoni,


## Selecting Disease column and setting empty cells to healthy

In [3]:
data['DISEASE'][data['DISEASE'].isnull()] = "Unknown"
data

Unnamed: 0,SPECIES_CD,SPECIES_NAME,DISEASE
0,AGA GRAH,Agaricia grahamae,Unknown
1,AGA HUMI,Agaricia humilis,Unknown
2,AGA LAMA,Agaricia lamarcki,Unknown
3,AGA SPE.,Agaricia spp,Unknown
4,COL NATA,Colpophyllia natans,Unknown
...,...,...,...
175500,SID SIDE,Siderastrea siderea,A
175501,SID SIDE,Siderastrea siderea,A
175502,SID SPE.,Siderastrea spp,Unknown
175503,SOL BOUR,Solenastrea bournoni,Unknown


## Count corals by genus and Disease column

In [4]:
Grouped_data = data.groupby(['SPECIES_NAME','DISEASE']).agg(Disease_count = ("DISEASE", "count")).reset_index()

Grouped_data

Unnamed: 0,SPECIES_NAME,DISEASE,Disease_count
0,Acropora cervicornis,A,70
1,Acropora cervicornis,P,3
2,Acropora cervicornis,S,2
3,Acropora cervicornis,Unknown,2089
4,Acropora palmata,A,106
...,...,...,...
193,Stephanocoenia intersepta,F,1
194,Stephanocoenia intersepta,P,16
195,Stephanocoenia intersepta,Unknown,1462
196,Tubastraea coccinea,A,43


## Create Disease table 

We need to organise the data so we are able to calculate total disease percent and include a genus column. 

In [5]:
from numpy import isfinite
from math import isnan
disease_df = data
unique_diseases = set(disease_df["DISEASE"].unique())

df = data
unique_species = set(df["SPECIES_NAME"].unique())
unique_species = list(unique_species)
print(unique_species)
print(unique_diseases)

disease_table = pd.DataFrame(0, index=unique_species,columns=unique_diseases)
for species_name in unique_species:
    for disease in unique_diseases: 
        disease_subtable = df.loc[(df['SPECIES_NAME'] == species_name) & (df['DISEASE'] == disease)]
        empty = bool(disease_subtable.empty)
        if not empty:
            count = disease_subtable['DISEASE'].count()
        
            disease_table.loc[species_name, disease] = count
disease_table.sort_index()

disease_table = disease_table.rename(columns ={'A':'healthy_count', 'P':'total_diseased_count', 'Unknown':'unknown_count'})
disease_table

disease_table["total_diseased_percent"] = disease_table["total_diseased_count"]/(disease_table["healthy_count"]+ disease_table["total_diseased_count"])

disease_table = disease_table.drop(columns=['F','S'])
disease_table = disease_table.rename_axis('species')

disease_table['genus'] = disease_table.index.str.split().str[0] #add to other similar sets 
#move total disease column to right hand side of genus 
disease_table = disease_table[['genus','total_diseased_percent', 'healthy_count','total_diseased_count','unknown_count']]

disease_table

['Pseudodiploria clivosa', 'Porites branneri', 'Meandrina jacksoni', 'Porites astreoides', 'Porites divaricata', 'Orbicella annularis', 'Meandrina spp', 'Madracis carmabi', 'Isophyllia sinuosa', 'Agaricia agaricites', 'Solenastrea spp', 'Orbicella spp', 'Montastraea cavernosa', 'Agaricia tenuifolia', 'Dichocoenia stokesii', 'Orbicella faveolata', 'Isophyllia spp', 'Porites colonensis', 'Meandrina meandrites', 'Pseudodiploria strigosa', 'Oculina spp', 'Mycetophyllia lamarckiana', 'Acropora prolifera', 'Madracis formosa', 'Agaricia humilis', 'Madracis pharensis', 'Mycetophyllia ferox', 'Madracis decactis', 'Madracis spp', 'Porites furcata', 'Porites spp', 'Scolymia spp', 'Mycetophyllia aliciae', 'Oculina diffusa', 'Favia fragum', 'Helioceris cucullata', 'Cladocora arbuscula', 'Siderastrea siderea', 'Colpophyllia natans', 'Pseudodiploria spp', 'Stephanocoenia intersepta', 'Scolymia lacera', 'Acropora cervicornis', 'Scolymia cubensis', 'Eusmilia fastigiata', 'Dendrogyra cylindrus', 'Other 

Unnamed: 0_level_0,genus,total_diseased_percent,healthy_count,total_diseased_count,unknown_count
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pseudodiploria clivosa,Pseudodiploria,0.009804,303,3,2122
Porites branneri,Porites,0.000000,28,0,1278
Meandrina jacksoni,Meandrina,0.000000,21,0,1020
Porites astreoides,Porites,0.003150,18673,59,2388
Porites divaricata,Porites,0.008104,612,5,2029
...,...,...,...,...,...
Mussa angulosa,Mussa,0.000000,17,0,1596
Agaricia lamarcki,Agaricia,0.004178,715,3,1963
Meandrina danae,Meandrina,0.100000,9,1,1310
Orbicella annularis species complex,Orbicella,0.015385,320,5,1814


In [6]:
disease_table.to_csv("../Processed_Data/NCRMP_USVI_corals.csv",index_label="species")

## Summarize disease at the genus level 

In [8]:
final_column_names = ['genus', 'healthy_count', 'total_diseased_count','unknown_count']

genus_table = disease_table.loc[:,final_column_names]
genus_table = genus_table.groupby('genus')['healthy_count','total_diseased_count','unknown_count'].sum().reset_index()
genus_table['total_diseased_percent'] = genus_table['total_diseased_count']/(genus_table['healthy_count'] + genus_table['total_diseased_count'])
genus_table = genus_table.dropna()
genus_table.to_csv("../Processed_Data/per_genus_disease_data/NCRMP_USVI_corals.csv",index_label="genus")

genus_table

  genus_table = genus_table.groupby('genus')['healthy_count','total_diseased_count','unknown_count'].sum().reset_index()


Unnamed: 0,genus,healthy_count,total_diseased_count,unknown_count,total_diseased_percent
0,Acropora,178,8,4842,0.043011
1,Agaricia,10754,49,13341,0.004536
2,Cladocora,1,0,488,0.0
3,Colpophyllia,290,2,2030,0.006849
4,Dendrogyra,37,1,1541,0.026316
5,Dichocoenia,297,3,2041,0.01
6,Diploria,303,3,2067,0.009804
7,Eusmilia,231,1,2071,0.00431
8,Favia,37,0,2212,0.0
9,Helioceris,181,1,2145,0.005495
