# Minimal Code Needed for Individual Species KZFP Conservation Profiles for KZFP Shiny App
The goal of this script is to first recreate Anna's analysis of the Trono (https://doi.org/10.1038/nature21683) data. This will then be used to create an interactive Shiny app using R, which will be published for free on http://www.shinyapps.io/.

In [1]:
import pandas as pd

In [2]:
df2 = pd.read_excel("41586_2017_BFnature21683_MOESM103_ESM.xlsx")

In [3]:
# df2.head()

In [4]:
rows = []

# Get rows for each species, cluster, and presence/absence boolean value

In [5]:
species = df2['Latin name'].unique()
# species

In [6]:
clusters = df2['Cluster #'].unique()
# clusters

In [7]:
for organism in species:
    for cluster in clusters:
        rows.append({'Species': organism, 'Cluster': cluster, 'Present': False})

# rows

In [8]:
# Check expected number of rows
expectedRows = len(species)*len(clusters)

In [9]:
# Check that actual number of rows is equal to expected
actualRows = len(rows)

In [10]:
if expectedRows == actualRows:
    print(f"Rejoice! Actual equals expected. \nActual: {actualRows} \tExpected: {expectedRows}")
else:
    print(f"Go ahead and retry that. The actual does not equal expected.\nActual: {actualRows} \tExpected: {expectedRows}")

Rejoice! Actual equals expected. 
Actual: 2883909 	Expected: 2883909


In [11]:
# It is, so populate dataframe with False values for now
dfClusterIDBool = pd.DataFrame(rows)
dfClusterIDBool

Unnamed: 0,Species,Cluster,Present
0,Acanthisitta chloris,204,False
1,Acanthisitta chloris,205,False
2,Acanthisitta chloris,206,False
3,Acanthisitta chloris,207,False
4,Acanthisitta chloris,208,False
...,...,...,...
2883904,Xenopus tropicalis,15298,False
2883905,Xenopus tropicalis,15299,False
2883906,Xenopus tropicalis,15300,False
2883907,Xenopus tropicalis,15301,False


In [12]:
df2.rename(columns={'Species': 'Species_code', 'Latin name': 'Species','Cluster #': 'Cluster'}, inplace=True)
# df2

In [13]:
dfClusterIDBool['Present'] = dfClusterIDBool[['Species', 'Cluster']].merge(
    df2[['Species', 'Cluster']].drop_duplicates(),
    on=['Species', 'Cluster'],
    how='left',
    indicator=True
)['_merge'].eq('both')


In [14]:
dfClusterIDBool.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2883909 entries, 0 to 2883908
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   Species  object
 1   Cluster  int64 
 2   Present  bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 46.8+ MB


In [15]:
# Count duplicates for all (Species, Cluster) pairs
duplicates = df2.groupby(['Species', 'Cluster']).size()

# Keep only pairs that appear more than once
duplicates = duplicates[duplicates > 1]

# Number of duplicate pairs
num_duplicate_pairs = duplicates.shape[0]

print("Number of duplicate (Species, Cluster) pairs:", num_duplicate_pairs)
print(duplicates)


Number of duplicate (Species, Cluster) pairs: 3547
Species                 Cluster
Ailuropoda melanoleuca  219        2
                        235        2
                        1034       2
                        1231       2
                        1270       2
                                  ..
Xenopus tropicalis      15277      2
                        15279      2
                        15283      2
                        15289      2
                        15295      2
Length: 3547, dtype: int64


### Note:
With this methodology, I exclude all duplicates. By 'duplicate,' I mean all cases such that there are two or more Species-Cluster pairs.

In [16]:
# dfClusterIDBool

In [17]:
# Save as parquet
# dfClusterIDBool.to_parquet("dfClusterIDBool_data.parquet", compression="snappy")


In [18]:
# Ensure it saved by checking working directory

In [19]:
# ls -lh

In [20]:
# Save as CSV and ensure it saved by checking working directory
# dfClusterIDBool.to_csv("dfClusterIDBool_data.csv.gz", index=False, compression="gzip")

In [21]:
# ls -lh

In [22]:
# Go to next step

In [23]:
df2.rename(columns={'Estimated evolutionary distance from human (MY)': 'timeFromHuman_MY', 'Common name': 'CommonName'}, inplace=True)

# df2

In [24]:
# dfClusterIDBool

In [25]:
# Convert long to wide
df_wide = dfClusterIDBool.pivot(index='Species', columns='Cluster', values='Present').reset_index()
df_wide.columns.name = None

In [26]:
# df_wide

In [27]:
# df_wide.insert(1, 'Order', 'TODO')
# df_wide.insert(2, 'Class', 'TODO')
# df_wide.insert(3, 'CommonName', 'TODO')
# df_wide.insert(4, 'timeFromHuman_MY', 'TODO')


In [28]:
# df_wide

In [29]:
# df_wide['Order'] = 'a'
# df_wide

In [30]:
# print(df_wide.dtypes)
# print(df2.dtypes)

In [31]:
df_wide = df_wide.merge(
    df2[['Species', 'Order', 'Class', 'CommonName', 'timeFromHuman_MY']],
    on='Species',
    how='left'
)
# df_wide

In [32]:
df_wide = df_wide.drop_duplicates()
# df_wide

In [33]:
col0 = df_wide.pop('Order')        # remove the column
df_wide.insert(1, 'Order', col0)   # insert it at position 1 (0-based index)

col1 = df_wide.pop('Class')
df_wide.insert(2, 'Class', col1)

col2 = df_wide.pop('CommonName')
df_wide.insert(3, 'CommonName', col2)

col3 = df_wide.pop('timeFromHuman_MY')
df_wide.insert(4, 'timeFromHuman_MY', col3)

# Check df_wide
df_wide

Unnamed: 0,Species,Order,Class,CommonName,timeFromHuman_MY,204,205,206,207,208,...,15293,15294,15295,15296,15297,15298,15299,15300,15301,15302
0,Acanthisitta chloris,Passeriformes,Aves,Rifleman,320.5,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Ailuropoda melanoleuca,Carnivora,Mammalia,Giant Panda,97.5,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
305,Alligator mississippiensis,Crocodilia,Reptilia,American alligator,320.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
518,Alligator sinensis,Crocodilia,Reptilia,Chinese alligator,320.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
784,Amazona vittata,Psittaciformes,Aves,Puerto Rican amazon,320.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47563,Vicugna pacos huacaya,Artiodactyla,Mammalia,Huacaya alpaca,97.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
47867,Vipera berus berus,Squamata,Reptilia,Common European viper,320.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
48061,Xenopus laevi,Anura,Reptilia,African clawed frog,355.7,False,False,False,False,False,...,False,False,True,False,True,False,True,False,False,False
48191,Xenopus tropicalis,Anura,Reptilia,Western clawed frog,355.7,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,False


In [34]:
# # Save as parquet
# df_wide.to_parquet("dfWide_data.parquet", compression="snappy")

# # Save as CSV and ensure it saved by checking working directory
# df_wide.to_csv("dfWide_data.csv.gz", index=False, compression="gzip")

In [35]:
# ls -lh

In [36]:
unique_species = df_wide['Species'].unique()

In [37]:
species_dfs = {}

for species in unique_species:
    species_dfs[species] = df_wide.loc[df_wide['Species'] == species].copy()


In [38]:
# List of metadata columns
id_vars = ['Species', 'Order', 'Class', 'CommonName', 'timeFromHuman_MY']


In [39]:
df_long = df_wide.melt(
    id_vars=id_vars,         # columns to keep as-is
    var_name='Cluster',      # name of the new column for the old column headers
    value_name='Present'     # name of the new column for the cell values
)


In [40]:
df_long


Unnamed: 0,Species,Order,Class,CommonName,timeFromHuman_MY,Cluster,Present
0,Acanthisitta chloris,Passeriformes,Aves,Rifleman,320.5,204,True
1,Ailuropoda melanoleuca,Carnivora,Mammalia,Giant Panda,97.5,204,False
2,Alligator mississippiensis,Crocodilia,Reptilia,American alligator,320.5,204,False
3,Alligator sinensis,Crocodilia,Reptilia,Chinese alligator,320.5,204,False
4,Amazona vittata,Psittaciformes,Aves,Puerto Rican amazon,320.5,204,False
...,...,...,...,...,...,...,...
2883904,Vicugna pacos huacaya,Artiodactyla,Mammalia,Huacaya alpaca,97.5,15302,False
2883905,Vipera berus berus,Squamata,Reptilia,Common European viper,320.5,15302,False
2883906,Xenopus laevi,Anura,Reptilia,African clawed frog,355.7,15302,False
2883907,Xenopus tropicalis,Anura,Reptilia,Western clawed frog,355.7,15302,False


In [41]:
species_dfs_long = {}
species_dfs_true = {}
species_cluster_conservation_dfs = {}

In [42]:
#####################################################
# OMEGA EDIT THIS... IT DOESN't WORK RIGHT NOW #
#####################################################

def speciesKZFPclusterConservation(species_df_dict, unique_species, df_wide):

    i = 0
    for key in species_df_dict:
        print(key)

        # Melt the wide dataframe to long format
        species_dfs_long[unique_species[i]] = species_df_dict[key].melt(
            id_vars=['Species', 'Order', 'Class', 'CommonName', 'timeFromHuman_MY'],  # keep metadata
            var_name='Cluster',   # column for cluster IDs
            value_name='Present'  # column for True/False
        )
        
        #  # Convert cluster IDs to numeric (optional)
        # species_dfs_long[unique_species[i]]['Cluster'] = df_long[unique_species[i]]['Cluster']
        
        # Keep only rows where Present is True
        species_dfs_true[unique_species[i]] =  species_dfs_long[unique_species[i]].loc[species_dfs_long[unique_species[i]]['Present'] == True].reset_index(drop=True)

        # Get the appropriate dfs for next step
        species_rows = []

        for cluster in species_dfs_true[unique_species[i]]['Cluster']:
            evoDist = df_wide.loc[df_wide['Species']==unique_species[i]]['timeFromHuman_MY'].item()
            present = df_wide.loc[df_wide['Species']==unique_species[i]][cluster].item()
            
        species_rows.append({'cluster': cluster, 'species': organism, 'present': present, 'evoDist': evoDist})

        species_cluster_conservation_dfs[unique_species[i]] = pd.DataFrame(species_rows)
        
        # monoRows
        # print(i)

        #### EDIT
        # Find the order of most cluster TRUE in df to least
        clusterTfreq = []


        for cluster in species_dfs_true[unique_species[i]]['Cluster']:
            freq = species_cluster_conservation_dfs[unique_species[i]].loc[(species_cluster_conservation_dfs[unique_species[i]]['cluster'] == cluster) & (species_cluster_conservation_dfs[unique_species[i]]['present'] == True)].shape[0]
            clusterTfreq.append({'Cluster': cluster, 'Frequency_T': freq})
        
        df_temp = pd.DataFrame(clusterTfreq)        
        
        df_cluster_sorted = df_temp.sort_values(by=['Frequency_T', 'Cluster'], ascending=[False, True])

        cluster_order = []
        for cluster in df_cluster_sorted['Cluster']:
            cluster_order.append(cluster)
            
        species_cluster_conservation_dfs[unique_species[i]]['cluster'] = pd.Categorical(species_cluster_conservation_dfs[unique_species[i]]['cluster'], categories=cluster_order, ordered=True)

        species_cluster_conservation_dfs[unique_species[i]] = species_cluster_conservation_dfs[unique_species[i]].sort_values(by=['evoDist', 'species', 'cluster'], ascending=[False, True, True]).reset_index(drop=True)

        i += 1

    return species_dfs_long, species_dfs_true, species_cluster_conservation_dfs       


In [43]:
speciesKZFPclusterConservation(species_dfs, unique_species, df_wide)

Acanthisitta chloris
Ailuropoda melanoleuca
Alligator mississippiensis
Alligator sinensis
Amazona vittata
Anas platyrhynchos
Anolis carolinensis
Anser cygnoides domesticus
Antrostomus carolinensis
Aotus nancymaae
Apalone spinifera
Aptenodytes forsteri
Apteryx australis mantelli
Aquila chrysaetos
Ara macao
Balaenoptera acutorostrata scammoni
Balaenoptera bonaerensis
Balearica regulorum gibbericeps
Bison bison bison
Bos indicus
Bos mutus
Bos taurus
Bubalus bubalis
Callithrix jacchus
Calypte anna
Camelus bactrianus
Camelus dromedarius
Camelus ferus
Canis familiaris
Capra aegagrus
Capra hircus
Capreolus capreolus
Caprimulgus carolinensis
Cariama cristata
Cathartes aura
Cavia aperea
Cavia porcellus
Ceratotherium simum
Cercocebus atys
Chaetura pelagica
Charadrius vociferus
Chelonia mydas
Chinchilla lanigera
Chlamydotis macqueenii
Chlorocebus sabaeus
Choloepus hoffmanni
Chrysemys picta bellii
Chrysochloris asiatica
Colinus virginianus
Colius striatus
Colobus angolensis palliatus
Columba livia

({'Acanthisitta chloris':                     Species          Order Class CommonName  timeFromHuman_MY  \
  0      Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  1      Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  2      Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  3      Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  4      Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  ...                     ...            ...   ...        ...               ...   
  15094  Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  15095  Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  15096  Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  15097  Acanthisitta chloris  Passeriformes  Aves   Rifleman             320.5   
  15098  Acanthisitta chloris  Passeriformes  Aves   Rifleman  

In [44]:
# for key in species_dfs:
#     print(type(species_dfs[key]))
# # species_dfs['Acanthisitta chloris']

In [45]:
species_cluster_conservation_dfs

{'Acanthisitta chloris':   cluster             species  present  evoDist
 0    3638  Xenopus tropicalis     True    320.5,
 'Ailuropoda melanoleuca':   cluster             species  present  evoDist
 0   15053  Xenopus tropicalis     True     97.5,
 'Alligator mississippiensis':   cluster             species  present  evoDist
 0   15053  Xenopus tropicalis     True    320.5,
 'Alligator sinensis':   cluster             species  present  evoDist
 0   15053  Xenopus tropicalis     True    320.5,
 'Amazona vittata':   cluster             species  present  evoDist
 0    8903  Xenopus tropicalis     True    320.5,
 'Anas platyrhynchos':   cluster             species  present  evoDist
 0   12092  Xenopus tropicalis     True    320.5,
 'Anolis carolinensis':   cluster             species  present  evoDist
 0   15068  Xenopus tropicalis     True    320.5,
 'Anser cygnoides domesticus':   cluster             species  present  evoDist
 0   12092  Xenopus tropicalis     True    320.5,
 'Antrostomu

In [46]:
for key in species_cluster_conservation_dfs:
    print(key)

Acanthisitta chloris
Ailuropoda melanoleuca
Alligator mississippiensis
Alligator sinensis
Amazona vittata
Anas platyrhynchos
Anolis carolinensis
Anser cygnoides domesticus
Antrostomus carolinensis
Aotus nancymaae
Apalone spinifera
Aptenodytes forsteri
Apteryx australis mantelli
Aquila chrysaetos
Ara macao
Balaenoptera acutorostrata scammoni
Balaenoptera bonaerensis
Balearica regulorum gibbericeps
Bison bison bison
Bos indicus
Bos mutus
Bos taurus
Bubalus bubalis
Callithrix jacchus
Calypte anna
Camelus bactrianus
Camelus dromedarius
Camelus ferus
Canis familiaris
Capra aegagrus
Capra hircus
Capreolus capreolus
Caprimulgus carolinensis
Cariama cristata
Cathartes aura
Cavia aperea
Cavia porcellus
Ceratotherium simum
Cercocebus atys
Chaetura pelagica
Charadrius vociferus
Chelonia mydas
Chinchilla lanigera
Chlamydotis macqueenii
Chlorocebus sabaeus
Choloepus hoffmanni
Chrysemys picta bellii
Chrysochloris asiatica
Colinus virginianus
Colius striatus
Colobus angolensis palliatus
Columba livia

In [47]:
species_cluster_conservation_dfs['Ornithorhynchus anatinus']

Unnamed: 0,cluster,species,present,evoDist
0,14844,Xenopus tropicalis,True,179.2
