In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path of the directory that holds the processed input tables
DATA_DIR = "../data/processed"

In [3]:
# Read the plant-disease association table from its tab-separated file
# and preview the first rows.
plant_disease_df = pd.read_csv(f'{DATA_DIR}/plant_disease_associations.tsv', sep='\t')
plant_disease_df.head()

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10067319_8
1,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10094290_4
2,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10336604_1
3,ncbitaxon:3311,Ginkgo biloba,mondo:0002643,bern2,10345150_1
4,ncbitaxon:203270,Berberis aquifolium,mondo:0008334,bern2,10352377_1


In [4]:
# Read the collapsed plant-disease table from its tab-separated file
# (low_memory=False makes pandas parse the file in one pass) and preview the first rows.
collapsed_plant_disease_df = pd.read_csv(
    f'{DATA_DIR}/plant_disease_collapsed.tsv', sep='\t', low_memory=False
)
collapsed_plant_disease_df.head()

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0024623,bern2,10067319_8
1,ncbitaxon:3369,Cryptomeria japonica,mondo:0005087,bern2,10067319_8
2,ncbitaxon:3369,Cryptomeria japonica,mondo:0021166,bern2,10067319_8
3,ncbitaxon:3369,Cryptomeria japonica,mondo:0005046,bern2,10067319_8
4,ncbitaxon:3369,Cryptomeria japonica,mondo:0024623,bern2,10094290_4


### Create plant-disease binary matrix 

In [5]:
# Distinct plant and disease identifiers found in the association table
plants, diseases = (
    set(plant_disease_df[column].values.tolist())
    for column in ('plant_curie', 'disease_curie')
)
len(plants), len(diseases)

(6048, 2205)

In [6]:
# Distinct plant and disease identifiers found in the collapsed table
plants_collapsed, diseases_collapsed = (
    set(collapsed_plant_disease_df[column].values.tolist())
    for column in ('plant_curie', 'disease_curie')
)
len(plants_collapsed), len(diseases_collapsed)

(5636, 23)

In [7]:
## Initialize empty matrices: one row per plant, one column per disease.
# The identifier sets are sorted before being used as labels because the
# iteration order of a set of strings can change between interpreter sessions;
# sorting makes the row/column order reproducible on every run.
# `key=str` keeps the sort from failing if a label is not a string (e.g. NaN).
plant_disease_matrix = pd.DataFrame(
    index=sorted(plants, key=str),
    columns=sorted(diseases, key=str),
)
plant_disease_collapsed_matrix = pd.DataFrame(
    index=sorted(plants_collapsed, key=str),
    columns=sorted(diseases_collapsed, key=str),
)

In [8]:
from tqdm import tqdm

In [9]:
# Mark every (plant, disease) pair listed in the association table with a 1;
# tqdm reports progress over the rows.
for plant, disease in tqdm(plant_disease_df.loc[:, ['plant_curie', 'disease_curie']].values):
    plant_disease_matrix.loc[plant, disease] = 1

100%|████████████████████████████████████████████████| 97066/97066 [00:06<00:00, 14535.24it/s]


In [10]:
# Mark every (plant, disease) pair listed in the collapsed table with a 1;
# tqdm reports progress over the rows.
for plant, disease in tqdm(collapsed_plant_disease_df.loc[:, ['plant_curie', 'disease_curie']].values):
    plant_disease_collapsed_matrix.loc[plant, disease] = 1

100%|██████████████████████████████████████████████| 172702/172702 [00:11<00:00, 15183.69it/s]


### Calculate sparsity of the two matrices

In [11]:
# (rows, columns) of the original and the collapsed matrix
tuple(matrix.shape for matrix in (plant_disease_matrix, plant_disease_collapsed_matrix))

((6048, 2205), (5636, 23))

In [12]:
# Cells that were never set to 1 are still NaN; replace them with 0.
# The result is rebound instead of mutated in place, and cast explicitly so the
# matrices hold integers regardless of whether `fillna` itself converts the
# object-dtype columns (that automatic conversion differs between pandas versions).
plant_disease_matrix = plant_disease_matrix.fillna(0).astype(int)
plant_disease_collapsed_matrix = plant_disease_collapsed_matrix.fillna(0).astype(int)

In [13]:
def count_zeros(df: pd.DataFrame) -> int:
    """Count the entries of a dataframe that are equal to 0.

    The comparison is applied to the whole frame at once, so every cell is
    counted exactly once even when column labels are repeated. Missing values
    (NaN) do not compare equal to 0 and are therefore not counted.

    :param df: dataframe whose cells are compared against 0
    :return: total number of cells equal to 0, as a plain ``int``
    """
    # `df == 0` is a boolean frame; summing its underlying array counts the True cells.
    return int((df == 0).values.sum())

In [14]:
# Number of zero entries in the original and in the collapsed matrix
zeros_in_matrix, zeros_in_collapsed_matrix = (
    count_zeros(matrix)
    for matrix in (plant_disease_matrix, plant_disease_collapsed_matrix)
)

In [15]:
# Sparsity of the original matrix: zero cells divided by the total number of cells
zeros_in_matrix / plant_disease_matrix.size

0.9970544037720909

In [16]:
# Sparsity of the collapsed matrix: zero cells divided by the total number of cells
zeros_in_collapsed_matrix / plant_disease_collapsed_matrix.size

0.7724565680254266

In [17]:
# Difference in sparsity (original minus collapsed), computed from the zero
# counts and matrix sizes instead of numbers copied from earlier cell outputs,
# so it stays correct when the input data changes.
(
    zeros_in_matrix / plant_disease_matrix.size
    - zeros_in_collapsed_matrix / plant_disease_collapsed_matrix.size
)

0.22459783574666425