In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Root of the thesis working directory; every input and output path below is built from it.
# Set the MSCTHESIS_WRKDIR environment variable to run the notebook on another machine;
# when it is unset, the original local path is used, so existing runs are unaffected.
data_dir = os.environ.get("MSCTHESIS_WRKDIR", "/Users/cheesemania/PycharmProjects/mscthesis_wrkdir")

## Comparing sample counts before and after rarefaction at 3000

This was done before excluding H1N1 samples (Health Status = H1N1, Disease Level 2 = COVID-19) in PRJNA637034.

This comparison led to the exclusion of four projects whose sample size after rarefaction was too small for downstream analysis.

In [34]:
# Columns in these text files are separated by varying amounts of whitespace,
# so both are parsed with a whitespace regex as the separator
whitespace_table_opts = {'sep': r'\s+', 'engine': 'python'}
data_before_rar3000 = pd.read_csv(f'{data_dir}/metadata/before-rarefaction3000.txt', **whitespace_table_opts)
data_after_rar3000 = pd.read_csv(f'{data_dir}/metadata/after-rarefaction3000.txt', **whitespace_table_opts)

In [35]:
# Preview the first rows of the post-rarefaction count table
data_after_rar3000.head()

Unnamed: 0,Project_ID,Disease_Level_2,Case_Control,sample_count
0,PRJDB13192,COVID-19,Case,73
1,PRJDB13192,COVID-19,Control,16
2,PRJEB35665,Melanoma,Case,19
3,PRJEB35665,Melanoma,Control,17
4,PRJEB38917,Obesity,Case,7


In [36]:
# Label each table's count column with its stage so both survive the merge
data_before_rar3000 = data_before_rar3000.rename({'sample_count': 'sample_count_before'}, axis='columns')
data_after_rar3000 = data_after_rar3000.rename({'sample_count': 'sample_count_after'}, axis='columns')

# Left-join on project / disease / case-control group so every "before" row is kept.
# Groups without a row after rarefaction get a count of 0, stored as an integer.
merged_df_rar3000 = (
    data_before_rar3000
    .merge(data_after_rar3000, on=['Project_ID', 'Disease_Level_2', 'Case_Control'], how='left')
    .fillna({'sample_count_after': 0})
    .astype({'sample_count_after': int})
)

In [37]:
# Display the merged before/after sample-count table
merged_df_rar3000

Unnamed: 0,Project_ID,Disease_Level_2,Case_Control,sample_count_before,sample_count_after
0,PRJDB13192,COVID-19,Case,76,73
1,PRJDB13192,COVID-19,Control,24,16
2,PRJEB35665,Melanoma,Case,20,19
3,PRJEB35665,Melanoma,Control,17,17
4,PRJEB38917,Obesity,Case,32,7
5,PRJEB38917,Obesity,Control,9,2
6,PRJEB38930,AN,Case,94,70
7,PRJEB38930,AN,Control,62,43
8,PRJEB42375,IBS,Case,80,80
9,PRJEB42375,IBS,Control,64,64


In [45]:
# Sanity check - OK
# Number of distinct Project_ID values in the merged table (a missing value would count as one, as with unique())
merged_df_rar3000['Project_ID'].nunique(dropna=False)

25

In [39]:
# Save the before/after comparison as a tab-separated file without the index column
rar3000_counts_tsv = f'{data_dir}/metadata/merged-before-after-rarefaction3000.tsv'
merged_df_rar3000.to_csv(rar3000_counts_tsv, sep='\t', index=False)

### Observed features: non-filtered, filt-nonresident, and only the non-resident (rar3000)

In [78]:
# Observed-features exports (tab-separated) for the three feature tables:
# non-filtered, filt-nonresident and nonresident
df_non_filtered_of_rar3000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/alpha-diversity-results-non-filtered/observed-features-group-significance.tsv')

df_filt_nonresident_of_rar3000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/alpha-diversity-results-filt-nonresident/observed-features-group-significance.tsv')

df_nonresident_of_rar3000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/alpha-diversity-results-nonresident/observed-features-group-significance.tsv')

In [79]:
# (rows, columns) of the non-filtered observed-features table
df_non_filtered_of_rar3000.shape

(1874, 20)

In [80]:
# (rows, columns) of the filt-nonresident observed-features table
df_filt_nonresident_of_rar3000.shape

(1872, 20)

In [81]:
# (rows, columns) of the nonresident-only observed-features table
df_nonresident_of_rar3000.shape

(1088, 20)

In [82]:
# Preview the first rows of the non-filtered observed-features table
df_non_filtered_of_rar3000.head()

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,GeoLoc_Continent,GeoLoc_Country,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,observed_features
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,32
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,10
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,14
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,37
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,8


In [83]:
# List the column names: sample id, metadata columns, and the observed_features value
df_non_filtered_of_rar3000.columns.tolist()

['id',
 'Project_ID',
 'Sequencing_Region',
 'Case_Control',
 'Disease_Level_1',
 'Health_Status',
 'Disease_Level_2',
 'Disease_Level_3',
 'Disease_Subgroup',
 'IBD_Subgroup',
 'GeoLoc_Continent',
 'GeoLoc_Country',
 'GeoLoc_Region',
 'GeoLoc_City',
 'Host_Sex',
 'Host_Age_Category',
 'Host_Age_Decade',
 'Host_Age',
 'Age_Detail_Level',
 'observed_features']

In [84]:
# Give each table's observed-features column a name that says which feature table it came from
df_non_filtered_of_rar3000 = df_non_filtered_of_rar3000.rename(columns={'observed_features': 'observed_features_non_filtered'})
df_filt_nonresident_of_rar3000 = df_filt_nonresident_of_rar3000.rename(columns={'observed_features': 'observed_features_filt_nonresident'})
df_nonresident_of_rar3000 = df_nonresident_of_rar3000.rename(columns={'observed_features': 'observed_features_nonresident'})

# Left-join on the sample id so every row of the non-filtered table is kept.
# Only the id and the value column are taken from the right-hand tables: joining on all
# metadata columns would turn any metadata or dtype mismatch between the exports into a
# non-match, which the fill step below would then silently report as 0 features.
# validate='one_to_one' raises if a sample id is duplicated instead of multiplying rows.
merged_df_transition_of_rar3000 = pd.merge(
    df_non_filtered_of_rar3000,
    df_filt_nonresident_of_rar3000[['id', 'observed_features_filt_nonresident']],
    on='id',
    how='left',
    validate='one_to_one',
)

# Samples missing from a filtered table have no value after the join:
# record them as 0 observed features and store both columns as integers
merged_df_observed_features_rar3000 = (
    merged_df_transition_of_rar3000
    .merge(
        df_nonresident_of_rar3000[['id', 'observed_features_nonresident']],
        on='id',
        how='left',
        validate='one_to_one',
    )
    .fillna({'observed_features_filt_nonresident': 0, 'observed_features_nonresident': 0})
    .astype({'observed_features_filt_nonresident': int, 'observed_features_nonresident': int})
)

In [85]:
# Display the combined observed-features table (one value column per feature table)
merged_df_observed_features_rar3000

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,...,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,observed_features_non_filtered,observed_features_filt_nonresident,observed_features_nonresident
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,32,31,1
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,10,9,1
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,14,13,1
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,37,34,3
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,8,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,SRR18268067,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,32,29,3
1870,SRR18268068,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,56,52,4
1871,SRR18268069,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,139,135,4
1872,SRR18268070,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,18,16,2


In [86]:
# Save the combined observed-features table as a tab-separated file without the index column
observed_features_rar3000_tsv = f'{data_dir}/metadata/combined_observed_features_rar3000.tsv'
merged_df_observed_features_rar3000.to_csv(observed_features_rar3000_tsv, sep='\t', index=False)

### Shannon index: non-filtered, filt-nonresident, and only the non-resident (rar3000)

In [98]:
# Shannon exports (tab-separated) for the three feature tables:
# non-filtered, filt-nonresident and nonresident
df_non_filtered_shannon_rar3000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/alpha-diversity-results-non-filtered/shannon-group-significance.tsv')

df_filt_nonresident_shannon_rar3000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/alpha-diversity-results-filt-nonresident/shannon-group-significance.tsv')

df_nonresident_shannon_rar3000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/alpha-diversity-results-nonresident/shannon-group-significance.tsv')

In [99]:
# (rows, columns) of the non-filtered Shannon table
df_non_filtered_shannon_rar3000.shape

(1874, 20)

In [100]:
# (rows, columns) of the filt-nonresident Shannon table
df_filt_nonresident_shannon_rar3000.shape

(1872, 20)

In [101]:
# (rows, columns) of the nonresident-only Shannon table
df_nonresident_shannon_rar3000.shape

(1088, 20)

In [102]:
# Give each table's Shannon column a name that says which feature table it came from
df_non_filtered_shannon_rar3000 = df_non_filtered_shannon_rar3000.rename(columns={'shannon_entropy': 'shannon_entropy_non_filtered'})
df_filt_nonresident_shannon_rar3000 = df_filt_nonresident_shannon_rar3000.rename(columns={'shannon_entropy': 'shannon_entropy_filt_nonresident'})
df_nonresident_shannon_rar3000 = df_nonresident_shannon_rar3000.rename(columns={'shannon_entropy': 'shannon_entropy_nonresident'})

# Left-join on the sample id so every row of the non-filtered table is kept.
# Only the id and the value column are taken from the right-hand tables: joining on all
# metadata columns would turn any metadata or dtype mismatch between the exports into a
# non-match, which the fill step below would then silently report as an entropy of 0.
# validate='one_to_one' raises if a sample id is duplicated instead of multiplying rows.
merged_df_transition_shannon_rar3000 = pd.merge(
    df_non_filtered_shannon_rar3000,
    df_filt_nonresident_shannon_rar3000[['id', 'shannon_entropy_filt_nonresident']],
    on='id',
    how='left',
    validate='one_to_one',
)

# Samples missing from a filtered table have no value after the join: fill them with 0.
# NOTE(review): a filled 0 cannot be told apart from a genuine entropy of 0 - confirm this is intended.
merged_df_shannon_rar3000 = (
    merged_df_transition_shannon_rar3000
    .merge(
        df_nonresident_shannon_rar3000[['id', 'shannon_entropy_nonresident']],
        on='id',
        how='left',
        validate='one_to_one',
    )
    .fillna({'shannon_entropy_filt_nonresident': 0, 'shannon_entropy_nonresident': 0})
)

In [103]:
# Display the combined Shannon table (one value column per feature table)
merged_df_shannon_rar3000

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,...,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,shannon_entropy_non_filtered,shannon_entropy_filt_nonresident,shannon_entropy_nonresident
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,3.680774,3.589150,0.000000
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,1.770000,1.546965,0.000000
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,0.252146,0.240979,0.000000
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,0.948265,0.901563,1.158939
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,0.910435,0.906406,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,SRR18268067,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,2.655323,2.544278,0.365099
1870,SRR18268068,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,2.814774,2.777367,1.610577
1871,SRR18268069,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,4.777275,4.739929,1.675143
1872,SRR18268070,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,0.931249,0.842526,0.591673


In [104]:
# Inner-join the observed-features and Shannon tables on every column they have in common,
# keeping only rows present in both.
# NOTE(review): this rebinds merged_df_rar3000, which earlier held the before/after sample-count table.
shared_columns = merged_df_observed_features_rar3000.columns.intersection(merged_df_shannon_rar3000.columns).tolist()
merged_df_rar3000 = merged_df_observed_features_rar3000.merge(merged_df_shannon_rar3000, on=shared_columns, how='inner')

In [105]:
# Display the combined observed-features + Shannon table
merged_df_rar3000

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,...,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,observed_features_non_filtered,observed_features_filt_nonresident,observed_features_nonresident,shannon_entropy_non_filtered,shannon_entropy_filt_nonresident,shannon_entropy_nonresident
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,32,31,1,3.680774,3.589150,0.000000
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,10,9,1,1.770000,1.546965,0.000000
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,14,13,1,0.252146,0.240979,0.000000
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,37,34,3,0.948265,0.901563,1.158939
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,8,7,1,0.910435,0.906406,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,SRR18268067,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,32,29,3,2.655323,2.544278,0.365099
1870,SRR18268068,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,56,52,4,2.814774,2.777367,1.610577
1871,SRR18268069,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,139,135,4,4.777275,4.739929,1.675143
1872,SRR18268070,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,18,16,2,0.931249,0.842526,0.591673


In [106]:
# Save the combined alpha-diversity table as a tab-separated file without the index column
alpha_diversity_rar3000_tsv = f'{data_dir}/metadata/combined_alpha_diversity_rar3000.tsv'
merged_df_rar3000.to_csv(alpha_diversity_rar3000_tsv, sep='\t', index=False)

## Comparing sample counts before and after rarefaction at 6000
(rar6000 was not used in the end)

This was done after excluding H1N1 samples (Health Status = H1N1, Disease Level 2 = COVID-19) in PRJNA637034.

This comparison led to the exclusion of five projects whose sample size after rarefaction was too small for downstream analysis.

In [108]:
# Case/control sample metadata (tab-separated) with the H1N1 samples removed, per the file name
df_case_control = pd.read_table(f'{data_dir}/metadata/case_control_metadata_remove_h1n1.tsv')

In [109]:
# (rows, columns) of the case/control metadata
df_case_control.shape

(2383, 19)

In [110]:
# Preview the first rows of the case/control metadata
df_case_control.head()

Unnamed: 0,SampleID,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,GeoLoc_Continent,GeoLoc_Country,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2


In [111]:
# Number of metadata rows per project / disease / case-control group
group_sizes_before = df_case_control.groupby(['Project_ID', 'Disease_Level_2', 'Case_Control']).size()
case_control_per_project_before = group_sizes_before.reset_index(name='sample_count')

print(case_control_per_project_before)

               Project_ID        Disease_Level_2 Case_Control  sample_count
0              PRJDB13192               COVID-19         Case            76
1              PRJDB13192               COVID-19      Control            24
2              PRJEB35665               Melanoma         Case            20
3              PRJEB35665               Melanoma      Control            17
4              PRJEB38917                Obesity         Case            32
5              PRJEB38917                Obesity      Control             9
6              PRJEB38930                     AN         Case            94
7              PRJEB38930                     AN      Control            62
8              PRJEB42375                    IBS         Case            80
9              PRJEB42375                    IBS      Control            64
10             PRJEB46343                    HIV         Case            24
11             PRJEB46343                    HIV      Control            12
12          

In [127]:
# Observed-features export (tab-separated) from the remove-h1n1 rar6000 core-metrics results
df_obf_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/core-metrics-results-remove-h1n1-rar6000/observed-features-group-significance.tsv')

In [128]:
# (rows, columns) of the rar6000 observed-features table
df_obf_rar6000.shape

(1699, 20)

In [129]:
# Preview the first rows of the rar6000 observed-features table
df_obf_rar6000.head()

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,GeoLoc_Continent,GeoLoc_Country,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,observed_features
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,32
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,9
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,18
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,41
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),Europe,Poland,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,14


In [130]:
# Number of rows in the rar6000 table per project / disease / case-control group
group_sizes_after = df_obf_rar6000.groupby(['Project_ID', 'Disease_Level_2', 'Case_Control']).size()
case_control_per_project_after = group_sizes_after.reset_index(name='sample_count')

print(case_control_per_project_after)

               Project_ID        Disease_Level_2 Case_Control  sample_count
0              PRJDB13192               COVID-19         Case            68
1              PRJDB13192               COVID-19      Control            11
2              PRJEB35665               Melanoma         Case            17
3              PRJEB35665               Melanoma      Control            16
4              PRJEB38917                Obesity         Case             3
5              PRJEB38917                Obesity      Control             1
6              PRJEB38930                     AN         Case            60
7              PRJEB38930                     AN      Control            34
8              PRJEB42375                    IBS         Case            80
9              PRJEB42375                    IBS      Control            64
10             PRJEB46343                    HIV         Case            23
11             PRJEB46343                    HIV      Control            12
12          

In [131]:
# Both count tables are parsed with a whitespace regex as the column separator.
# NOTE(review): no visible cell writes these .txt files - presumably they were saved from
# the printed group counts; confirm where they come from.
whitespace_table_opts = {'sep': r'\s+', 'engine': 'python'}
data_before_rar6000 = pd.read_csv(f'{data_dir}/metadata/before-rarefaction6000.txt', **whitespace_table_opts)
data_after_rar6000 = pd.read_csv(f'{data_dir}/metadata/after-rarefaction6000.txt', **whitespace_table_opts)

In [132]:
# Label each table's count column with its stage so both survive the merge
data_before_rar6000 = data_before_rar6000.rename({'sample_count': 'sample_count_before'}, axis='columns')
data_after_rar6000 = data_after_rar6000.rename({'sample_count': 'sample_count_after'}, axis='columns')

# Left-join on project / disease / case-control group so every "before" row is kept.
# Groups without a row after rarefaction get a count of 0, stored as an integer.
merged_df_rar6000 = (
    data_before_rar6000
    .merge(data_after_rar6000, on=['Project_ID', 'Disease_Level_2', 'Case_Control'], how='left')
    .fillna({'sample_count_after': 0})
    .astype({'sample_count_after': int})
)

In [133]:
# Display the merged before/after sample-count table for rar6000
merged_df_rar6000

Unnamed: 0,Project_ID,Disease_Level_2,Case_Control,sample_count_before,sample_count_after
0,PRJDB13192,COVID-19,Case,76,68
1,PRJDB13192,COVID-19,Control,24,11
2,PRJEB35665,Melanoma,Case,20,17
3,PRJEB35665,Melanoma,Control,17,16
4,PRJEB38917,Obesity,Case,32,3
5,PRJEB38917,Obesity,Control,9,1
6,PRJEB38930,AN,Case,94,60
7,PRJEB38930,AN,Control,62,34
8,PRJEB42375,IBS,Case,80,80
9,PRJEB42375,IBS,Control,64,64


In [134]:
# Sanity check - OK
# Number of distinct Project_ID values in the merged table (a missing value would count as one, as with unique())
merged_df_rar6000['Project_ID'].nunique(dropna=False)

25

In [135]:
# Save the before/after comparison as a tab-separated file without the index column
rar6000_counts_tsv = f'{data_dir}/metadata/merged-before-after-rarefaction6000.tsv'
merged_df_rar6000.to_csv(rar6000_counts_tsv, sep='\t', index=False)

### Observed features: non-filtered, filt-nonresident, and only the non-resident (rar6000)

In [136]:
# Observed-features exports (tab-separated) under results-rar6000 for the three feature tables:
# non-filtered, filt-nonresident and nonresident
df_non_filtered_of_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/results-rar6000/alpha-diversity-results-non-filtered/observed-features-group-significance.tsv')

df_filt_nonresident_of_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/results-rar6000/alpha-diversity-results-filt-nonresident/observed-features-group-significance.tsv')

df_nonresident_of_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/results-rar6000/alpha-diversity-results-nonresident/observed-features-group-significance.tsv')

In [137]:
# (rows, columns) of the non-filtered observed-features table (rar6000)
df_non_filtered_of_rar6000.shape

(1672, 20)

In [138]:
# (rows, columns) of the filt-nonresident observed-features table (rar6000)
df_filt_nonresident_of_rar6000.shape

(1670, 20)

In [139]:
# (rows, columns) of the nonresident-only observed-features table (rar6000)
df_nonresident_of_rar6000.shape

(1066, 20)

In [140]:
# Give each table's observed-features column a name that says which feature table it came from
df_non_filtered_of_rar6000 = df_non_filtered_of_rar6000.rename(columns={'observed_features': 'observed_features_non_filtered'})
df_filt_nonresident_of_rar6000 = df_filt_nonresident_of_rar6000.rename(columns={'observed_features': 'observed_features_filt_nonresident'})
df_nonresident_of_rar6000 = df_nonresident_of_rar6000.rename(columns={'observed_features': 'observed_features_nonresident'})

# Left-join on the sample id so every row of the non-filtered table is kept.
# Only the id and the value column are taken from the right-hand tables: joining on all
# metadata columns would turn any metadata or dtype mismatch between the exports into a
# non-match, which the fill step below would then silently report as 0 features.
# validate='one_to_one' raises if a sample id is duplicated instead of multiplying rows.
merged_df_transition_of_rar6000 = pd.merge(
    df_non_filtered_of_rar6000,
    df_filt_nonresident_of_rar6000[['id', 'observed_features_filt_nonresident']],
    on='id',
    how='left',
    validate='one_to_one',
)

# Samples missing from a filtered table have no value after the join:
# record them as 0 observed features and store both columns as integers
merged_df_observed_features_rar6000 = (
    merged_df_transition_of_rar6000
    .merge(
        df_nonresident_of_rar6000[['id', 'observed_features_nonresident']],
        on='id',
        how='left',
        validate='one_to_one',
    )
    .fillna({'observed_features_filt_nonresident': 0, 'observed_features_nonresident': 0})
    .astype({'observed_features_filt_nonresident': int, 'observed_features_nonresident': int})
)

In [141]:
# Display the combined observed-features table for rar6000
merged_df_observed_features_rar6000

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,...,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,observed_features_non_filtered,observed_features_filt_nonresident,observed_features_nonresident
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,30,29,1
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,9,8,1
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,19,17,2
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,40,38,2
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,11,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,SRR18268065,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,64,60,4
1668,SRR18268067,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,32,29,3
1669,SRR18268069,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,163,158,5
1670,SRR18268070,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,20,18,2


### Shannon index: non-filtered, filt-nonresident, and only the non-resident (rar6000)

In [142]:
# Shannon exports (tab-separated) under results-rar6000 for the three feature tables:
# non-filtered, filt-nonresident and nonresident
df_non_filtered_shannon_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/results-rar6000/alpha-diversity-results-non-filtered/shannon-group-significance.tsv')

df_filt_nonresident_shannon_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/results-rar6000/alpha-diversity-results-filt-nonresident/shannon-group-significance.tsv')

df_nonresident_shannon_rar6000 = pd.read_table(f'{data_dir}/src-analysis/diversity-analysis/results-rar6000/alpha-diversity-results-nonresident/shannon-group-significance.tsv')

In [143]:
# (rows, columns) of the non-filtered Shannon table as loaded
df_non_filtered_shannon_rar6000.shape

(1672, 20)

In [144]:
# (rows, columns) of the filt-nonresident Shannon table as loaded
df_filt_nonresident_shannon_rar6000.shape

(1670, 20)

In [145]:
# (rows, columns) of the nonresident Shannon table as loaded
df_nonresident_shannon_rar6000.shape

(1066, 20)

In [146]:
# Give each table's Shannon column a table-specific name so the three values
# can sit side by side in one frame after merging.
df_non_filtered_shannon_rar6000 = df_non_filtered_shannon_rar6000.rename(columns={'shannon_entropy': 'shannon_entropy_non_filtered'})
df_filt_nonresident_shannon_rar6000 = df_filt_nonresident_shannon_rar6000.rename(columns={'shannon_entropy': 'shannon_entropy_filt_nonresident'})
df_nonresident_shannon_rar6000 = df_nonresident_shannon_rar6000.rename(columns={'shannon_entropy': 'shannon_entropy_nonresident'})

# Join key used by both merges below: the sample id plus every metadata column.
# Defined once so the two merges cannot drift apart.
shannon_merge_keys_rar6000 = [
    'id', 'Project_ID', 'Sequencing_Region', 'Case_Control', 'Disease_Level_1',
    'Health_Status', 'Disease_Level_2', 'Disease_Level_3', 'Disease_Subgroup',
    'IBD_Subgroup', 'GeoLoc_Continent', 'GeoLoc_Country', 'GeoLoc_Region',
    'GeoLoc_City', 'Host_Sex', 'Host_Age_Category', 'Host_Age_Decade',
    'Host_Age', 'Age_Detail_Level',
]

# Perform left merges to keep all rows from the "non_filtered" dataset;
# rows without a match in a right-hand table get NaN in that table's Shannon column.
merged_df_transition_shannon_rar6000 = pd.merge(
    df_non_filtered_shannon_rar6000,
    df_filt_nonresident_shannon_rar6000,
    on=shannon_merge_keys_rar6000,
    how='left',
)
merged_df_shannon_rar6000 = pd.merge(
    merged_df_transition_shannon_rar6000,
    df_nonresident_shannon_rar6000,
    on=shannon_merge_keys_rar6000,
    how='left',
)

# Fill missing values in the two right-hand Shannon entropy columns with 0.
# NOTE(review): 0 is also a valid Shannon entropy (a sample with a single
# feature), so after this fill a row that had no match is indistinguishable
# from a matched row whose entropy is 0 -- confirm this is intended.
for shannon_column in ('shannon_entropy_filt_nonresident', 'shannon_entropy_nonresident'):
    merged_df_shannon_rar6000[shannon_column] = merged_df_shannon_rar6000[shannon_column].fillna(0)

In [147]:
# Display the merged Shannon table (three Shannon entropy columns per row)
merged_df_shannon_rar6000

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,...,GeoLoc_Region,GeoLoc_City,Host_Sex,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,shannon_entropy_non_filtered,shannon_entropy_filt_nonresident,shannon_entropy_nonresident
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,3.659017,3.575652,0.000000
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,1.708475,1.492134,0.000000
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,0.245006,0.240422,1.000000
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,0.939716,0.890005,0.983376
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),...,Lesser Poland Voivodeship (Małopolskie),Krakow,Unknown,Unknown,Unknown,Unknown,L2,0.916610,0.916610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,SRR18268065,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,3.644144,3.484520,0.884344
1668,SRR18268067,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,2.661708,2.539531,0.288229
1669,SRR18268069,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,4.834299,4.795857,2.124692
1670,SRR18268070,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Beijing,Beijing,female,Unknown,Unknown,Unknown,L1,0.940242,0.851719,0.413817


In [148]:
# Combine the observed-features and Shannon tables, joining on every column
# name that appears in both; the inner join keeps only rows found in both tables.
shared_columns_rar6000 = merged_df_observed_features_rar6000.columns.intersection(
    merged_df_shannon_rar6000.columns
).tolist()
merged_df_rar6000 = merged_df_observed_features_rar6000.merge(
    merged_df_shannon_rar6000,
    on=shared_columns_rar6000,
    how='inner',
)

In [150]:
# Display the combined table (observed-features and Shannon entropy columns)
merged_df_rar6000

Unnamed: 0,id,Project_ID,Sequencing_Region,Case_Control,Disease_Level_1,Health_Status,Disease_Level_2,Disease_Level_3,Disease_Subgroup,IBD_Subgroup,...,Host_Age_Category,Host_Age_Decade,Host_Age,Age_Detail_Level,observed_features_non_filtered,observed_features_filt_nonresident,observed_features_nonresident,shannon_entropy_non_filtered,shannon_entropy_filt_nonresident,shannon_entropy_nonresident
0,S675,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,30,29,1,3.659017,3.575652,0.000000
1,S683,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,9,8,1,1.708475,1.492134,0.000000
2,S691,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,19,17,2,0.245006,0.240422,1.000000
3,S699,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,active_new_CD,"active, newly diagnosed Crohn's disease (CD)",Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,40,38,2,0.939716,0.890005,0.983376
4,S707,crohns-paper_37122605,ITS1,Case,Gastrointestinal,Inflammatory bowel disease (IBD),IBD,CD,clinical remission Crohn's disease (CD),Crohn's disease (CD),...,Unknown,Unknown,Unknown,L2,11,11,0,0.916610,0.916610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,SRR18268065,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,64,60,4,3.644144,3.484520,0.884344
1668,SRR18268067,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,32,29,3,2.661708,2.539531,0.288229
1669,SRR18268069,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,163,158,5,4.834299,4.795857,2.124692
1670,SRR18268070,PRJNA813503,ITS1,Case,Metabolic,Gestational diabetes mellitus (GDM),GDM,,,,...,Unknown,Unknown,Unknown,L1,20,18,2,0.940242,0.851719,0.413817


In [151]:
# Write the combined alpha-diversity table as a tab-separated file, without the row index.
combined_alpha_diversity_rar6000_path = f'{data_dir}/metadata/combined_alpha_diversity_rar6000.tsv'
merged_df_rar6000.to_csv(
    combined_alpha_diversity_rar6000_path,
    sep='\t',
    index=False,
)