### This metadata filtering step prepares the input for `sample-classifier classify-samples`

In [1]:
import os

import pandas as pd

In [3]:
# Root of the working directory that contains the `metadata/` folder.
# Set the MSCTHESIS_DATA_DIR environment variable to run this notebook on
# another machine; when it is unset or empty, the original local path is used.
data_dir = (
    os.environ.get("MSCTHESIS_DATA_DIR")
    or "/Users/cheesemania/PycharmProjects/mscthesis_wrkdir"
)

In [4]:
# Load the tab-separated case/control sample metadata from the working directory.
df_original = pd.read_csv(
    f'{data_dir}/metadata/case_control_metadata_remove_h1n1.tsv',
    sep='\t',
)

In [5]:
# Dimensions (rows, columns) of the metadata table as loaded.
df_original.shape

(2383, 19)

In [6]:
# Number of distinct (non-missing) Project_ID values in the loaded table.
df_original['Project_ID'].nunique()

25

In [8]:
# Count the samples in every (project, case/control) pair of the loaded table.
grouped_case_control_sample_count_per_project = (
    df_original
    .groupby(['Project_ID', 'Case_Control'])
    .agg(sample_count=pd.NamedAgg(column='SampleID', aggfunc='size'))
    .reset_index()
)

print(grouped_case_control_sample_count_per_project)

               Project_ID Case_Control  sample_count
0              PRJDB13192         Case            76
1              PRJDB13192      Control            24
2              PRJEB35665         Case            20
3              PRJEB35665      Control            17
4              PRJEB38917         Case            32
5              PRJEB38917      Control             9
6              PRJEB38930         Case            94
7              PRJEB38930      Control            62
8              PRJEB42375         Case            80
9              PRJEB42375      Control            64
10             PRJEB46343         Case            24
11             PRJEB46343      Control            12
12             PRJEB53019         Case            17
13             PRJEB53019      Control            11
14            PRJNA419104         Case            34
15            PRJNA419104      Control            23
16            PRJNA517994         Case            91
17            PRJNA517994      Control        

#### Excluding 6 projects due to inadequate sample sizes in total/per case-control group
As a rule of thumb for the sample classifier, approximately 50 samples in total should be provided at a minimum. Categorical metadata columns that are used as classifier targets should have a minimum of 10 samples per unique value (before rarefaction).

PRJNA637034 - TBD
Because only 1 sample remains after rarefaction, this project is excluded from the ML analysis.

In [13]:
# Step-1 exclusion list: the projects removed from the table before further checks.
projects_to_exclude_step1 = [
    'PRJEB35665',
    'PRJEB38917',
    'PRJEB46343',
    'PRJEB53019',
    'PRJNA557226',
    'PRJNA751473',
]
# Keep only the rows whose Project_ID is not on the exclusion list.
df_original_exclude_step1 = df_original.loc[
    ~df_original['Project_ID'].isin(projects_to_exclude_step1)
]

In [14]:
# Dimensions (rows, columns) after dropping the step-1 excluded projects.
df_original_exclude_step1.shape

(2180, 19)

In [15]:
# Number of distinct (non-missing) Project_ID values left after the step-1 exclusion.
df_original_exclude_step1['Project_ID'].nunique()

19

In [16]:
# Recompute the per-project case/control sample counts on the filtered table.
update_grouped_case_control_sample_count_per_project = (
    df_original_exclude_step1
    .groupby(['Project_ID', 'Case_Control'])
    .agg(sample_count=('SampleID', 'size'))
    .reset_index()
)

print(update_grouped_case_control_sample_count_per_project)

               Project_ID Case_Control  sample_count
0              PRJDB13192         Case            76
1              PRJDB13192      Control            24
2              PRJEB38930         Case            94
3              PRJEB38930      Control            62
4              PRJEB42375         Case            80
5              PRJEB42375      Control            64
6             PRJNA419104         Case            34
7             PRJNA419104      Control            23
8             PRJNA517994         Case            91
9             PRJNA517994      Control            11
10            PRJNA590898         Case            36
11            PRJNA590898      Control            18
12            PRJNA607176         Case            34
13            PRJNA607176      Control            47
14            PRJNA637034         Case           106
15            PRJNA637034      Control            46
16            PRJNA647266         Case            90
17            PRJNA647266      Control        

#### Sample size balance

In [18]:
# One row per project, one count column per Case_Control value; a group that
# does not occur in a project is filled with 0.
sample_size_balance = (
    df_original_exclude_step1
    .groupby(['Project_ID', 'Case_Control'])
    .size()
    .unstack(fill_value=0)
)

# Imbalance of a project = |Case - Control|.
sample_size_balance['Difference'] = (
    sample_size_balance['Case'].sub(sample_size_balance['Control']).abs()
)

# Rank every project from the most to the least imbalanced (all rows are shown).
sample_size_balance.sort_values('Difference', ascending=False)

Case_Control,Case,Control,Difference
Project_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PRJNA703732,137,18,119
PRJNA517994,91,11,80
PRJNA791216,99,30,69
crohns-paper_37122605,105,39,66
PRJNA698272,78,16,62
PRJNA637034,106,46,60
PRJNA647266,90,150,60
PRJDB13192,76,24,52
PRJNA756382,103,58,45
PRJEB38930,94,62,32


#### Age, Sex, and Geoloc_Country Matching

In [19]:
from scipy.stats import chi2_contingency

def chi_square_test(df, column):
    """Chi-square test of independence between ``Case_Control`` and ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``Case_Control`` column and the column named by
        ``column``.
    column : str
        Name of the categorical column cross-tabulated against ``Case_Control``.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.chi2_contingency`` for the
        ``Case_Control`` x ``column`` contingency table, or ``nan`` when the
        table is empty and no test can be computed.
    """
    contingency_table = pd.crosstab(df['Case_Control'], df[column])

    # An empty table (e.g. `column` has no non-missing values) makes
    # chi2_contingency raise ValueError; report "no result" instead so one
    # such column does not abort a loop over many projects.
    if contingency_table.size == 0:
        return float('nan')

    # Element 1 of the result is the p-value; the statistic, degrees of freedom
    # and expected frequencies are not used.
    return chi2_contingency(contingency_table)[1]

# Run the Case-vs-Control chi-square checks project by project and collect one
# record per project that qualifies.
projects_analysis = []
for project_id in df_original_exclude_step1['Project_ID'].unique():
    project_data = df_original_exclude_step1.loc[
        df_original_exclude_step1['Project_ID'] == project_id
    ]

    # Skip projects that do not have exactly two distinct Case_Control values.
    if len(project_data['Case_Control'].unique()) != 2:
        continue

    projects_analysis.append({
        'Project_ID': project_id,
        'Case_Count': project_data['Case_Control'].eq('Case').sum(),
        'Control_Count': project_data['Case_Control'].eq('Control').sum(),
        'Age_Match_p_value': chi_square_test(project_data, 'Host_Age_Decade'),
        'Sex_Match_p_value': chi_square_test(project_data, 'Host_Sex'),
        'Geo_Match_p_value': chi_square_test(project_data, 'GeoLoc_Country'),
    })

In [20]:
# Turn the per-project records into a table, one row per analysed project.
analysis_df = pd.DataFrame(projects_analysis)

# A covariate is flagged "Matched" when its chi-square p-value exceeds 0.05,
# i.e. no significant Case/Control difference was detected at that threshold.
analysis_df['Age_Matched'] = analysis_df['Age_Match_p_value'].gt(0.05)
analysis_df['Sex_Matched'] = analysis_df['Sex_Match_p_value'].gt(0.05)
analysis_df['Geo_Matched'] = analysis_df['Geo_Match_p_value'].gt(0.05)

# Compact view: group sizes plus the three matched flags (raw p-values omitted).
analysis_df.loc[
    :,
    ['Project_ID', 'Case_Count', 'Control_Count', 'Age_Matched', 'Sex_Matched', 'Geo_Matched'],
]

Unnamed: 0,Project_ID,Case_Count,Control_Count,Age_Matched,Sex_Matched,Geo_Matched
0,crohns-paper_37122605,105,39,True,True,True
1,PRJDB13192,76,24,True,True,True
2,PRJEB38930,94,62,False,False,True
3,PRJEB42375,80,64,True,True,True
4,PRJNA419104,34,23,False,True,True
5,PRJNA517994,91,11,True,True,False
6,PRJNA590898,36,18,False,False,True
7,PRJNA607176,34,47,True,True,True
8,PRJNA637034,106,46,True,True,True
9,PRJNA647266,90,150,True,True,True


In [22]:
# Full summary table: raw p-values alongside the matched flags.
analysis_df

Unnamed: 0,Project_ID,Case_Count,Control_Count,Age_Match_p_value,Sex_Match_p_value,Geo_Match_p_value,Age_Matched,Sex_Matched,Geo_Matched
0,crohns-paper_37122605,105,39,1.0,1.0,1.0,True,True,True
1,PRJDB13192,76,24,1.0,1.0,1.0,True,True,True
2,PRJEB38930,94,62,6.825276e-35,6.825276e-35,1.0,False,False,True
3,PRJEB42375,80,64,1.0,1.0,1.0,True,True,True
4,PRJNA419104,34,23,6.183117e-07,0.2902398,1.0,False,True,True
5,PRJNA517994,91,11,0.136338,0.4360838,2.795095e-15,True,True,False
6,PRJNA590898,36,18,2.095826e-10,1.879529e-12,1.0,False,False,True
7,PRJNA607176,34,47,1.0,1.0,1.0,True,True,True
8,PRJNA637034,106,46,1.0,1.0,1.0,True,True,True
9,PRJNA647266,90,150,1.0,1.0,1.0,True,True,True


In [25]:
# Write the matching summary as a tab-separated file without the row index;
# missing values are written as "NA".
analysis_df.to_csv(
    f'{data_dir}/metadata/case_control_matching_summary.tsv',
    sep='\t',
    na_rep='NA',
    index=False,
)