# Exploratory Data Analysis for SDOH Domains

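This notebook performs exploratory data analysis (EDA) of the SDOH domains and items listed in the domain map (`reference/domain_map.tsv`). Items are compared across cohorts, where each cohort is defined by a value of the `survey` column in the combined dataset (`data/combined.tsv`).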

In [1]:
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt

# Plot defaults: seaborn "whitegrid" theme first, then a 12x8 default figure size
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': [12, 8]})

ModuleNotFoundError: No module named 'ydata_profiling'

## 1. Load and Preprocess Data

In [None]:
# Load domain map (tab-separated). Later cells read its 'Domain' and
# 'Column Name' columns to group dataset columns by domain.
domain_map = pd.read_csv('reference/domain_map.tsv', sep='\t')
print("Domain Map:")
display(domain_map.head())

# Load combined data (tab-separated). The 'survey' column assigns each row
# to a cohort.
combined_data = pd.read_csv('data/combined.tsv', sep='\t')
print("\nCombined Data:")
display(combined_data.head())

# Get unique cohorts. Missing 'survey' values are dropped first: NaN never
# compares equal to itself, so a NaN cohort label would select zero rows when
# the data is later filtered with `combined_data['survey'] == cohort`.
cohorts = combined_data['survey'].dropna().unique()
print(f"\nUnique cohorts: {cohorts}")

## 2. Generate Overall Profile Report

In [None]:
# Profile the full combined dataset and write the report to an HTML file
report_path = "sdoh_profile_report.html"
profile = ProfileReport(
    combined_data,
    title="SDOH Data Profiling Report",
    explorative=True,
)
profile.to_file(report_path)
print(f"Overall profile report generated and saved as '{report_path}'")

## 3. EDA by Domain Across Cohorts

In [None]:
def analyze_domain(domain, items):
    """Plot the distribution of each item in one domain, split by cohort.

    Reads the notebook-level ``combined_data`` frame. For every name in
    ``items`` that is a column of that frame, a histogram coloured by the
    'survey' column is shown. Numeric items additionally get a KDE overlay
    and a per-cohort box plot. Names that are not columns of the frame, and
    columns with no non-missing values, are reported and skipped.

    Parameters
    ----------
    domain : object
        Label printed in the section header.
    items : iterable of str
        Column names belonging to the domain.

    Returns
    -------
    None
        Output consists of printed text and displayed figures only.
    """
    print(f"\n--- Analysis for {domain} ---")

    for item in items:
        if item not in combined_data.columns:
            print(f"Column {item} not found in the dataset.")
            continue

        print(f"\nItem: {item}")

        values = combined_data[item]
        if not values.notna().any():
            # Nothing to draw; plotting an all-missing column is not useful
            print(f"Column {item} has no non-missing values; skipping plots.")
            continue

        # KDE and box plots need a numeric variable. Booleans count as numeric
        # for pandas but are categorical for plotting purposes.
        is_numeric = (
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
        )

        # Distribution plot (KDE overlay only where it is defined)
        sns.displot(data=combined_data, x=item, hue='survey', kde=is_numeric)
        plt.title(f'Distribution of {item} by Cohort')
        plt.show()
        plt.close()  # release the figure; this loop can create many of them

        if not is_numeric:
            print(f"Column {item} is not numeric; skipping box plot.")
            continue

        # Box plot
        sns.boxplot(data=combined_data, x='survey', y=item)
        plt.title(f'Box Plot of {item} by Cohort')
        plt.xticks(rotation=45)
        plt.show()
        plt.close()

# Run the per-domain analysis once for each 'Domain' group in the domain map,
# passing that domain's 'Column Name' entries as the items to plot
items_by_domain = domain_map.groupby('Domain')['Column Name']
for domain_name, domain_items in items_by_domain:
    analyze_domain(domain_name, domain_items)

## 4. EDA Within Each Cohort

In [None]:
def analyze_cohort(cohort_data, cohort_name):
    """Write a profiling HTML report for a single cohort.

    Parameters
    ----------
    cohort_data : pandas.DataFrame
        Rows belonging to the cohort.
    cohort_name : object
        Cohort label. It is converted to ``str`` before use, so non-string
        labels (e.g. integer codes) are accepted.

    Returns
    -------
    None
        The report is written to ``'<label lowercased>_profile_report.html'``
        in the working directory. Nothing is written when ``cohort_data``
        has no rows.
    """
    # Labels come straight from the data and are not guaranteed to be strings;
    # calling .lower() on a non-string label would raise AttributeError.
    label = str(cohort_name)
    print(f"\n=== Analysis for {label} Cohort ===")

    if cohort_data.empty:
        # An empty frame has nothing to profile
        print(f"No rows found for {label} cohort; skipping profile report.")
        return

    # Generate profile report for the cohort
    report_path = f"{label.lower()}_profile_report.html"
    cohort_profile = ProfileReport(cohort_data, title=f"{label} Cohort Profiling Report", explorative=True)
    cohort_profile.to_file(report_path)
    print(f"Profile report for {label} cohort generated and saved as '{report_path}'")

# Profile every cohort separately: select the rows whose 'survey' value
# equals the cohort label and hand that subset to analyze_cohort
for cohort in cohorts:
    in_cohort = combined_data['survey'] == cohort
    cohort_data = combined_data.loc[in_cohort]
    analyze_cohort(cohort_data, cohort)

## 5. Highlight Differences Between Cohorts

In [None]:
def highlight_differences(alpha=0.05):
    """Test every mapped item for between-cohort differences and report them.

    Reads the notebook-level ``domain_map`` and ``combined_data`` frames.
    For each numeric item that exists in ``combined_data``, a one-way ANOVA
    is run across the groups defined by the 'survey' column. When the p-value
    is below ``alpha``, Tukey's HSD pairwise comparison is printed and a
    per-cohort box plot is shown.

    Rows with a missing item value or missing 'survey' label are excluded
    from the tests for that item. Items that are not numeric, or that have
    fewer than two cohorts with data, are reported and skipped.

    NOTE: p-values are not adjusted for the number of items tested.

    Parameters
    ----------
    alpha : float, default 0.05
        Significance threshold for the ANOVA and family-wise error rate
        passed to Tukey's HSD.

    Returns
    -------
    None
        Output consists of printed text and displayed figures only.
    """
    # Imported once per call instead of on every loop iteration
    from scipy import stats
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    print("\n=== Highlighting Differences Between Cohorts ===")

    for domain, items in domain_map.groupby('Domain')['Column Name']:
        print(f"\n--- {domain} ---")

        for item in items:
            if item not in combined_data.columns:
                print(f"Column {item} not found in the dataset.")
                continue

            print(f"\nItem: {item}")

            column = combined_data[item]
            if (not pd.api.types.is_numeric_dtype(column)
                    or pd.api.types.is_bool_dtype(column)):
                print("Item is not numeric; skipping ANOVA.")
                continue

            # Missing values would propagate into the test statistics as NaN,
            # so keep only rows where both the value and the cohort are known.
            observed = column.notna() & combined_data['survey'].notna()
            values = column[observed]
            labels = combined_data.loc[observed, 'survey']

            # ANOVA test
            groups = [grp for _, grp in values.groupby(labels) if len(grp) > 0]
            if len(groups) < 2:
                print("Fewer than two cohorts have data for this item; skipping ANOVA.")
                continue

            f_value, p_value = stats.f_oneway(*groups)
            print(f"ANOVA test - F-value: {f_value:.4f}, p-value: {p_value:.4f}")

            if pd.isna(p_value):
                # e.g. constant values or one observation per cohort; a NaN
                # p-value must not be reported as "no significant difference"
                print("ANOVA result is undefined for this item; no conclusion drawn.")
                continue

            if p_value < alpha:
                print("There is a significant difference between cohorts.")

                # Pairwise comparisons
                tukey_results = pairwise_tukeyhsd(values, labels, alpha=alpha)
                print("\nTukey's test results:")
                print(tukey_results)

                # Visualize differences
                sns.boxplot(x='survey', y=item, data=combined_data)
                plt.title(f'Differences in {item} Across Cohorts')
                plt.xticks(rotation=45)
                plt.show()
                plt.close()  # release the figure; this loop can create many of them
            else:
                print("No significant difference between cohorts.")

# Run the between-cohort comparison for every item listed in the domain map
highlight_differences()

## 6. Summary and Conclusions

Based on the analysis above, we can draw the following conclusions:

1. [Add your conclusions here based on the EDA results]
2. [Highlight key findings and differences between cohorts]
3. [Discuss any patterns or trends observed in the data]
4. [Suggest areas for further investigation or analysis]

For detailed insights, please refer to the generated HTML reports:
- Overall profile report: 'sdoh_profile_report.html'
- Cohort-specific reports: '[cohort_name]_profile_report.html' (one file per cohort, with the cohort name lowercased)