Reference (background on outliers in statistics): https://www.indeed.com/career-advice/career-development/outliers-statistics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Load the logistic regression results table from the working directory.
HS_data = pd.read_csv('logistic_regression_results.csv')

# Calculate significance
# A row is flagged (1) when its p-value is below 0.05 divided by the number
# of rows in the table; otherwise it gets 0. Rows with a missing p-value
# compare as False and therefore get 0.
N = len(HS_data)
below_cutoff = HS_data['Pvalue'] < 0.05/N
HS_data['is_significant'] = below_cutoff.astype(int)

print(f"Original HS data: {HS_data.shape}")
n_flagged = HS_data['is_significant'].sum()
print(f"Significant observations: {n_flagged}")

# --- Visualization: Original Beta Distribution ---
print("\nCreating visualizations...")

# Histogram (30 bins, KDE overlay) of Beta across the whole table; the figure
# is written to a PNG and then displayed.
fig_original, ax_original = plt.subplots(figsize=(8, 6))
sns.histplot(HS_data['Beta'], bins=30, kde=True, ax=ax_original)
ax_original.set_title('Original Beta Distribution')
ax_original.set_xlabel('Beta Value')
ax_original.set_ylabel('Frequency')
fig_original.savefig('beta_distribution_original.png', dpi=300, bbox_inches='tight')
plt.show()

# --- filter for significant and positive beta values ---
print("\nFiltering for significant and positive beta values...")

# Keep only rows flagged as significant whose Beta is strictly greater than 0.
flagged_rows = HS_data['is_significant'] == 1
positive_beta_rows = HS_data['Beta'] > 0
HS_data_filtered = HS_data.loc[flagged_rows & positive_beta_rows]

print(f"Filtered data: {HS_data_filtered.shape}")

# --- Visualization: Filtered Beta Distribution ---
# Same histogram layout as the original-distribution plot (30 bins, KDE
# overlay), restricted to the rows kept in HS_data_filtered.
fig_filtered, ax_filtered = plt.subplots(figsize=(8, 6))
sns.histplot(HS_data_filtered['Beta'], bins=30, kde=True, ax=ax_filtered)
ax_filtered.set_title('Beta Distribution for Significant & Beta > 0')
ax_filtered.set_xlabel('Beta Value')
ax_filtered.set_ylabel('Frequency')
fig_filtered.savefig('beta_distribution_filtered.png', dpi=300, bbox_inches='tight')
plt.show()

# Boxplot of filtered data
# Horizontal boxplot of the filtered Beta values, saved to a PNG and shown.
fig_box, ax_box = plt.subplots(figsize=(8, 6))
sns.boxplot(x=HS_data_filtered['Beta'], ax=ax_box)
ax_box.set_title('Boxplot of Beta Distribution with Outliers')
ax_box.set_xlabel('Beta Value')
fig_box.savefig('beta_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

# --- Outlier detection: Z score ---
# Standardises the filtered Beta values with scipy's zscore and keeps the rows
# whose absolute z-score exceeds 3.

print("\n=== Z-SCORE OUTLIER DETECTION ===")
z_scores = zscore(HS_data_filtered['Beta'])
outliers_zscore = HS_data_filtered[np.abs(z_scores) > 3]

# HS_data_filtered can be empty when no row passes the significance cutoff;
# report 0% in that case instead of raising ZeroDivisionError.
n_filtered = len(HS_data_filtered)
pct_outliers_zscore = len(outliers_zscore) / n_filtered * 100 if n_filtered else 0.0

print(f"Number of outliers (z-score > 3): {len(outliers_zscore)}")
print(f"Percentage of outliers: {pct_outliers_zscore:.2f}%")

if len(outliers_zscore) > 0:
    print("\nZ-score outliers (first 10):")
    print(outliers_zscore[['Phecode', 'Beta', 'Pvalue']].head(10))


# --- Outlier detection: IQR ---
# Flags rows of HS_data_filtered whose Beta falls more than 1.5 * IQR below
# the first quartile or above the third quartile.
print("\n=== IQR OUTLIER DETECTION ===")

# Series.quantile uses linear interpolation by default (same as
# np.percentile) and returns NaN for an empty Series, so an empty filtered set
# yields NaN bounds and zero outliers rather than an exception.
filtered_beta = HS_data_filtered['Beta']
Q1 = filtered_beta.quantile(0.25)
Q3 = filtered_beta.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers_iqr = HS_data_filtered[(filtered_beta < lower_bound) | (filtered_beta > upper_bound)]

# Guard the percentage: the filtered set may contain no rows at all.
n_filtered = len(HS_data_filtered)
pct_outliers_iqr = len(outliers_iqr) / n_filtered * 100 if n_filtered else 0.0

print(f"Q1: {Q1:.4f}")
print(f"Q3: {Q3:.4f}")
print(f"IQR: {IQR:.4f}")
print(f"Lower bound: {lower_bound:.4f}")
print(f"Upper bound: {upper_bound:.4f}")
print(f"Number of outliers (IQR method): {len(outliers_iqr)}")
print(f"Percentage of outliers: {pct_outliers_iqr:.2f}%")

if len(outliers_iqr) > 0:
    print("\nIQR outliers (first 10):")
    print(outliers_iqr[['Phecode', 'Beta', 'Pvalue']].head(10))

# --- comparison of method ---
print("\n=== COMPARISON OF OUTLIER DETECTION METHODS ===")
print(f"Z-score outliers: {len(outliers_zscore)}")
print(f"IQR outliers: {len(outliers_iqr)}")

# Find common outliers
# Both outlier frames are row subsets of HS_data_filtered and keep its row
# labels, so a row flagged by both methods appears in both indexes. Matching
# on the index counts each row once; an inner merge on 'Phecode' would emit
# one row per matching pair and over-count whenever a Phecode repeats.
in_both_methods = outliers_zscore.index.isin(outliers_iqr.index)
common_outliers = outliers_zscore[in_both_methods]
print(f"Common outliers (both methods): {len(common_outliers)}")



In [None]:
# Print headline counts for the full and filtered tables, followed by the
# range, mean and standard deviation of Beta in the filtered table.
print("\n=== SUMMARY STATISTICS ===")
n_rows_total = len(HS_data)
print(f"Original data: {n_rows_total:,} observations")
n_rows_flagged = HS_data['is_significant'].sum()
print(f"Significant observations: {n_rows_flagged:,}")
n_rows_kept = len(HS_data_filtered)
print(f"Filtered data (significant & positive): {n_rows_kept:,}")

kept_beta = HS_data_filtered['Beta']
print(f"Beta range: {kept_beta.min():.4f} to {kept_beta.max():.4f}")
print(f"Mean beta: {kept_beta.mean():.4f}")
print(f"Std beta: {kept_beta.std():.4f}")

In [None]:
# Write the outlier tables and a per-method summary to CSV files in the
# working directory.
print("\nSaving outlier analysis results...")

# Save outliers
# Each outlier table is written only when it has rows.
# NOTE(review): a CSV left over from an earlier run is not removed when the
# current run finds no outliers — confirm whether that is intended.
if len(outliers_zscore) > 0:
    outliers_zscore.to_csv('outliers_zscore.csv', index=False)
    print("Saved z-score outliers to: outliers_zscore.csv")

if len(outliers_iqr) > 0:
    outliers_iqr.to_csv('outliers_iqr.csv', index=False)
    print("Saved IQR outliers to: outliers_iqr.csv")

# Save summary
# Percentages are relative to the filtered row count; when that count is zero
# they are reported as 0.0 instead of raising ZeroDivisionError.
n_filtered = len(HS_data_filtered)
outlier_counts = [len(outliers_zscore), len(outliers_iqr), len(common_outliers)]
summary_data = {
    'Method': ['Z-score', 'IQR', 'Common'],
    'Number_of_Outliers': outlier_counts,
    'Percentage': [
        count / n_filtered * 100 if n_filtered else 0.0
        for count in outlier_counts
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('outlier_analysis_summary.csv', index=False)
print("Saved summary to: outlier_analysis_summary.csv")

print("\nOutlier analysis completed!")
print("Check the generated plots and CSV files for detailed results.")


In [None]:
# Quartiles and IQR of Beta over the full, unfiltered table. This rebinds the
# Q1 / Q3 / IQR names that the filtered IQR analysis also assigns.
# np.nanpercentile skips missing Beta values; np.percentile would return NaN
# for both quartiles if any Beta were NaN, and every comparison against
# bounds derived from them would then be False.
Q1, Q3 = np.nanpercentile(HS_data['Beta'], [25, 75])
IQR = Q3 - Q1

In [None]:
# Show the current IQR value as this cell's output.
IQR

In [None]:
# Outlier fences: 1.5 * IQR below Q1 and 1.5 * IQR above Q3, using whatever
# Q1 / Q3 / IQR are currently bound.
fence_offset = 1.5 * IQR
lower_bound = Q1 - fence_offset
upper_bound = Q3 + fence_offset

In [None]:
# Select the Beta values of the full table that lie strictly outside
# [lower_bound, upper_bound], then print how many there are and the values.
all_beta = HS_data['Beta']
outside_fences = (all_beta < lower_bound) | (all_beta > upper_bound)
outliers = all_beta[outside_fences]
print(len(outliers))
print(outliers)