# Comprehensive Thyroid Cancer Data Analysis

This notebook provides a detailed analysis of thyroid cancer risk factors and patterns using a dataset of 212,691 records.

## Table of Contents:
1. Data Loading and Initial Exploration
2. Basic Statistical Analysis
3. Distribution Analysis
4. Clinical Measurements Analysis
5. Risk Factor Analysis
6. Geographic and Demographic Patterns
7. Advanced Statistical Analysis
8. Machine Learning Models
9. Survival Analysis
10. Interactive Visualizations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices from pandas/matplotlib/scikit-learn. Consider narrowing
# the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so prefer the new name and fall back to the
# legacy one only on installations that do not ship it.
SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(SEABORN_STYLE)
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [12, 8]

## 1. Data Loading and Initial Exploration

In [None]:
# Read the dataset (path is resolved relative to the notebook's working directory)
df = pd.read_csv('thyroid_cancer_risk_data.csv')

# Display basic information
print("Dataset Info:")
print("-" * 50)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

print("\nSample Data:")
print("-" * 50)
display(df.head())

print("\nBasic Statistics:")
print("-" * 50)
display(df.describe())

## 2. Basic Statistical Analysis

In [None]:
# Define variable categories (both lists are reused by later cells)
numerical_vars = ['Age', 'TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size']
categorical_vars = ['Gender', 'Country', 'Ethnicity', 'Family_History', 
                   'Radiation_Exposure', 'Iodine_Deficiency', 'Smoking', 
                   'Obesity', 'Diabetes']

# Display categorical variable distributions
for var in categorical_vars:
    # Count once and reuse for the pie chart instead of recomputing
    # value_counts() for the wedge sizes and again for the labels.
    counts = df[var].value_counts()

    # Scale to percent *before* rounding: rounding the fraction first and then
    # multiplying by 100 loses precision and can print float artefacts.
    percentages = (df[var].value_counts(normalize=True) * 100).round(1)

    print(f"\n{var} Distribution:")
    print(percentages)
    
    # Create pie chart
    plt.figure(figsize=(10, 6))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%')
    plt.title(f'{var} Distribution')
    plt.show()

## 3. Distribution Analysis

In [None]:
# Create distribution plots for numerical variables.
# One 2x2 plotly figure per variable: overall histogram, overall box plot,
# violin split by diagnosis, and a per-diagnosis kernel density estimate.
for var in numerical_vars:
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=['Histogram', 'Box Plot', 'Violin Plot', 'KDE Plot'])
    values = df[var]
    
    # Histogram
    fig.add_trace(
        go.Histogram(x=values, name='All'),
        row=1, col=1
    )
    
    # Box Plot
    fig.add_trace(
        go.Box(y=values, name='All'),
        row=1, col=2
    )
    
    # Violin Plot by Diagnosis
    fig.add_trace(
        go.Violin(y=values, x=df['Diagnosis'], name='By Diagnosis'),
        row=2, col=1
    )
    
    # KDE Plot
    # The evaluation grid depends only on the variable, so build it once
    # rather than once per diagnosis.
    x_range = np.linspace(values.min(), values.max(), 100)
    for diagnosis in df['Diagnosis'].dropna().unique():
        # gaussian_kde cannot handle NaNs and needs at least two distinct
        # values; drop missing entries and skip degenerate groups.
        group_values = df.loc[df['Diagnosis'] == diagnosis, var].dropna()
        if group_values.nunique() < 2:
            continue
        kde = stats.gaussian_kde(group_values)
        fig.add_trace(
            go.Scatter(x=x_range, y=kde(x_range), name=str(diagnosis)),
            row=2, col=2
        )
    
    fig.update_layout(height=800, width=1200, title=f'Distribution Analysis of {var}')
    fig.show()

## 4. Clinical Measurements Analysis

In [None]:
# Correlation analysis of clinical measurements
# (clinical_vars is reused by the modelling and visualisation cells)
clinical_vars = ['TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size']
correlation = df[clinical_vars].corr()

# Create heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Clinical Measurements')
plt.show()

# Statistical tests for clinical measurements by diagnosis
print("\nStatistical Tests for Clinical Measurements:")
for var in clinical_vars:
    # Drop missing measurements explicitly; a single NaN would otherwise turn
    # the whole test statistic into NaN.
    benign = df.loc[df['Diagnosis'] == 'Benign', var].dropna()
    malignant = df.loc[df['Diagnosis'] == 'Malignant', var].dropna()
    
    # Perform Welch's t-test (equal_var=False): the two diagnosis groups are
    # not guaranteed to share a variance or a size, and the pooled-variance
    # Student test is unreliable in that situation.
    t_stat, p_val = stats.ttest_ind(benign, malignant, equal_var=False)
    print(f"\n{var}:")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_val:.4e}")

## 5. Risk Factor Analysis

In [None]:
# Create risk score: a hand-weighted sum of binary risk indicators.
# Weights: family history 2; radiation exposure, iodine deficiency and an
# above-median nodule size 1.5 each; smoking, obesity and diabetes 1 each.
# NOTE(review): the weights are hard-coded heuristics, not fitted values.
df['Risk_Score'] = (
    (df['Family_History'] == 'Yes').astype(int) * 2 +
    (df['Radiation_Exposure'] == 'Yes').astype(int) * 1.5 +
    (df['Iodine_Deficiency'] == 'Yes').astype(int) * 1.5 +
    (df['Smoking'] == 'Yes').astype(int) +
    (df['Obesity'] == 'Yes').astype(int) +
    (df['Diabetes'] == 'Yes').astype(int) +
    ((df['Nodule_Size'] > df['Nodule_Size'].median()).astype(int) * 1.5)
)

# Analyze risk score distribution
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Diagnosis', y='Risk_Score')
plt.title('Risk Score Distribution by Diagnosis')
plt.show()

# Calculate risk ratios: P(Malignant | factor == 'Yes') / P(Malignant | factor == 'No').
# Rows without a diagnosis are excluded from both numerator and denominator.
diagnosed = df[df['Diagnosis'].notna()]
is_malignant = diagnosed['Diagnosis'] == 'Malignant'

print("\nRisk Ratios for Different Factors:")
for factor in categorical_vars[3:]:  # Skip demographic variables
    # Taking the mean of a boolean mask yields 0.0 when a stratum has no
    # malignant cases, where indexing value_counts()['Malignant'] would raise
    # KeyError.
    exposed_rate = is_malignant[diagnosed[factor] == 'Yes'].mean()
    unexposed_rate = is_malignant[diagnosed[factor] == 'No'].mean()

    # The ratio is undefined when the reference group is empty or has zero
    # risk; report NaN instead of dividing by zero.
    risk_ratio = exposed_rate / unexposed_rate if unexposed_rate > 0 else float('nan')
    print(f"\n{factor}:")
    print(f"Risk Ratio: {risk_ratio:.2f}x")

## 6. Geographic and Demographic Patterns

In [None]:
# Per-country summary: malignancy rate plus mean age, TSH level and risk score.
def _malignant_share(diagnoses):
    """Return the fraction of entries in `diagnoses` equal to 'Malignant'."""
    return (diagnoses == 'Malignant').mean()


country_aggregations = {
    'Diagnosis': _malignant_share,
    'Age': 'mean',
    'TSH_Level': 'mean',
    'Risk_Score': 'mean',
}
country_stats = df.groupby('Country').agg(country_aggregations).round(3)

# Interactive bar chart of the malignancy rate per country (the 'Diagnosis'
# column of country_stats holds the rate, hence the axis relabel).
fig = px.bar(
    country_stats.reset_index(),
    x='Country',
    y='Diagnosis',
    title='Malignancy Rates by Country',
    labels={'Diagnosis': 'Malignancy Rate'},
)
fig.show()

# Demographic analysis: mean of a 0/1 malignancy flag per category.
# 'Age' is listed but skipped because it is not categorical.
demographic_vars = ['Gender', 'Ethnicity', 'Age']
malignant_flag = (df['Diagnosis'] == 'Malignant').astype(int)
for var in demographic_vars:
    if var == 'Age':
        continue
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x=var, y=malignant_flag)
    plt.title(f'Malignancy Rate by {var}')
    plt.ylabel('Malignancy Rate')
    plt.show()

## 7. Advanced Statistical Analysis

In [None]:
# Standardise the numerical variables, then project them onto the first two
# principal components.
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(df[numerical_vars])
pca = PCA(n_components=2)
pca_result = pca.fit_transform(numerical_scaled)

# Scatter the two components, one layer per diagnosis class
# (Benign first, then Malignant, so the colour cycle order is unchanged).
pca_fig, pca_ax = plt.subplots(figsize=(10, 8))
for diagnosis_label in ('Benign', 'Malignant'):
    in_class = (df['Diagnosis'] == diagnosis_label).to_numpy()
    pca_ax.scatter(pca_result[in_class, 0],
                   pca_result[in_class, 1],
                   alpha=0.5, label=diagnosis_label)
pca_ax.set_xlabel('First Principal Component')
pca_ax.set_ylabel('Second Principal Component')
pca_ax.set_title('PCA of Numerical Variables')
pca_ax.legend()
plt.show()

# Share of total variance captured by each retained component
print("\nPCA Explained Variance Ratio:")
print(pca.explained_variance_ratio_)

## 8. Machine Learning Models

In [None]:
# Prepare data for machine learning.
# Each categorical feature gets its own LabelEncoder so the fitted mapping
# stays available for decoding; refitting one shared encoder per column
# discards every mapping except the last.
X = df[categorical_vars + clinical_vars].copy()
feature_encoders = {}
for col in categorical_vars:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

# `le` ends up fitted on the target, so le.classes_ maps 0/1 back to the labels.
le = LabelEncoder()
y = le.fit_transform(df['Diagnosis'])
class_names = [str(label) for label in le.classes_]

# Split data. stratify=y keeps the Benign/Malignant proportions identical in
# the train and test partitions, so the held-out metrics are not skewed by an
# unlucky split of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train and evaluate models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    print(f"\n{name} Results:")
    print("-" * 50)
    
    # Cross-validation on the training partition only (5 folds)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    
    # Train on the full training partition and evaluate on the held-out test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("\nClassification Report:")
    # Report rows are labelled with the original class names instead of 0/1.
    print(classification_report(
        y_test, y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
    ))
    
    # Feature importance (impurity-based importances exposed by the fitted model)
    importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(12, 6))
    sns.barplot(data=importance, x='Importance', y='Feature')
    plt.title(f'Feature Importance - {name}')
    plt.show()

## 9. Survival Analysis

In [None]:
# Create age groups. Bins are right-inclusive, e.g. '<30' covers (0, 30];
# ages outside (0, 100] become NaN and are left out of the plot below.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 75, 100], 
                       labels=['<30', '30-45', '45-60', '60-75', '>75'])

# Share of each diagnosis within every (age group, risk score) cell.
# NOTE: the dataset used here has no time-to-event column, so this is a
# cross-sectional benign proportion, not a survival estimate; the plot is
# labelled accordingly.
# Counting with observed=True and normalising by row keeps only combinations
# that actually occur and fills absent diagnoses with 0 instead of NaN.
diagnosis_counts = (
    df.groupby(['AgeGroup', 'Risk_Score', 'Diagnosis'], observed=True)
      .size()
      .unstack('Diagnosis', fill_value=0)
)
survival_analysis = diagnosis_counts.div(diagnosis_counts.sum(axis=1), axis=0)
if 'Benign' not in survival_analysis.columns:
    # No benign record at all: the benign share is 0 everywhere.
    survival_analysis['Benign'] = 0.0

# Plot the benign share against the risk score, one line per age group.
# Iterating over the ordered categories (rather than unique()) gives a stable
# youngest-to-oldest legend and never yields NaN, which would raise KeyError.
observed_groups = set(survival_analysis.index.get_level_values('AgeGroup'))
plt.figure(figsize=(12, 6))
for age_group in df['AgeGroup'].cat.categories:
    if age_group not in observed_groups:
        continue
    group_data = survival_analysis.xs(age_group, level='AgeGroup').sort_index()
    plt.plot(group_data.index, group_data['Benign'], marker='o', label=age_group)

plt.xlabel('Risk Score')
plt.ylabel('Proportion Benign')
plt.title('Benign Proportion by Risk Score and Age Group')
plt.legend(title='Age Group')
plt.grid(True)
plt.show()

## 10. Interactive Visualizations

In [None]:
# Interactive figures embed every plotted point in the notebook output, so
# draw them from a reproducible random sample rather than the full dataset.
# Set MAX_PLOT_POINTS to None to plot every row.
MAX_PLOT_POINTS = 5000
if MAX_PLOT_POINTS is not None and len(df) > MAX_PLOT_POINTS:
    plot_df = df.sample(n=MAX_PLOT_POINTS, random_state=42)
else:
    plot_df = df

# Create interactive scatter plot matrix
fig = px.scatter_matrix(plot_df,
                        dimensions=clinical_vars,
                        color='Diagnosis',
                        title='Interactive Scatter Matrix of Clinical Measurements')
fig.show()

# Create 3D scatter plot
fig = px.scatter_3d(plot_df,
                    x='TSH_Level',
                    y='T3_Level',
                    z='T4_Level',
                    color='Diagnosis',
                    size='Nodule_Size',
                    title='3D Visualization of Thyroid Hormone Levels')
fig.show()

# Create parallel coordinates plot.
# px.parallel_coordinates only accepts a numeric colour column, so the string
# 'Diagnosis' labels are mapped to a 0/1 indicator on a copy of the frame
# (df itself is not modified).
parcoords_df = plot_df.assign(
    Malignant=(plot_df['Diagnosis'] == 'Malignant').astype(int)
)
fig = px.parallel_coordinates(parcoords_df,
                             dimensions=clinical_vars + ['Risk_Score'],
                             color='Malignant',
                             color_continuous_scale=['steelblue', 'crimson'],
                             labels={'Malignant': 'Malignant (1 = yes)'},
                             title='Parallel Coordinates Plot of Clinical Measurements')
fig.show()