In [None]:
# Import librerie necessarie
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Notebook-wide configuration: silence all warnings, then apply the
# matplotlib style sheet followed by the seaborn colour palette.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display settings: set each of these pandas display options to None.
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

print("✅ Librerie importate correttamente")


In [None]:
# Load the dataset from a CSV file located relative to the working directory.
data_path = Path("../data/breast_cancer_dataset.csv")
df = pd.read_csv(data_path)

# Quick overview: dimensions, column names, then a preview of the first rows.
dataset_shape = df.shape
column_names = list(df.columns)
print(f"📊 Shape del dataset: {dataset_shape}")
print(f"📋 Colonne: {column_names}")
print("\n🔍 Prime 5 righe:")
df.head(5)


In [None]:
# Target analysis: class counts, class percentages, and two summary charts.
# Display names and colours are keyed by diagnosis code so that every label
# follows the data instead of relying on the order returned by value_counts().
DIAGNOSIS_LABELS = {'B': 'Benigno', 'M': 'Maligno'}
DIAGNOSIS_COLORS = {'B': 'lightblue', 'M': 'lightcoral'}

print("🎯 Distribuzione del target (diagnosis):")
target_counts = df['diagnosis'].value_counts()
print(target_counts)

# Percentages
target_percentages = df['diagnosis'].value_counts(normalize=True) * 100
print("\n📊 Percentuali:")
for diagnosis, percentage in target_percentages.items():
    # Unknown codes are printed as-is rather than being reported as malignant.
    label = DIAGNOSIS_LABELS.get(diagnosis, str(diagnosis))
    print(f"{label} ({diagnosis}): {percentage:.1f}%")

# Per-class labels and colours, in the same order as target_counts.
class_labels = [
    f"{DIAGNOSIS_LABELS.get(code, code)} ({code})" for code in target_counts.index
]
class_colors = [DIAGNOSIS_COLORS.get(code, 'lightgray') for code in target_counts.index]

# Visualisation: bar chart of counts (left) and pie chart of shares (right).
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 4))

target_counts.plot(kind='bar', color=class_colors, ax=ax_bar)
ax_bar.set_title('Distribuzione Target')
ax_bar.set_xlabel('Diagnosis')
ax_bar.set_ylabel('Count')
ax_bar.set_xticks(range(len(class_labels)))
ax_bar.set_xticklabels(class_labels)

ax_pie.pie(target_counts.values, labels=class_labels,
           autopct='%1.1f%%', colors=class_colors)
ax_pie.set_title('Distribuzione Target (%)')

fig.tight_layout()
plt.show()


In [None]:
# Correlation of every numeric feature with the target, encoded as
# 1 for diagnosis 'M' and 0 for any other value.
df_with_target = df.copy()
df_with_target['diagnosis_numeric'] = (df_with_target['diagnosis'] == 'M').astype(int)

# Select numeric features only.
# NOTE(review): this keeps every numeric column, including any identifier-like
# column the CSV may contain — confirm whether such columns should be excluded.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
target_correlations = (
    df_with_target[numerical_features + ['diagnosis_numeric']]
    .corr()['diagnosis_numeric']
    .sort_values(ascending=False)
)

# Rank by absolute strength so strong negative correlations are not ignored,
# and drop undefined (NaN) coefficients before taking the top 10.
top_correlations = (
    target_correlations
    .drop('diagnosis_numeric')
    .dropna()
    .sort_values(key=lambda values: values.abs(), ascending=False)
    .head(10)
)

print("🎯 Top 10 Correlazioni con il target (diagnosis):")
for feature, corr in top_correlations.items():
    print(f"  {feature}: {corr:.3f}")

# Correlation chart. A horizontal bar plot draws the first row at the bottom,
# so the series is reversed to put the strongest feature at the top.
fig, ax = plt.subplots(figsize=(12, 8))
top_correlations.iloc[::-1].plot(kind='barh', color='skyblue', ax=ax)
ax.set_title('Top 10 Features - Correlazione con Target')
ax.set_xlabel('Correlazione')
fig.tight_layout()
plt.show()


In [None]:
# Closer look at six features chosen as the most correlated with the target.
top_features = ['concave points_worst', 'perimeter_worst', 'concave points_mean',
                'radius_worst', 'perimeter_mean', 'area_worst']

# Pin the class order and key the colours by diagnosis code. A plain colour
# list is assigned in order of appearance in the data, which can swap the
# colours of the two classes between figures.
diagnosis_order = ['B', 'M']
diagnosis_palette = {'B': 'lightblue', 'M': 'lightcoral'}

# Box plots for the main features, one panel per feature on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
for ax, feature in zip(axes.flat, top_features):
    sns.boxplot(data=df, x='diagnosis', y=feature, order=diagnosis_order,
                palette=diagnosis_palette, ax=ax)
    ax.set_title(f'{feature} vs Diagnosis')
    ax.set_xlabel('Diagnosis')
    ax.set_ylabel(feature)

fig.tight_layout()
plt.show()

# Descriptive statistics per diagnosis group, for the first three features.
print("📊 Statistiche delle top features per diagnosis:")
for feature in top_features[:3]:
    print(f"\n{feature}:")
    stats = df.groupby('diagnosis')[feature].describe()
    print(stats)


In [None]:
# EDA summary: prints the headline numbers computed by the earlier cells
# (df, target_counts, numerical_features, top_correlations).
print("📋 RIASSUNTO EDA - Breast Cancer Dataset")
print("=" * 50)
print(f"📊 Dataset Shape: {df.shape}")
# to_dict() yields plain Python ints, so the printed dict stays readable
# instead of showing NumPy scalar reprs.
print(f"🎯 Target Distribution: {target_counts.to_dict()}")
print(f"📈 Features Numeriche: {len(numerical_features)}")

# Missing values check: total count of null cells across the whole frame.
missing_values = df.isnull().sum().sum()
print(f"🔍 Missing Values: {missing_values}")

# Top features by correlation with the target.
print("\n🏆 Top 5 Features per correlazione con target:")
for feature, corr in top_correlations.head().items():
    print(f"  {feature}: {corr:.3f}")

# Share of benign cases computed from the data, so the first insight cannot
# go stale when the dataset changes. NaN when there are no rows at all.
total_cases = int(target_counts.sum())
benign_share = 100 * target_counts.get('B', 0) / total_cases if total_cases else float('nan')

print("\n✅ Key Insights:")
print(f"1. Dataset bilanciato con leggera prevalenza di casi benigni ({benign_share:.1f}%)")
print("2. Features 'worst' mostrano correlazioni più forti con il target")
print("3. Parametri geometrici (perimeter, radius, area) sono predittori importanti")
print("4. Features 'concave points' sono tra le più discriminanti")
print("5. Dataset pronto per il preprocessing e modeling")

print("\n📈 Prossimi Passi:")
print("1. Preprocessing delle features")
print("2. Feature selection basata su correlazioni")
print("3. Train/test split")
print("4. Model development")
