# 🏥 Health Risk Prediction: Smoking & Drinking Analysis

Data science project to predict smoking/drinking behaviors and cardiovascular risk using medical health check-up data. Apply ML techniques for preventive healthcare interventions.

## 1. Data loading and visualization

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the health check-up records and preview the first rows
dataset_path = '../data/smoking-drinking_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts)
df.info()

# (rows, columns) tuple, displayed as the cell output
df.shape

In [None]:
# Report the percentage of missing values for every column that has at least one
# (this cell only reports; it does not modify the data)
null_counts = df.isnull().sum()
columns_with_missing_values = null_counts.index[null_counts > 0]

print("Missing value percentage:")
if len(columns_with_missing_values) == 0:
    print("No missing values found.")
for column in columns_with_missing_values:
    print(column, ":", null_counts[column] / df.shape[0] * 100)

In [None]:
# Several categorical features in this dataset are stored as numerical codes,
# so the categorical columns are listed by hand instead of being inferred from dtypes.
categorical_columns = [
    'sex', 'DRK_YN', 'SMK_stat_type_cd',
    'hear_left', 'hear_right', 'urine_protein',
]

section_rule = "=" * 50

print(section_rule)
print("CATEGORICAL FEATURES DISTRIBUTION")
print(section_rule)

for column in categorical_columns:
    column_values = df[column]

    print(f"\n{column.upper()}")
    print("-" * 30)

    # Absolute frequency of each category
    counts = column_values.value_counts()
    print(counts)

    # Relative frequency of each category, as a percentage
    percentages = column_values.value_counts(normalize=True) * 100
    print("\nPercentages:")
    print(percentages.round(2))
    print(section_rule)

## 2. EDA

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))


def make_autopct(values):
    """Return a pie-chart ``autopct`` callback showing the percentage and the absolute count.

    Args:
        values: The wedge sizes passed to ``pie``; their sum is used to convert a
            wedge percentage back into a count.

    Returns:
        A function that maps a wedge percentage to a two-line label: the percentage
        with one decimal, then the thousands-separated count in parentheses.
    """
    total = sum(values)

    def my_autopct(pct):
        val = int(round(pct * total / 100.0))
        return f'{pct:.1f}%\n({val:,})'
    return my_autopct


def labels_and_colors(counts, style_by_code):
    """Look up the (label, color) of every category in ``counts`` by its code.

    Codes missing from ``style_by_code`` fall back to the raw code as label and a
    neutral grey, so an unexpected category is shown rather than mislabeled.
    """
    styles = [style_by_code.get(code, (str(code), '#CCCCCC')) for code in counts.index]
    return [label for label, _ in styles], [color for _, color in styles]


# value_counts() orders categories by frequency, not by code, so labels and colors
# must be matched to each category explicitly instead of by position.

# 1. Smoking Status (1 = never, 2 = ex-smoker, 3 = current smoker)
smoking_style = {
    1: ('Never Smoker', '#90EE90'),
    2: ('Ex-Smoker', '#FFA500'),
    3: ('Current Smoker', '#FF6B6B'),
}
smoking_counts = df['SMK_stat_type_cd'].value_counts().sort_index()
total_smoking = smoking_counts.sum()
smoking_labels, smoking_colors = labels_and_colors(smoking_counts, smoking_style)

axes[0].pie(
    smoking_counts,
    labels=smoking_labels,
    autopct=make_autopct(smoking_counts),
    colors=smoking_colors,
    startangle=90
)
axes[0].set_title(f'Smoking Status Distribution\nTotal: {total_smoking:,}',
                  fontsize=14, fontweight='bold')

# 2. Drinking Status
# Both the raw 'N'/'Y' values and their label-encoded 0/1 form are mapped, so the
# chart stays correct if this cell is re-run after the encoding step.
drinking_style = {
    'N': ('Non-Drinker', '#87CEEB'),
    'Y': ('Drinker', '#FA8072'),
    0: ('Non-Drinker', '#87CEEB'),
    1: ('Drinker', '#FA8072'),
}
drinking_counts = df['DRK_YN'].value_counts().sort_index()
total_drinking = drinking_counts.sum()
drinking_labels, drinking_colors = labels_and_colors(drinking_counts, drinking_style)

axes[1].pie(
    drinking_counts,
    labels=drinking_labels,
    autopct=make_autopct(drinking_counts),
    colors=drinking_colors,
    startangle=90
)
axes[1].set_title(f'Drinking Status Distribution\nTotal: {total_drinking:,}',
                  fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

### Smoking-related features

In [None]:
# Age distribution for each of the three smoking-status categories
plt.figure(figsize=(12, 6))

# (status code, histogram color, legend label)
smoking_groups = [
    (1, 'green', 'Never Smoker'),
    (2, 'orange', 'Ex-Smoker'),
    (3, 'red', 'Current Smoker'),
]
for status_code, hist_color, group_label in smoking_groups:
    group_ages = df.loc[df['SMK_stat_type_cd'] == status_code, 'age']
    # Semi-transparent bars so the overlaid histograms remain readable
    group_ages.plot.hist(bins=30, alpha=0.5, color=hist_color, label=group_label)

plt.legend(fontsize=12)
plt.xlabel('Age (years)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Smoking Status by Age', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)
plt.show()


In [None]:
# Mean systolic blood pressure: never smokers (code 1) vs current smokers (code 3)
fig, ax = plt.subplots(figsize=(8, 6))

never_or_current = df['SMK_stat_type_cd'].isin([1, 3])
sbp_means = df.loc[never_or_current].groupby('SMK_stat_type_cd')['SBP'].mean()
colors = ['#90EE90', '#FF6B6B']
bars = ax.bar(['Never Smoker', 'Current Smoker'], sbp_means,
              color=colors, edgecolor='black', linewidth=1.5, width=0.6)

# Write each group mean just above its bar
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, height, f'{height:.1f} mmHg',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.axhline(y=120, color='red', linestyle='--', linewidth=2, label='Normal threshold (120 mmHg)')
ax.set_ylabel('Average Systolic BP (mmHg)', fontsize=12, fontweight='bold')
ax.set_title('Systolic Blood Pressure: Smokers vs Non-Smokers', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
# The y-axis starts at 100 so the small gap between the two means is visible
ax.set_ylim(100, max(sbp_means) * 1.1)
ax.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Current smokers show elevated systolic blood pressure (123.6 mmHg) compared to never smokers (121.2 mmHg), with a difference of 2.4 mmHg. Both group means sit near the normal threshold (120 mmHg), with the smokers' mean shifted slightly upward, which is consistent with increased cardiovascular risk. Because the chart compares means only, it does not show how much the two distributions overlap. This feature will serve as a moderate predictor for smoking status classification.

**Feature Importance**: ⭐⭐ Moderate predictor - The difference in means is small but points in the direction reported by medical literature on smoking's impact on blood pressure regulation. Note that no statistical significance test has been run in this notebook.

In [None]:
# Mean HDL cholesterol: never smokers (code 1) vs current smokers (code 3)
fig, ax = plt.subplots(figsize=(8, 6))

never_or_current = df['SMK_stat_type_cd'].isin([1, 3])
hdl_means = df.loc[never_or_current].groupby('SMK_stat_type_cd')['HDL_chole'].mean()
colors = ['#90EE90', '#FF6B6B']
bars = ax.bar(['Never Smoker', 'Current Smoker'], hdl_means,
              color=colors, edgecolor='black', linewidth=1.5, width=0.6)

# Write each group mean just above its bar
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, height, f'{height:.1f} mg/dL',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.axhline(y=40, color='blue', linestyle='--', linewidth=2, label='Low HDL threshold (40 mg/dL)')
ax.set_ylabel('Average HDL Cholesterol (mg/dL)', fontsize=12, fontweight='bold')
ax.set_title('HDL Cholesterol: Smokers vs Non-Smokers', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
# The y-axis starts at 30 so the 40 mg/dL reference line stays inside the plot
ax.set_ylim(30, max(hdl_means) * 1.15)
ax.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Current smokers exhibit significantly lower HDL levels (52.9 mg/dL) compared to never smokers (59.3 mg/dL), representing a 10.8% reduction in good cholesterol. This 6.4 mg/dL difference is clinically meaningful and aligns with established cardiovascular research showing smoking's negative impact on HDL metabolism.

**Feature Importance**: ⭐⭐⭐ Strong predictor - The clear separation makes HDL a valuable feature for smoking status classification, particularly when combined with blood pressure and other cardiovascular markers.

In [None]:
# Mean LDL cholesterol: never smokers (code 1) vs current smokers (code 3)
fig, ax = plt.subplots(figsize=(8, 6))

never_or_current = df['SMK_stat_type_cd'].isin([1, 3])
ldl_means = df.loc[never_or_current].groupby('SMK_stat_type_cd')['LDL_chole'].mean()
colors = ['#90EE90', '#FF6B6B']
bars = ax.bar(['Never Smoker', 'Current Smoker'], ldl_means,
              color=colors, edgecolor='black', linewidth=1.5, width=0.6)

# Write each group mean just above its bar
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, height, f'{height:.1f} mg/dL',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

# Two reference lines: the optimal limit and the borderline-high limit
ax.axhline(y=100, color='red', linestyle='--', linewidth=2, label='Optimal limit (100 mg/dL)')
ax.axhline(y=130, color='orange', linestyle='--', linewidth=2, label='Borderline high (130 mg/dL)')
ax.set_ylabel('Average LDL Cholesterol (mg/dL)', fontsize=12, fontweight='bold')
ax.set_title('LDL Cholesterol: Smokers vs Non-Smokers', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.set_ylim(80, max(ldl_means) * 1.1)
ax.grid(axis='y', alpha=0.3)
plt.show()


**Analysis**: LDL cholesterol shows minimal difference between never smokers (113.4 mg/dL) and current smokers (112.3 mg/dL), a gap of only about 1 mg/dL. Both groups exceed the optimal limit (100 mg/dL) but remain below borderline high (130 mg/dL). The near-identical means suggest LDL is influenced by multiple confounding factors beyond smoking status.

**Feature Importance**: ⭐ Weak predictor - Limited predictive power due to substantial inter-group similarity. LDL will contribute minimally to smoking classification models.

In [None]:
# Mean waistline: never smokers (code 1) vs current smokers (code 3)
fig, ax = plt.subplots(figsize=(8, 6))

never_or_current = df['SMK_stat_type_cd'].isin([1, 3])
waist_means = df.loc[never_or_current].groupby('SMK_stat_type_cd')['waistline'].mean()
colors = ['#90EE90', '#FF6B6B']
bars = ax.bar(['Never Smoker', 'Current Smoker'], waist_means,
              color=colors, edgecolor='black', linewidth=1.5, width=0.6)

# Write each group mean just above its bar
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_center, height, f'{height:.1f} cm',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.axhline(y=90, color='orange', linestyle='--', linewidth=2, label='Obesity threshold (90 cm)')
ax.set_ylabel('Average Waistline (cm)', fontsize=12, fontweight='bold')
ax.set_title('Waistline: Smokers vs Non-Smokers', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.set_ylim(70, max(waist_means) * 1.1)
ax.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Current smokers display larger waistlines (84.2 cm) compared to never smokers (79.0 cm), showing a 5.2 cm difference. Both groups remain below the obesity threshold (90 cm), but smokers demonstrate a shift toward higher values. This unexpected pattern (contrary to nicotine's appetite-suppressing effects) may reflect other lifestyle factors.

**Feature Importance**: ⭐ Limited predictor - The difference in means is noticeable, but no statistical significance test has been run in this notebook. Waistline also shows high variability and will have limited standalone predictive power for smoking status classification.

### Drinking-related features

In [None]:
# Mean gamma-GTP for each drinking-status group
plt.figure(figsize=(8, 6))

gtp_means = df.groupby('DRK_YN')['gamma_GTP'].mean()
colors = ['#87CEEB', '#FA8072']
bar_style = dict(color=colors, edgecolor='black', linewidth=1.5, width=0.6)
bars = plt.bar(['Non-Drinker', 'Drinker'], gtp_means, **bar_style)

# Write each group mean just above its bar
value_label_style = dict(ha='center', va='bottom', fontsize=12, fontweight='bold')
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height,
             f'{height:.1f} U/L', **value_label_style)

plt.axhline(y=55, color='red', linestyle='--', linewidth=2, label='Upper normal limit (55 U/L)')
plt.ylabel('Average Gamma-GTP (U/L)', fontsize=12, fontweight='bold')
plt.title('Gamma-GTP: Drinkers vs Non-Drinkers', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
# 20% headroom above the tallest bar leaves space for its value label
plt.ylim(0, max(gtp_means) * 1.2)
plt.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Gamma-GTP demonstrates exceptional discriminative power with drinkers showing 77% higher levels (47.5 U/L) compared to non-drinkers (26.8 U/L). Drinkers approach the upper normal limit (55 U/L) while non-drinkers remain well below it. This confirms gamma-GTP as the most specific biomarker for alcohol consumption.

**Feature Importance**: ⭐⭐⭐⭐⭐ PRIMARY predictor - This feature will be the strongest predictor for drinking status, potentially achieving high accuracy independently due to clear separation between groups.

In [None]:
# Mean triglycerides for each drinking-status group
plt.figure(figsize=(8, 6))

trig_means = df.groupby('DRK_YN')['triglyceride'].mean()
colors = ['#87CEEB', '#FA8072']
bar_style = dict(color=colors, edgecolor='black', linewidth=1.5, width=0.6)
bars = plt.bar(['Non-Drinker', 'Drinker'], trig_means, **bar_style)

# Write each group mean just above its bar
value_label_style = dict(ha='center', va='bottom', fontsize=12, fontweight='bold')
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height,
             f'{height:.1f} mg/dL', **value_label_style)

plt.axhline(y=150, color='red', linestyle='--', linewidth=2, label='Upper normal limit (150 mg/dL)')
plt.ylabel('Average Triglycerides (mg/dL)', fontsize=12, fontweight='bold')
plt.title('Triglycerides: Drinkers vs Non-Drinkers', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
# 20% headroom above the tallest bar leaves space for its value label
plt.ylim(0, max(trig_means) * 1.2)
plt.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Drinkers show elevated triglyceride levels (142.8 mg/dL) compared to non-drinkers (121.5 mg/dL), representing a 17.5% increase. Drinkers approach the upper normal limit (150 mg/dL) while non-drinkers remain comfortably below it. This metabolic marker reflects alcohol's significant impact on lipid metabolism.

**Feature Importance**: ⭐⭐⭐ Strong predictor - Triglycerides will serve as a valuable secondary feature for drinking status classification, particularly when combined with liver enzyme markers (gamma-GTP, AST, ALT).

In [None]:
# Mean AST (SGOT) for each drinking-status group
plt.figure(figsize=(8, 6))

ast_means = df.groupby('DRK_YN')['SGOT_AST'].mean()
colors = ['#87CEEB', '#FA8072']
bar_style = dict(color=colors, edgecolor='black', linewidth=1.5, width=0.6)
bars = plt.bar(['Non-Drinker', 'Drinker'], ast_means, **bar_style)

# Write each group mean just above its bar
value_label_style = dict(ha='center', va='bottom', fontsize=12, fontweight='bold')
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height,
             f'{height:.1f} U/L', **value_label_style)

plt.axhline(y=40, color='red', linestyle='--', linewidth=2, label='Upper normal limit (40 U/L)')
plt.ylabel('Average AST (U/L)', fontsize=12, fontweight='bold')
plt.title('AST Enzyme: Drinkers vs Non-Drinkers', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
# Fixed range that keeps the 40 U/L reference line inside the plot
plt.ylim(0, 50)
plt.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Drinkers display slightly elevated AST levels (26.9 U/L) versus non-drinkers (25.1 U/L), representing a 7.2% increase. Both groups stay within normal ranges (<40 U/L), suggesting AST is less sensitive than gamma-GTP for detecting moderate alcohol consumption. AST will serve as a supporting biomarker in the classification model.

**Feature Importance**: ⭐⭐ Supporting predictor - Useful in combination with other liver enzymes but limited standalone discriminative power due to high inter-group overlap.

In [None]:
# Mean ALT for each drinking-status group
plt.figure(figsize=(8, 6))

# Reference value drawn as the dashed line and used to size the y-axis
alt_upper_limit = 56

alt_means = df.groupby('DRK_YN')['SGOT_ALT'].mean()
colors = ['#87CEEB', '#FA8072']
bars = plt.bar(['Non-Drinker', 'Drinker'], alt_means,
               color=colors, edgecolor='black', linewidth=1.5, width=0.6)

# Write each group mean just above its bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f} U/L',
             ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.axhline(y=alt_upper_limit, color='red', linestyle='--', linewidth=2,
            label=f'Upper normal limit ({alt_upper_limit} U/L)')
plt.ylabel('Average ALT (U/L)', fontsize=12, fontweight='bold')
plt.title('ALT Enzyme: Drinkers vs Non-Drinkers', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
# The previous fixed range (0, 50) clipped the 56 U/L reference line out of view;
# size the axis from whichever is higher, the tallest bar or the reference line.
plt.ylim(0, max(max(alt_means), alt_upper_limit) * 1.1)
plt.grid(axis='y', alpha=0.3)
plt.show()

**Analysis**: Drinkers exhibit elevated ALT levels (27.4 U/L) compared to non-drinkers (24.1 U/L), showing a 13.7% increase. Both groups remain well below the upper normal limit (56 U/L), indicating that ALT elevation is primarily observed in heavy or chronic drinkers. This liver enzyme marker will contribute as a secondary predictor in the drinking status model.

**Feature Importance**: ⭐⭐ Secondary predictor - Moderate discriminative power when combined with gamma-GTP and AST for comprehensive liver function assessment.

### Feature Importance Summary

| Target | Feature | Mean Difference | Importance | Role in Model |
|--------|---------|----------------|------------|---------------|
| **Smoking** | SBP | +2.4 mmHg | ⭐⭐ | Moderate predictor |
| **Smoking** | HDL | -6.4 mg/dL | ⭐⭐⭐ | Strong predictor |
| **Smoking** | LDL | -1.0 mg/dL | ⭐ | Weak predictor |
| **Smoking** | Waistline | +5.2 cm | ⭐ | Limited predictor |
| **Drinking** | Gamma-GTP | +20.7 U/L (+77%) | ⭐⭐⭐⭐⭐ | **PRIMARY** predictor |
| **Drinking** | Triglycerides | +21.3 mg/dL (+17.5%) | ⭐⭐⭐ | Strong predictor |
| **Drinking** | AST | +1.8 U/L (+7.2%) | ⭐⭐ | Supporting predictor |
| **Drinking** | ALT | +3.3 U/L (+13.7%) | ⭐⭐ | Secondary predictor |

**Key Insights**:
- Gamma-GTP stands out as the most discriminative feature with a 77% difference between groups
- HDL shows the strongest separation for smoking status with a 10.8% reduction in smokers
- LDL has minimal predictive value due to high inter-group overlap
- Liver enzymes (gamma-GTP, ALT, AST) combined will provide robust drinking status prediction
- Cardiovascular markers (SBP, HDL) will be most effective for smoking status classification

## 3. Data preparation for modeling

### Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# LabelEncoder numbers the classes in sorted order of the original values, and the
# same encoder is re-fitted for each column. Each value -> code mapping is therefore
# captured and printed right after fitting, before the next fit overwrites it.

# Encode sex (with raw values 'Female'/'Male' this gives Female → 0, Male → 1)
df['sex'] = le.fit_transform(df['sex'])
sex_mapping = {label: code for code, label in enumerate(le.classes_.tolist())}
print("Sex encoded:", df['sex'].unique())
print("Sex mapping:", sex_mapping)

# Encode drinking status (N/Y → 0/1) - TARGET VARIABLE
df['DRK_YN'] = le.fit_transform(df['DRK_YN'])
drinking_mapping = {label: code for code, label in enumerate(le.classes_.tolist())}
print("Drinking encoded:", df['DRK_YN'].unique())
print("Drinking mapping:", drinking_mapping)

# Note: SMK_stat_type_cd, hear_left/right, urine_protein are already numeric

### Feature engineering

In [None]:
# Remember the starting column count for the report at the end
original_features = df.shape[1]
print(f"\nOriginal features: {original_features}")

# Small offset added to denominators to guard against division by zero
eps = 1e-5

# 1. CARDIOVASCULAR RATIOS
df['HDL_LDL_ratio'] = df['HDL_chole'] / (df['LDL_chole'] + eps)
df['BP_ratio'] = df['SBP'] / (df['DBP'] + eps)
# Sum of HDL, LDL and triglycerides (a separate column from the dataset's 'tot_chole')
df['cholesterol_total'] = df['HDL_chole'] + df['LDL_chole'] + df['triglyceride']

# 2. LIVER HEALTH INDICATORS
df['liver_enzyme_score'] = df['gamma_GTP'] + df['SGOT_AST'] + df['SGOT_ALT']
df['AST_ALT_ratio'] = df['SGOT_AST'] / (df['SGOT_ALT'] + eps)
df['GTP_AST_ratio'] = df['gamma_GTP'] / (df['SGOT_AST'] + eps)

# 3. BODY COMPOSITION
# Height is divided by 100 before squaring (presumably cm → m; confirm the column's unit)
height_scaled = df['height'] / 100
df['BMI'] = df['weight'] / height_scaled ** 2
df['waist_height_ratio'] = df['waistline'] / df['height']

# 4. CARDIOVASCULAR RISK SCORE (weighted combination)
# HDL enters inverted (200 - HDL) so that a lower HDL raises the score
inverted_hdl = 200 - df['HDL_chole']
df['cardiovascular_risk'] = (
    0.25 * df['SBP']
    + 0.15 * df['DBP']
    + 0.20 * df['LDL_chole']
    + 0.15 * df['triglyceride']
    + 0.25 * inverted_hdl
)

# 5. METABOLIC SYNDROME INDICATORS
abdominal_obesity = df['waistline'] > 90       # counted twice in the score
high_triglycerides = df['triglyceride'] > 150
low_hdl = df['HDL_chole'] < 40
high_systolic_bp = df['SBP'] > 130
df['metabolic_score'] = (
    abdominal_obesity.astype(int) * 2
    + high_triglycerides.astype(int)
    + low_hdl.astype(int)
    + high_systolic_bp.astype(int)
)

# 6. AGE-RELATED INTERACTIONS
df['age_BP_interaction'] = df['age'] * df['SBP'] / 100
df['age_cholesterol_interaction'] = df['age'] * df['cholesterol_total'] / 1000

# 7. HEARING ISSUES (binary): 1 when either ear has code 2
df['hearing_problem'] = df[['hear_left', 'hear_right']].eq(2).any(axis=1).astype(int)

# Turn any infinities produced above into NaN, then fill NaN with the column median
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.median(), inplace=True)

# Final report
new_features = df.shape[1]
print(f"New features: {new_features}")
print(f"Features added: {new_features - original_features}")
print("\nNew feature columns:")

# Columns of the raw dataset; anything not listed here is an engineered feature
baseline_columns = {
    'sex', 'age', 'height', 'weight',
    'waistline', 'sight_left', 'sight_right', 'hear_left', 'hear_right',
    'SBP', 'DBP', 'BLDS', 'tot_chole', 'HDL_chole', 'LDL_chole',
    'triglyceride', 'hemoglobin', 'urine_protein', 'serum_creatinine',
    'SGOT_AST', 'SGOT_ALT', 'gamma_GTP', 'SMK_stat_type_cd', 'DRK_YN',
}
new_cols = [col for col in df.columns if col not in baseline_columns]
for col in new_cols:
    print(f"  - {col}")

### Correlation heatmap

In [None]:
# Heatmap of pairwise correlations between all columns, including engineered features
plt.figure(figsize=(18, 16))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=False, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix (with Engineered Features)', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()


def rank_target_correlations(target):
    """Return the features' correlations with ``target``, strongest first.

    The target's correlation with itself (always 1.0) is dropped, and features are
    ordered by absolute value so that strong negative correlations are not pushed
    to the bottom of the list. The returned values keep their sign.
    """
    target_corr = correlation_matrix[target].drop(target)
    strongest_first = target_corr.abs().sort_values(ascending=False).index
    return target_corr.reindex(strongest_first)


# Focus on target correlations
smoking_corr = rank_target_correlations('SMK_stat_type_cd')
print("\nTop 15 features correlated with SMOKING STATUS:")
print(smoking_corr.head(15))

drinking_corr = rank_target_correlations('DRK_YN')
print("\nTop 15 features correlated with DRINKING STATUS:")
print(drinking_corr.head(15))

### Smoking status split

In [None]:
from sklearn.model_selection import train_test_split

# Smoking task: every other column (engineered features included) is a predictor
smoking_target = 'SMK_stat_type_cd'
X_smoke = df.drop(columns=smoking_target)
y_smoke = df[smoking_target]

# 70/30 split, stratified so both parts keep the same class proportions
smoking_split_options = dict(test_size=0.30, random_state=101, stratify=y_smoke)
X_train_smoke, X_test_smoke, y_train_smoke, y_test_smoke = train_test_split(
    X_smoke, y_smoke, **smoking_split_options
)

print(f"Smoking - Training set: {X_train_smoke.shape}")
print(f"Smoking - Test set: {X_test_smoke.shape}")
n_smoking_features = X_train_smoke.shape[1]
print(f"Total features (with engineered): {n_smoking_features}")

### Drinking status split

In [None]:
from sklearn.model_selection import train_test_split

# Drinking task: every other column is a predictor
drinking_target = 'DRK_YN'
X_drink = df.drop(columns=drinking_target)
y_drink = df[drinking_target]

# 70/30 split, stratified so both parts keep the same class proportions
drinking_split_options = dict(test_size=0.30, random_state=101, stratify=y_drink)
X_train_drink, X_test_drink, y_train_drink, y_test_drink = train_test_split(
    X_drink, y_drink, **drinking_split_options
)

print(f"Drinking - Training set: {X_train_drink.shape}")
print(f"Drinking - Test set: {X_test_drink.shape}")

## 5. Modeling - Task 1: Smoking Status

### A. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Quick look at the class balance and the size of the training data
class_shares = y_train_smoke.value_counts(normalize=True).round(3).to_dict()
print("Class distribution:", class_shares)
n_samples, n_features = X_train_smoke.shape
print(f"Features: {n_features}, Samples: {n_samples}\n")

# Tune on a 10% random subsample of the training rows to keep the search fast
X_sample = X_train_smoke.sample(frac=0.1, random_state=101)
y_sample = y_train_smoke.loc[X_sample.index]

# Standardize the features, then fit a logistic regression
scaler_step = ('scaler', StandardScaler())
logreg_step = ('logreg', LogisticRegression(random_state=0, max_iter=2000))
pipeline = Pipeline([scaler_step, logreg_step])

# Minimal grid: 2 values of C x 2 class-weight settings = 4 candidates
param_grid = dict(
    logreg__C=[1, 10],
    logreg__solver=['lbfgs'],
    logreg__class_weight=[None, 'balanced'],
)

# 2-fold cross-validated search on the subsample
grid = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=-1, verbose=1)
grid.fit(X_sample, y_sample)

print(f"\nBest params: {grid.best_params_}")
print(f"Best CV score: {grid.best_score_:.4f}")

# Refit on the full training set with the selected hyperparameters
pipeline.set_params(**grid.best_params_)
pipeline.fit(X_train_smoke, y_train_smoke)

# Accuracy on the training and held-out test sets
train_acc = pipeline.score(X_train_smoke, y_train_smoke)
test_acc = pipeline.score(X_test_smoke, y_test_smoke)

banner_rule = "=" * 50
print("\n" + banner_rule)
print("SMOKING STATUS - FINAL RESULTS")
print(banner_rule)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}")
print(banner_rule)

logmodel_smoke_final = pipeline

### B. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Small sample used only for the hyperparameter search
X_sample_rf = X_train_smoke.sample(frac=0.1, random_state=101)
y_sample_rf = y_train_smoke.loc[X_sample_rf.index]

# Single-step pipeline: no scaler is included for the forest; the Pipeline wrapper
# keeps the same structure as the logistic-regression model
forest_step = ('rf', RandomForestClassifier(random_state=0, n_jobs=-1))
pipeline_rf = Pipeline([forest_step])

# Minimal grid - a compromise between accuracy and memory use
param_grid_rf = dict(
    rf__n_estimators=[100, 200],
    rf__max_depth=[8, 16],
    rf__class_weight=[None, 'balanced'],
)

grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=2, n_jobs=-1, verbose=1)
grid_rf.fit(X_sample_rf, y_sample_rf)

print(f"\nBest params: {grid_rf.best_params_}")
print(f"Best CV score: {grid_rf.best_score_:.4f}")

# Train the final model on the full training set with the best parameters
pipeline_rf.set_params(**grid_rf.best_params_)
pipeline_rf.fit(X_train_smoke, y_train_smoke)

train_acc_rf = pipeline_rf.score(X_train_smoke, y_train_smoke)
test_acc_rf = pipeline_rf.score(X_test_smoke, y_test_smoke)

banner_rule = "=" * 50
print("\n" + banner_rule)
print("SMOKING STATUS - RANDOM FOREST")
print(banner_rule)
print(f"Training Accuracy: {train_acc_rf:.4f}")
print(f"Test Accuracy:     {test_acc_rf:.4f}")
print(banner_rule)

rf_smoke_final = pipeline_rf


In [None]:
# Feature importances of the fitted smoking random forest, largest first
smoking_forest = rf_smoke_final.named_steps['rf']
importances = smoking_forest.feature_importances_
features = X_train_smoke.columns
ranked_importances = sorted(zip(importances, features), reverse=True)
for imp, feat in ranked_importances:
    print(f"{feat}: {imp:.4f}")

## 6. Modeling - Task 2: Drinking Status

### A. Logistic Regression

In [None]:
# Quick look at the class balance and the size of the training data
drink_class_shares = y_train_drink.value_counts(normalize=True).round(3).to_dict()
print("Class distribution:", drink_class_shares)
n_drink_samples, n_drink_features = X_train_drink.shape
print(f"Features: {n_drink_features}, Samples: {n_drink_samples}\n")

# Tune on a 10% random subsample of the training rows to keep the search fast
X_sample_drink = X_train_drink.sample(frac=0.1, random_state=101)
y_sample_drink = y_train_drink.loc[X_sample_drink.index]

# Standardize the features, then fit a logistic regression
pipeline_drink = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=0, max_iter=2000)),
])

# Minimal grid: 2 values of C x 2 class-weight settings = 4 candidates
param_grid_drink = dict(
    logreg__C=[1, 10],
    logreg__solver=['lbfgs'],
    logreg__class_weight=[None, 'balanced'],
)

# 2-fold cross-validated search on the subsample
grid_drink = GridSearchCV(pipeline_drink, param_grid_drink, cv=2, n_jobs=-1, verbose=1)
grid_drink.fit(X_sample_drink, y_sample_drink)

print(f"\nBest params: {grid_drink.best_params_}")
print(f"Best CV score: {grid_drink.best_score_:.4f}")

# Refit on the full training set with the selected hyperparameters
pipeline_drink.set_params(**grid_drink.best_params_)
pipeline_drink.fit(X_train_drink, y_train_drink)

# Accuracy on the training and held-out test sets
train_acc_drink = pipeline_drink.score(X_train_drink, y_train_drink)
test_acc_drink = pipeline_drink.score(X_test_drink, y_test_drink)

drink_banner_rule = "=" * 50
print("\n" + drink_banner_rule)
print("DRINKING STATUS - FINAL RESULTS")
print(drink_banner_rule)
print(f"Training Accuracy: {train_acc_drink:.4f}")
print(f"Test Accuracy:     {test_acc_drink:.4f}")
print(drink_banner_rule)

logmodel_drink_final = pipeline_drink

### B. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# All names in this cell carry a `_drink` suffix, matching the drinking
# logistic-regression cell, so the smoking random-forest objects and accuracies
# are not overwritten and both tasks stay available for later comparison.

# Small sample used only for the hyperparameter search
X_sample_rf_drink = X_train_drink.sample(frac=0.1, random_state=101)
y_sample_rf_drink = y_train_drink.loc[X_sample_rf_drink.index]

# Single-step pipeline: no scaler is included for the forest; the Pipeline wrapper
# keeps the same structure as the logistic-regression model
pipeline_rf_drink = Pipeline([
    ('rf', RandomForestClassifier(random_state=0, n_jobs=-1))
])

# Minimal grid - a compromise between accuracy and memory use
param_grid_rf_drink = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [8, 16],
    'rf__class_weight': [None, 'balanced']
}

# 2-fold cross-validated search on the subsample
grid_rf_drink = GridSearchCV(pipeline_rf_drink, param_grid_rf_drink, cv=2, n_jobs=-1, verbose=1)
grid_rf_drink.fit(X_sample_rf_drink, y_sample_rf_drink)

print(f"\nBest params: {grid_rf_drink.best_params_}")
print(f"Best CV score: {grid_rf_drink.best_score_:.4f}")

# Train the final model on the full training set with the best parameters
pipeline_rf_drink.set_params(**grid_rf_drink.best_params_)
pipeline_rf_drink.fit(X_train_drink, y_train_drink)

train_acc_rf_drink = pipeline_rf_drink.score(X_train_drink, y_train_drink)
test_acc_rf_drink = pipeline_rf_drink.score(X_test_drink, y_test_drink)

print("\n" + "="*50)
print("DRINKING STATUS - RANDOM FOREST")
print("="*50)
print(f"Training Accuracy: {train_acc_rf_drink:.4f}")
print(f"Test Accuracy:     {test_acc_rf_drink:.4f}")
print("="*50)

rf_drink_final = pipeline_rf_drink

In [None]:
# Feature importances of the fitted drinking random forest, largest first
drinking_forest = rf_drink_final.named_steps['rf']
importances = drinking_forest.feature_importances_
features = X_train_drink.columns
ranked_drink_importances = sorted(zip(importances, features), reverse=True)
for imp, feat in ranked_drink_importances:
    print(f"{feat}: {imp:.4f}")

## 7. Model Evaluation

## 8. Insights

## 9. Conclusion