# Teleconsulta Absenteísmo — Dataset Combinado (Real + Sintético)

Este notebook usa o dataset `appointments_combined.csv`, resultado da junção de dados públicos (Kaggle No-Show Appointments) e dados sintéticos enriquecidos com variáveis de teleconsulta.

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the combined (real + synthetic) appointments dataset from disk.
csv_path = '/mnt/data/teleconsulta_dataset_combined/appointments_combined.csv'
df = pd.read_csv(csv_path, low_memory=False)

# Report (rows, columns) and, as the cell's last expression, preview the
# first rows of the frame.
print('Tamanho:', df.shape)
df.head()

## Pré-processamento básico

In [None]:
# Label: keep only rows with a known outcome and encode the positive class
# (1) as outcome == 'no_show', everything else as 0.
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the originally loaded data.
df = df[df['outcome'].notna()].copy()
df['target'] = np.where(df['outcome'] == 'no_show', 1, 0)

# Feature selection: drop the raw outcome columns, identifiers and raw date
# columns. 'target' is excluded as well: it was just added to df, so leaving
# it among the features would leak the label into X and make every metric
# computed later meaningless.
ignore_cols = ['outcome', 'no_show_flag', 'patientid', 'appointmentid', 'scheduledday', 'appointmentday']
excluded_cols = set(ignore_cols) | {'target'}
features = [c for c in df.columns if c not in excluded_cols]

# Cast object (categorical) columns to string so get_dummies sees a single
# consistent type per column; missing values become the literal string 'nan'
# and therefore their own category.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str)

# Drop very high-cardinality columns (more than 500 distinct values) to keep
# the one-hot encoded matrix small.
# NOTE(review): this check runs on every feature, numeric ones included, so a
# continuous numeric column with > 500 distinct values is dropped too —
# confirm that this is intended.
high_card_cols = [c for c in features if df[c].nunique() > 500]
features = [c for c in features if c not in high_card_cols]

X = df[features]
y = df['target']

# One-hot encode the categorical columns; drop_first removes one level per
# column to avoid redundant (perfectly collinear) dummies.
X = pd.get_dummies(X, drop_first=True)

# 75/25 split, stratified on the label so both partitions keep the same
# no-show rate; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
print('Treino:', X_train.shape, 'Teste:', X_test.shape)

## Treino do modelo (LightGBM ou RandomForest fallback)

In [None]:
# Fit the primary LightGBM model. If anything goes wrong while building or
# fitting it, fall back to a RandomForest so the notebook still produces a
# baseline.
try:
    model = lgb.LGBMClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)
except Exception as e:
    print('LightGBM não disponível, usando RandomForest fallback.')
    # The handler catches every Exception, not only a missing LightGBM
    # install, so show the real cause instead of silently discarding it
    # (e.g. a data problem raised inside fit()).
    print('Motivo:', repr(e))
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)

# Hard class predictions and, when the estimator exposes predict_proba, the
# predicted probability of the positive class (column 1).
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

# Hold-out metrics. ROC-AUC needs scores rather than labels, so it is only
# reported when probabilities are available.
print('Acurácia:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
if y_prob is not None:
    print('ROC-AUC:', roc_auc_score(y_test, y_prob))

## Importância das variáveis

In [None]:
# Plot the 15 highest-scoring features, provided the fitted estimator exposes
# feature_importances_.
if hasattr(model, 'feature_importances_'):
    # Pair each importance score with its (one-hot encoded) column name.
    importances = pd.Series(data=model.feature_importances_, index=X_train.columns)
    # Highest score first, then keep the leading 15 entries.
    top_feats = importances.sort_values(ascending=False).iloc[:15]

    # Horizontal bar chart: score on the x axis, feature name on the y axis.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x=top_feats.values, y=top_feats.index, ax=ax)
    ax.set_title('Top 15 Features mais importantes')
    plt.show()

## Conclusão

Este modelo baseline mostra quais variáveis influenciam o absenteísmo em teleconsultas, combinando dados reais (Kaggle) e sintéticos (HC).