# In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier


# Load the raw churn dataset and print a quick overview: the first ten rows,
# the dtype of every column, and the number of missing values per column.
df = pd.read_csv("telco_churn.csv")
# `display` is not imported in this file; presumably the IPython/Jupyter
# built-in, so this line needs a notebook kernel -- TODO confirm.
display(df.head(n=10))
print(df.dtypes, "\nValores ausentes por coluna:", sep="\n")
print(df.isna().sum())


# Basic missing-value handling on a copy of the raw frame:
#   * columns that are more than 50% null are dropped;
#   * remaining numeric columns are filled with their median;
#   * remaining non-numeric columns are filled with their most frequent value.
# NOTE(review): the fill statistics are computed on the full dataset, i.e.
# before the train/test split further down, so test rows influence them.
# The modelling pipelines also contain their own SimpleImputer steps.
cols_with_null = [c for c in df.columns if df[c].isnull().any()]
df_clean = df.copy()
for c in cols_with_null:
    pct = df_clean[c].isnull().mean()
    if pct > 0.5:
        df_clean = df_clean.drop(columns=[c])
        continue
    if pd.api.types.is_numeric_dtype(df_clean[c]):
        fill_value = df_clean[c].median()
    else:
        fill_value = df_clean[c].mode().iloc[0]
    # Assign the filled column back explicitly. Calling
    # `df_clean[c].fillna(..., inplace=True)` operates on a temporary column
    # selection: pandas 2.x warns about it and under Copy-on-Write it leaves
    # `df_clean` unchanged.
    df_clean[c] = df_clean[c].fillna(fill_value)


# Separate features from the target and encode the target as 0/1.
# Any value whose stripped, lower-cased string form is one of
# 'yes' / '1' / 'true' / 'y' becomes 1; everything else becomes 0.
target = 'Churn'
X = df_clean.drop(columns=[target])
y = df_clean[target].map(lambda v: 1 if str(v).strip().lower() in ['yes','1','true','y'] else 0)

# Split the feature columns by type. Categorical columns are further split
# into binary (exactly two distinct values) and multi-valued (more than two).
# Columns matching neither list (e.g. categorical columns with fewer than two
# distinct values, or dtypes other than number/object/category) are not
# passed to any transformer below.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
binary_cols = [c for c in cat_cols if X[c].nunique() == 2]
multi_cols = [c for c in cat_cols if X[c].nunique() > 2]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name. Pick whichever keyword the
# installed version accepts so a dense array is requested either way.
_ohe_dense = (
    {'sparse_output': False}
    if 'sparse_output' in OneHotEncoder().get_params()
    else {'sparse': False}
)

# Numeric: median imputation followed by standardisation.
num_transformer = Pipeline([('imp', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# Multi-valued categoricals: most-frequent imputation + full one-hot encoding.
multi_cat_transformer = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', **_ohe_dense)),
])
# Binary categoricals: same imputation, but a single 0/1 column per feature.
binary_cat_transformer = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='if_binary', handle_unknown='ignore', **_ohe_dense)),
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('multi', multi_cat_transformer, multi_cols),
    ('binary', binary_cat_transformer, binary_cols)
])


# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Each pipeline receives its own clone of the preprocessor. A Pipeline fits
# its steps in place, so sharing one ColumnTransformer instance would make
# the second `fit` silently refit the transformer used by the first pipeline.
pipe_log = Pipeline([
    ('preproc', clone(preprocessor)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),
])
pipe_tree = Pipeline([
    ('preproc', clone(preprocessor)),
    ('clf', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
])
pipe_log.fit(X_train, y_train)
pipe_tree.fit(X_train, y_train)

def avaliar_modelo(pipe, X_test, y_test):
    """Print classification metrics for a fitted model on held-out data.

    Args:
        pipe: Fitted estimator (or pipeline) exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        None. Accuracy, precision, recall, F1, the confusion matrix and the
        full classification report are written to stdout.
    """
    y_pred = pipe.predict(X_test)
    # zero_division=0 makes an undefined ratio (e.g. no positive predictions)
    # count as 0 instead of emitting a warning.
    print("Acurácia:", accuracy_score(y_test, y_pred))
    print("Precisão:", precision_score(y_test, y_pred, zero_division=0))
    print("Recall:", recall_score(y_test, y_pred, zero_division=0))
    print("F1:", f1_score(y_test, y_pred, zero_division=0))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nRelatório:\n", classification_report(y_test, y_pred, zero_division=0))

# Report hold-out metrics for both fitted pipelines, one section per model.
print("=== Regressão Logística ===")
avaliar_modelo(pipe_log, X_test, y_test)
print("\n=== Decision Tree ===")
avaliar_modelo(pipe_tree, X_test, y_test)


# 5-fold stratified cross-validation over the full dataset for both models.
# cross_validate refits the given pipeline on each fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scoring = ['accuracy', 'precision', 'recall', 'f1']
scores_log = cross_validate(pipe_log, X, y, cv=cv, scoring=cv_scoring)
scores_tree = cross_validate(pipe_tree, X, y, cv=cv, scoring=cv_scoring)
# Only the `test_*` entries are metric scores; the remaining keys are timings.
print("\nMédias CV Logistic:", {k: np.mean(v) for k, v in scores_log.items() if k.startswith('test_')})
print("Médias CV Tree:", {k: np.mean(v) for k, v in scores_tree.items() if k.startswith('test_')})


# Persist the chosen model. The logistic-regression pipeline is selected
# unconditionally here, and the whole pipeline (preprocessing + classifier)
# is serialised so it can be reloaded and applied to raw feature rows.
best_pipe = pipe_log
joblib.dump(value=best_pipe, filename="modelo_churn_pipeline.joblib")

def prever_novo_cliente(dados_cliente, modelo_path="modelo_churn_pipeline.joblib", limiar=0.5):
    """Score a single customer with the persisted churn pipeline.

    Args:
        dados_cliente: Mapping of feature name to value for one customer;
            it is turned into a one-row DataFrame before prediction.
        modelo_path: Path of the joblib file holding the fitted pipeline.
        limiar: Probability threshold at or above which the customer is
            labelled as churn.

    Returns:
        A dict with ``'probabilidade'`` (float probability of the second
        class reported by ``predict_proba``) and ``'decisao'`` (``'Churn'``
        or ``'Nao-Churn'``).
    """
    # NOTE(review): joblib files are pickle-based; only load model files from
    # a trusted location.
    pipeline = joblib.load(modelo_path)
    linha = pd.DataFrame([dados_cliente])
    prob_churn = pipeline.predict_proba(linha)[0, 1]
    if prob_churn >= limiar:
        rotulo = 'Churn'
    else:
        rotulo = 'Nao-Churn'
    return {'probabilidade': float(prob_churn), 'decisao': rotulo}


# Smoke test: feed the first row of the feature matrix back through the
# saved model and print the resulting probability and decision.
exemplo = X.iloc[0].to_dict()
resultado_exemplo = prever_novo_cliente(exemplo)
print(resultado_exemplo)

# Output: <class 'ModuleNotFoundError'>: No module named 'pandas'