In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load data: the preprocessed Telco churn table, read from the working directory.
df = pd.read_csv('preprocessed_telco_churn.csv')

# Clean column names (remove leading/trailing whitespace)
df.columns = df.columns.str.strip()

# Fix TotalCharges: values that cannot be parsed as numbers (e.g. blank
# strings) are coerced to NaN, leaving a numeric column.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Impute missing TotalCharges with the median.
# The result is assigned back to the column instead of calling
# `df["TotalCharges"].fillna(..., inplace=True)`: the in-place call operates
# on an intermediate object, raises a FutureWarning on current pandas, and
# leaves `df` unchanged once copy-on-write is the default (pandas 3.0).
# NOTE: the median is computed over every row of `df`, i.e. before any
# train/test split is made.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())


# Integer-encode every object-dtype column on a copy, so `df` keeps the
# original values.
df_encoded = df.copy()
label_enc = LabelEncoder()

for col in df.columns:
    # Columns that are not object dtype are carried over untouched.
    if df[col].dtype != "object":
        continue
    # The single encoder is refit on each column, so after the loop it only
    # remembers the classes of the last column it encoded.
    df_encoded[col] = label_enc.fit_transform(df[col])

# Target is the "Churn" column; every remaining column is a feature.
y = df_encoded["Churn"]
X = df_encoded.drop(columns="Churn")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Depth-limited decision tree; class_weight="balanced" reweights samples
# inversely to class frequency. `fit` returns the estimator itself, so the
# fitted model is bound to `clf` directly.
clf = DecisionTreeClassifier(
    max_depth=5,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = clf.predict(X_test)

# Score the held-out predictions, then report each metric in turn.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

# Pair each feature name with the importance the fitted tree assigned to it,
# then rank from most to least important.
importances = pd.DataFrame(
    {"Feature": X.columns, "Importance": clf.feature_importances_}
)
importances = importances.sort_values(by="Importance", ascending=False)

print("\nFeature Importances:\n", importances)


Accuracy: 0.7345635202271115

Confusion Matrix:
 [[733 302]
 [ 72 302]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.71      0.80      1035
           1       0.50      0.81      0.62       374

    accuracy                           0.73      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.73      0.75      1409


Feature Importances:
                                   Feature  Importance
19                      Contract_Two year    0.405269
18                      Contract_One year    0.253452
10            InternetService_Fiber optic    0.111558
4                                  tenure    0.094694
17                    StreamingMovies_Yes    0.034102
8                            TotalCharges    0.027222
21         PaymentMethod_Electronic check    0.020621
7                          MonthlyCharges    0.019440
11                     InternetService_No    0.015233
5            

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)
