In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load data (parsed as CSV even though the file name ends in ".xls")
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv.xls')

# Clean column names (strip leading/trailing whitespace)
df.columns = df.columns.str.strip()

# Fix TotalCharges: values that cannot be parsed as numbers (e.g. blank
# strings) are coerced to NaN, which makes the column numeric.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Impute missing TotalCharges with the column median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df["TotalCharges"]: that chained in-place call
# works on an intermediate object, raises a pandas FutureWarning, and no
# longer updates df once pandas 3.0 copy-on-write semantics apply.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split performed later in this script.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Drop customerID (an identifier, not a predictive feature)
df = df.drop(columns="customerID")

# Encode categorical variables: every object-dtype column is replaced by
# integer codes in a copy, so the cleaned `df` keeps its original values.
label_enc = LabelEncoder()
df_encoded = df.copy()

# A single encoder instance is re-fitted for each column, so after the loop
# `label_enc` only holds the classes of the last column it processed.
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for col in object_columns:
    df_encoded[col] = label_enc.fit_transform(df[col])

# Define the target (y) and the feature matrix (X)
y = df_encoded["Churn"]
X = df_encoded.drop(columns="Churn")

# Train/test split (STRATIFIED): hold out 20% of the rows, stratifying on y;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Decision tree capped at depth 5, with class_weight="balanced" so that the
# classes are weighted when fitting; fit() returns the fitted estimator.
clf = DecisionTreeClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=5,
).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = clf.predict(X_test)

# Evaluation: compute each metric on the test split, then report it
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

# Feature importances: pair each feature name with the fitted tree's
# importance score and list the most important features first.
importances = pd.DataFrame(
    {"Feature": X.columns, "Importance": clf.feature_importances_}
)
importances = importances.sort_values(by="Importance", ascending=False)

print("\nFeature Importances:\n", importances)


Accuracy: 0.7622427253371186

Confusion Matrix:
 [[810 225]
 [110 264]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.78      0.83      1035
           1       0.54      0.71      0.61       374

    accuracy                           0.76      1409
   macro avg       0.71      0.74      0.72      1409
weighted avg       0.79      0.76      0.77      1409


Feature Importances:
              Feature  Importance
14          Contract    0.625276
8     OnlineSecurity    0.107322
17    MonthlyCharges    0.094560
4             tenure    0.062896
7    InternetService    0.051217
18      TotalCharges    0.029732
16     PaymentMethod    0.015224
13   StreamingMovies    0.003943
11       TechSupport    0.003414
1      SeniorCitizen    0.003029
3         Dependents    0.002968
9       OnlineBackup    0.000420
12       StreamingTV    0.000000
10  DeviceProtection    0.000000
6      MultipleLines    0.000000
15  PaperlessBilling    0.000000

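FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.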

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)
