In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load the training data from 'churn_train.csv' (a path relative to the
# notebook's working directory) into a DataFrame.
train_df = pd.read_csv('churn_train.csv')

In [4]:
def preprocess_data(df, categorical_columns=None):
    """Label-encode categorical columns on a copy of ``df``.

    A separate ``LabelEncoder`` is fitted for every column so that the exact
    same value-to-integer mapping can be re-applied to other data later.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
    categorical_columns : iterable of str, optional
        Columns to encode. Defaults to
        ``['State', 'International plan', 'Voice mail plan', 'Churn']``.

    Returns
    -------
    tuple
        ``(encoded_df, label_encoders)`` where ``label_encoders`` maps each
        encoded column name to the ``LabelEncoder`` fitted on that column.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df``.
    """
    if categorical_columns is None:
        categorical_columns = ['State', 'International plan', 'Voice mail plan', 'Churn']
    columns = list(categorical_columns)

    # Fail early, naming every missing column, instead of stopping at the
    # first bad lookup halfway through the loop.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    df = df.copy()

    # Convert categorical columns to numerical values, one encoder per column
    label_encoders = {}
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    return df, label_encoders

In [5]:
# Preprocess training data: label-encode the categorical columns and keep the
# fitted encoders so the same mappings can be reused on the test set.
# NOTE(review): this rebinds `train_df` to the encoded frame, so re-running this
# cell without first re-running the load cell fits the encoders on
# already-encoded values.
train_df, label_encoders = preprocess_data(train_df)

In [6]:
# Separate the target column from the features, then hold out 20% of the rows
# for validation (fixed seed so the split is reproducible).
y = train_df['Churn']
X = train_df.drop('Churn', axis=1)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Hyperparameter search space for the decision tree
# (4 depths x 3 split sizes x 3 leaf sizes = 36 candidate combinations).
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [8]:
# Exhaustively search the grid with 5-fold cross-validation, scoring each
# candidate by accuracy and using every available core.
dt_model = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
# Kept as the final expression so the fitted search object is displayed.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 10, None],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             scoring='accuracy')

In [9]:
# Best model: the estimator that GridSearchCV refit using the best-scoring
# hyperparameter combination.
best_dt_model = grid_search.best_estimator_

In [10]:
# Score the tuned tree on the held-out validation rows.
y_val_pred = best_dt_model.predict(X_val)

# Overall accuracy first ...
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# ... then per-class precision / recall / F1.
val_report = classification_report(y_val, y_val_pred)
print('Validation Classification Report:\n', val_report)

Validation Accuracy: 0.9307
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96       455
           1       0.89      0.61      0.72        79

    accuracy                           0.93       534
   macro avg       0.91      0.80      0.84       534
weighted avg       0.93      0.93      0.93       534



In [11]:
# Load the test data from 'churn_test.csv' (a path relative to the notebook's
# working directory) into a DataFrame.
test_df = pd.read_csv('churn_test.csv')

In [12]:
def preprocess_test_data(df, label_encoders, skip_missing=False):
    """Apply already-fitted label encoders to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is copied first, so the caller's frame is not
        modified.
    label_encoders : dict
        Maps column name to a fitted encoder exposing ``transform``.
    skip_missing : bool, default False
        When True, encoders whose column is absent from ``df`` are skipped
        (for example unlabeled data that has no target column). When False,
        a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with every encoded column replaced by the encoder output.

    Raises
    ------
    KeyError
        If a column is missing and ``skip_missing`` is False.
    ValueError
        If an encoder rejects the values of a column; the message names the
        offending column and includes the encoder's own message.
    """
    df = df.copy()
    for col, encoder in label_encoders.items():
        if skip_missing and col not in df.columns:
            continue
        try:
            df[col] = encoder.transform(df[col])
        except ValueError as exc:
            # The encoder's own message does not say which column failed.
            raise ValueError(
                f"Column {col!r} could not be encoded with its fitted encoder: {exc}"
            ) from exc
    return df

In [13]:
# Encode the test data with the encoders that were fitted on the training data.
# NOTE(review): this rebinds `test_df`, so re-running this cell without
# reloading the test CSV passes already-encoded values to the encoders.
test_df = preprocess_test_data(test_df, label_encoders)

In [14]:
# Pull the target out of the encoded test frame; everything else is a feature.
y_test = test_df['Churn']
X_test = test_df.drop('Churn', axis=1)

In [15]:
# Predict test-set labels with the best model found by the grid search.
y_test_pred = best_dt_model.predict(X_test)

In [16]:
# Score the test-set predictions: overall accuracy, per-class report and the
# confusion matrix.
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')

test_report = classification_report(y_test, y_test_pred)
print('Test Classification Report:\n', test_report)

test_confusion = confusion_matrix(y_test, y_test_pred)
print('Test Confusion Matrix:\n', test_confusion)

Test Accuracy: 0.9415
Test Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       572
           1       0.85      0.72      0.78        95

    accuracy                           0.94       667
   macro avg       0.90      0.85      0.87       667
weighted avg       0.94      0.94      0.94       667

Test Confusion Matrix:
 [[560  12]
 [ 27  68]]
