<a href="https://colab.research.google.com/github/christophersw/DATA660-Unit-6-Assignment/blob/main/Webster_Unit_6_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unit 6 Assignment - AutoML

by Christopher Webster

for Unit 6 Assignment, DATA 660 9040

Professor Steve Knode

February 17, 2026

## Environment Setup

In [1]:
!pip install git+https://github.com/pycaret/pycaret.git@master --upgrade

Collecting git+https://github.com/pycaret/pycaret.git@master
  Cloning https://github.com/pycaret/pycaret.git (to revision master) to /tmp/pip-req-build-rapml25k
  Running command git clone --filter=blob:none --quiet https://github.com/pycaret/pycaret.git /tmp/pip-req-build-rapml25k
  Resolved https://github.com/pycaret/pycaret.git to commit 58ec3c282d58e94727f9d5b77b49f241e9103ab3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
# Import necessary libraries

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from pycaret.classification import *

from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score


## Data Loading and Preprocessing

- Loads the Adult dataset from a *.csv file
- Review column names
- Check for missing values
- Ensure that the target variable churn is encoded as a bool, and has only bool values
- Drop phone number as this is an identifier


In [14]:
# Load data
print ("\n* * * * Loading data * * * * \n")
data = pd.read_csv('/content/Cell Phone Company Churn data.csv')
print('Data loaded: ')
print(data.shape)

# Look at column names
print ("\n* * * * Column Names * * * * \n")
print(data.columns)

# Look for missing values
print ("\n* * * * Check for Missing Values * * * * \n")
print(data.isnull().sum())

# Check that target variable is encoded as T/F
print ("\n* * * * Check Target Variable Encoding * * * * \n")
print("Unique values in 'churn' column:")
print(data['churn'].unique())
print("\nData type of 'churn' column:")
print(data['churn'].dtype)

# Convert 'area code' to object type to ensure it's treated as categorical
data['area code'] = data['area code'].astype(str)

# Drop the 'phone number' column as it's an identifier and not useful for modeling
data = data.drop(columns=['phone number'])


* * * * Loading data * * * * 

Data loaded: 
(3333, 21)

* * * * Column Names * * * * 

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

* * * * Check for Missing Values * * * * 

state                     0
account length            0
area code                 0
phone number              0
international plan        0
voice mail plan           0
number vmail messages     0
total day minutes         0
total day calls           0
total day charge          0
total eve minutes         0
total eve calls           0
total eve charge          0
total night minutes      

In [None]:
## Split data into test and training sets

In [18]:
# Split the data into training and testing sets
# PyCaret's setup will further split the training data for cross-validation.
# This split ensures an unseen test set for final evaluation.

#train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42,
                                    stratify=data['churn']) # Stratify for balanced target distribution

In [19]:
# Initialize PyCaret setup

clf = setup(data=train_data, target='churn', train_size=0.8, session_id=42)



# Compare multiple models

best_models = compare_models(include=['lr', 'rf', 'xgboost', 'lightgbm', 'svm'], n_select=5)



# Create a list to store results

results = []



# Evaluate each model

for model in best_models:

  # Get the model name

  model_name = model.__class__.__name__



  # Make predictions on test data

  predictions = predict_model(model, data=test_data)



  # Calculate confusion matrix

  cm = confusion_matrix(test_data['churn'], predictions['prediction_label'])



  # Calculate metrics

  accuracy = accuracy_score(test_data['churn'], predictions['prediction_label'])

  balanced_acc = balanced_accuracy_score(test_data['churn'], predictions['prediction_label'])



  # Calculate sensitivity and specificity for each class

  sensitivities = []

  specificities = []

  for i in range(len(cm)):

    tp = cm[i, i]

    fn = np.sum(cm[i, :]) - tp

    fp = np.sum(cm[:, i]) - tp

    tn = np.sum(cm) - (tp + fn + fp)



    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0



    sensitivities.append(sensitivity)

    specificities.append(specificity)





  # Average sensitivity and specificity across all classes

  avg_sensitivity = np.mean(sensitivities)

  avg_specificity = np.mean(specificities)





  # Store results

  results.append({

      'Model': model_name,

      'Accuracy': accuracy,

      'Balanced Accuracy': balanced_acc,

      'Avg Sensitivity': avg_sensitivity,

      'Avg Specificity': avg_specificity

  })



# Convert results to a DataFrame and display

results_df = pd.DataFrame(results)

print(results_df.to_string(index=False, float_format='{:.4f}'.format))

Unnamed: 0,Description,Value
0,Session id,42
1,Target,churn
2,Target type,Binary
3,Original data shape,"(2666, 20)"
4,Transformed data shape,"(2666, 22)"
5,Transformed train set shape,"(2132, 22)"
6,Transformed test set shape,"(534, 22)"
7,Numeric features,15
8,Categorical features,4
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.955,0.9064,0.741,0.9386,0.8253,0.8,0.809,0.88
xgboost,Extreme Gradient Boosting,0.9489,0.8989,0.7281,0.9026,0.8044,0.7755,0.7823,0.232
rf,Random Forest Classifier,0.9418,0.902,0.6504,0.9352,0.7613,0.7298,0.7486,0.93
lr,Logistic Regression,0.8602,0.815,0.1847,0.5542,0.2713,0.2171,0.2589,0.357
svm,SVM - Linear Kernel,0.7891,0.6927,0.1801,0.1043,0.0922,0.0471,0.0658,0.128


Processing:   0%|          | 0/29 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.946,0.9217,0.6804,0.9296,0.7857,0.7557,0.7677


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9475,0.9008,0.732,0.8875,0.8023,0.7723,0.7771


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9475,0.9119,0.6701,0.9559,0.7879,0.759,0.7746


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.8546,0.8136,0.1856,0.5,0.2707,0.2084,0.2402


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.7121,0.6733,0.6186,0.2791,0.3846,0.2304,0.2615


                 Model  Accuracy  Balanced Accuracy  Avg Sensitivity  Avg Specificity
        LGBMClassifier    0.9460             0.8358           0.8358           0.8358
         XGBClassifier    0.9475             0.8581           0.8581           0.8581
RandomForestClassifier    0.9475             0.8324           0.8324           0.8324
    LogisticRegression    0.8546             0.5770           0.5770           0.5770
         SGDClassifier    0.7121             0.6733           0.6733           0.6733
