# Swan Teleco Project 

## Importing the Libraries
**Purpose:** Import necessary libraries for data manipulation, visualization, and machine learning.

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Loading the Dataset
**Purpose:** Load the project data from a CSV file into a pandas DataFrame.

In [2]:
dataset = "1_-_Project_Data.csv"
df = pd.read_csv(dataset)

In [3]:
df.head() # Previewing the Data

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


## Explore Data Structure
**Purpose:** Examine the dataset’s columns, shape, null values, and data types.

In [4]:
df.columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value',
       'Churn Reason'],
      dtype='object')

In [5]:
df.shape

(7043, 31)

In [6]:
df.isnull().sum() # No nulls 

CustomerID              0
Count                   0
Country                 0
State                   0
City                    0
Zip Code                0
Lat Long                0
Latitude                0
Longitude               0
Gender                  0
Senior Citizen          0
Partner                 0
Dependents              0
Tenure Months           0
Phone Service           0
Multiple Lines          0
Internet Service        0
Online Security         0
Online Backup           0
Device Protection       0
Tech Support            0
Streaming TV            0
Streaming Movies        0
Contract                0
Paperless Billing       0
Payment Method          0
Monthly Charges         0
Total Charges           0
Churn Label             0
Churn Value             0
Churn Reason         5174
dtype: int64

In [7]:
df.dtypes

CustomerID            object
Count                  int64
Country               object
State                 object
City                  object
Zip Code               int64
Lat Long              object
Latitude             float64
Longitude            float64
Gender                object
Senior Citizen        object
Partner               object
Dependents            object
Tenure Months          int64
Phone Service         object
Multiple Lines        object
Internet Service      object
Online Security       object
Online Backup         object
Device Protection     object
Tech Support          object
Streaming TV          object
Streaming Movies      object
Contract              object
Paperless Billing     object
Payment Method        object
Monthly Charges      float64
Total Charges         object
Churn Label           object
Churn Value            int64
Churn Reason          object
dtype: object

In [8]:
# Set the ‘CustomerID’ column as the index of the DataFrame for easier data manipulation.
df = df.set_index('CustomerID')
df.head()

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,No,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,No,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,No,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,No,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


## Preparing Data for Modeling

### Train Test Splitting

**Purpose:** Separate the target variable (‘Churn Value’) from the features and split the data into training and testing sets.

In [9]:
y = df['Churn Value']  # Extract the target column
X = df.drop(columns=['Churn Value'])  # Drop the target column from the DataFrame

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train.head()

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9571-EDEBV,1,United States,California,San Diego,92119,"32.802959, -117.027095",32.802959,-117.027095,Male,No,...,No,Yes,Yes,One year,Yes,Credit card (automatic),98.65,6962.85,No,
7562-GSUHK,1,United States,California,Jamestown,95327,"37.84771, -120.486589",37.84771,-120.486589,Female,No,...,No,No,Yes,Month-to-month,Yes,Credit card (automatic),93.2,1573.7,Yes,Competitor made better offer
2325-WINES,1,United States,California,Littlerock,93543,"34.505273, -117.955054",34.505273,-117.955054,Female,No,...,No,Yes,Yes,One year,Yes,Credit card (automatic),104.05,3416.85,No,
9381-NDKME,1,United States,California,El Monte,91731,"34.079934, -118.046695",34.079934,-118.046695,Female,Yes,...,No,No,Yes,Month-to-month,Yes,Credit card (automatic),40.65,933.3,Yes,Limited range of services
2164-SOQXL,1,United States,California,Los Angeles,90043,"33.988543, -118.334081",33.988543,-118.334081,Female,No,...,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.05,406.05,No,


In [11]:
y.head()

CustomerID
3668-QPYBK    1
9237-HQITU    1
9305-CDSKC    1
7892-POOKP    1
0280-XJGEX    1
Name: Churn Value, dtype: int64

## Feature engineering 
**Purpose:** Encode categorical variables and perform other feature engineering steps to prepare the data for modeling.

In [12]:
from sklearn.preprocessing import OrdinalEncoder

def train_encode(df):
    # Create encoder
    oe = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
    # Fit on training data
    oe.fit(df[['City']])
    return oe

In [13]:
oe=train_encode(df)

In [14]:
def feature_eng(df):
    
    # Label encode 'City'
    df['City Number'] = oe.transform(df[['City']])
    
    # Mapping categorical variables to numerical values
    df['Gender'] = df['Gender'].map({'Female':0, 'Male':1})
    df['Senior Citizen'] = df['Senior Citizen'].map({'No':0, 'Yes':1})
    df['Partner'] = df['Partner'].map({'No':0, 'Yes':1})
    df['Dependents'] = df['Dependents'].map({'No':0, 'Yes':1})
    df['Phone Service'] = df['Phone Service'].map({'No':0, 'Yes':1})
    df['Multiple Lines'] = df['Multiple Lines'].map({'No':0, 'Yes':1, 'No phone service':2})
    df['Internet Service'] = df['Internet Service'].map({'DSL':0, 'Fiber optic':1, 'No':2})
    df['Online Security'] = df['Online Security'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Online Backup'] = df['Online Backup'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Device Protection'] = df['Device Protection'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Tech Support'] = df['Tech Support'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Streaming TV'] = df['Streaming TV'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Streaming Movies'] = df['Streaming Movies'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Contract'] = df['Contract'].map({'Month-to-month':0, 'Two year':1, 'One year':2})
    df['Paperless Billing'] = df['Paperless Billing'].map({'No':0, 'Yes':1})
    df['Payment Method'] = df['Payment Method'].map({'Mailed check':0, 'Electronic check':1, 'Bank transfer (automatic)':2, 'Credit card (automatic)':3})
    
    # Convert 'Total Charges' column to float
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
    
    # Fill null values in 'Total Charges' with 0
    df['Total Charges'].fillna(0, inplace=True)
    
    # Add additional charges column 
    df['Additional charges'] = df['Total Charges'] - (df['Monthly Charges'] * df['Tenure Months'])
    
    # Reorder columns
    cols = ['City Number', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months',
            'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security',
            'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
            'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
            'Monthly Charges', 'Total Charges', 'Additional charges']
    df = df[cols]
    
    return df

In [15]:
X_train_fe = feature_eng(X_train)

In [16]:
X_test_fe = feature_eng(X_test)

## Building and Tuning the Model
### Random Forest Model 
**Purpose:** Initialize the Random Forest model, select hyperparameters, and use GridSearchCV to find the best model configuration.

In [17]:
rf = RF() # Random Forests

#Step 1: Select parameters
rf_params = {
    'n_estimators': [10,20,50], 
    'max_depth': [3, 4, 5]
}

# Create a grid search object
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, verbose = 1) 

# Fit grid search on the training set
gs.fit(X_train_fe, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 4, 5], 'n_estimators': [10, 20, 50]},
             verbose=1)

## Evaluating the Model
**Purpose:** Output the best score from GridSearchCV, feature importances, and evaluate the model’s accuracy on both training and testing sets.

In [18]:
# Save my best model as a random forest model
best_rf = gs.best_estimator_
best_rf

RandomForestClassifier(max_depth=5, n_estimators=20)

In [20]:
# print score given by best estimator
print(gs.best_score_)

0.8012105575877737


In [21]:
print(list(zip(X_train_fe.columns, np.round(best_rf.feature_importances_,3))))

[('City Number', 0.012), ('Gender', 0.0), ('Senior Citizen', 0.005), ('Partner', 0.001), ('Dependents', 0.057), ('Tenure Months', 0.204), ('Phone Service', 0.001), ('Multiple Lines', 0.003), ('Internet Service', 0.056), ('Online Security', 0.123), ('Online Backup', 0.053), ('Device Protection', 0.013), ('Tech Support', 0.071), ('Streaming TV', 0.006), ('Streaming Movies', 0.021), ('Contract', 0.212), ('Paperless Billing', 0.004), ('Payment Method', 0.013), ('Monthly Charges', 0.056), ('Total Charges', 0.076), ('Additional charges', 0.015)]


In [22]:
print(f'The training accuracy is :{best_rf.score(X_train_fe, y_train)}')
print(f'The testing accuracy is :{best_rf.score(X_test_fe, y_test)}')

The training accuracy is :0.8054668086616968
The testing accuracy is :0.7906316536550745
