# Swan Teleco Project 

## Importing the Libraries
**Purpose:** Import necessary libraries for data manipulation, visualization, and machine learning.

In [3]:
# Importing the main packages
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# packages for splitting and encoding the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Packages for creating the model
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#Packages for evaluating the strength of our model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Loading the Dataset
**Purpose:** Load the project data from a CSV file into a pandas DataFrame.

In [5]:
#Importing the data
dataset = "1_-_Project_Data.csv"
df = pd.read_csv(dataset)

In [6]:
# Previewing the Data
df.head() 

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


## Explore Data Structure
**Purpose:** Examine the dataset’s columns, shape, null values, and data types.

In [11]:
df.columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value',
       'Churn Reason'],
      dtype='object')

In [13]:
df.shape

(7043, 31)

In [15]:
df.info() #checking for data type and nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

In [17]:
# Set the ‘CustomerID’ column as the index of the DataFrame for easier data manipulation. 
# We want to maintain the customer label to attach probabilities to at the end but don't want the model to 'learn' from it
df = df.set_index('CustomerID')
df.head(2)

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,No,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved


## Preparing Data for Modeling

### Train Test Splitting

**Purpose:** Separate the target variable (‘Churn Value’) from the features and split the data into training and testing sets.

In [20]:
y = df['Churn Value']  # Extract the target column
X = df.drop(columns=['Churn Value'])  # Drop the target column from the DataFrame

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X_train.head(2)

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9571-EDEBV,1,United States,California,San Diego,92119,"32.802959, -117.027095",32.802959,-117.027095,Male,No,...,No,Yes,Yes,One year,Yes,Credit card (automatic),98.65,6962.85,No,
7562-GSUHK,1,United States,California,Jamestown,95327,"37.84771, -120.486589",37.84771,-120.486589,Female,No,...,No,No,Yes,Month-to-month,Yes,Credit card (automatic),93.2,1573.7,Yes,Competitor made better offer


In [24]:
y.head()

CustomerID
3668-QPYBK    1
9237-HQITU    1
9305-CDSKC    1
7892-POOKP    1
0280-XJGEX    1
Name: Churn Value, dtype: int64

## Feature engineering 
**Purpose:** Encode categorical variables and perform other feature engineering steps to prepare the data for modeling.

In [57]:
def train_encode(df):
    # Create encoder
    oe = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
    # Fit on training data
    oe.fit(df[['City']])
    return oe

In [58]:
oe=train_encode(df)

In [59]:
def feature_eng(df):
    
    # Label encode 'City'
    df['City Number'] = oe.transform(df[['City']])
    
    # Mapping categorical variables to numerical values
    df['Gender'] = df['Gender'].map({'Female':0, 'Male':1})
    df['Senior Citizen'] = df['Senior Citizen'].map({'No':0, 'Yes':1})
    df['Partner'] = df['Partner'].map({'No':0, 'Yes':1})
    df['Dependents'] = df['Dependents'].map({'No':0, 'Yes':1})
    df['Phone Service'] = df['Phone Service'].map({'No':0, 'Yes':1})
    df['Multiple Lines'] = df['Multiple Lines'].map({'No':0, 'Yes':1, 'No phone service':2})
    df['Internet Service'] = df['Internet Service'].map({'DSL':0, 'Fiber optic':1, 'No':2})
    df['Online Security'] = df['Online Security'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Online Backup'] = df['Online Backup'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Device Protection'] = df['Device Protection'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Tech Support'] = df['Tech Support'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Streaming TV'] = df['Streaming TV'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Streaming Movies'] = df['Streaming Movies'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Contract'] = df['Contract'].map({'Month-to-month':0, 'Two year':1, 'One year':2})
    df['Paperless Billing'] = df['Paperless Billing'].map({'No':0, 'Yes':1})
    df['Payment Method'] = df['Payment Method'].map({'Mailed check':0, 'Electronic check':1, 'Bank transfer (automatic)':2, 'Credit card (automatic)':3})
    
    # Convert 'Total Charges' column to float
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
    
    # Fill null values in 'Total Charges' with 0
    df['Total Charges'].fillna(0, inplace=True)
    
    # Add additional charges column 
    df['Additional charges'] = df['Total Charges'] - (df['Monthly Charges'] * df['Tenure Months'])
    
    # Reorder columns
    cols = ['City Number', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months',
            'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security',
            'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
            'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
            'Monthly Charges', 'Total Charges', 'Additional charges']
    df = df[cols]
    
    return df

In [60]:
X_train_fe = feature_eng(X_train)

In [61]:
X_test_fe = feature_eng(X_test)

## Building and Tuning the Model
### Random Forest Model 
**Purpose:** Initialize the Random Forest model, select hyperparameters, and use GridSearchCV to find the best model configuration.

In [63]:
rf = RF() # Random Forests

#Step 1: Select parameters
rf_params = {
    'n_estimators': [10,20,50], 
    'max_depth': [3, 4, 5]
}

# Create a grid search object
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, verbose = 1) 

# Fit grid search on the training set
gs.fit(X_train_fe, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


## Evaluating the Model
**Purpose:** Output the best score from GridSearchCV, feature importances, and evaluate the model’s accuracy on both training and testing sets.

In [65]:
# Save my best model as a random forest model
best_rf = gs.best_estimator_
best_rf

In [66]:
# print score given by best estimator
print(gs.best_score_)

0.8001440502063828


In [67]:
Feature_Importances=zip(X_train_fe.columns, np.round(best_rf.feature_importances_,3))
print(list(Feature_Importances))

[('City Number', 0.012), ('Gender', 0.002), ('Senior Citizen', 0.003), ('Partner', 0.004), ('Dependents', 0.053), ('Tenure Months', 0.159), ('Phone Service', 0.001), ('Multiple Lines', 0.002), ('Internet Service', 0.061), ('Online Security', 0.109), ('Online Backup', 0.022), ('Device Protection', 0.031), ('Tech Support', 0.134), ('Streaming TV', 0.022), ('Streaming Movies', 0.004), ('Contract', 0.165), ('Paperless Billing', 0.013), ('Payment Method', 0.025), ('Monthly Charges', 0.065), ('Total Charges', 0.101), ('Additional charges', 0.014)]


In [68]:
feat_import = pd.DataFrame(zip(X_train_fe.columns, np.round(100*(best_rf.feature_importances_),1)), columns = ['Feature', 'Importance(%)'])
feat_import=feat_import.sort_values(by ='Importance(%)', ascending=False)
feat_import=feat_import.reset_index(drop=True)
feat_import

Unnamed: 0,Feature,Importance(%)
0,Contract,16.5
1,Tenure Months,15.9
2,Tech Support,13.4
3,Online Security,10.9
4,Total Charges,10.1
5,Monthly Charges,6.5
6,Internet Service,6.1
7,Dependents,5.3
8,Device Protection,3.1
9,Payment Method,2.5


In [69]:
Feature_Importances=(zip(X_train_fe.columns, np.round(best_rf.feature_importances_,3)))
print(list(Feature_Importances))

[('City Number', 0.012), ('Gender', 0.002), ('Senior Citizen', 0.003), ('Partner', 0.004), ('Dependents', 0.053), ('Tenure Months', 0.159), ('Phone Service', 0.001), ('Multiple Lines', 0.002), ('Internet Service', 0.061), ('Online Security', 0.109), ('Online Backup', 0.022), ('Device Protection', 0.031), ('Tech Support', 0.134), ('Streaming TV', 0.022), ('Streaming Movies', 0.004), ('Contract', 0.165), ('Paperless Billing', 0.013), ('Payment Method', 0.025), ('Monthly Charges', 0.065), ('Total Charges', 0.101), ('Additional charges', 0.014)]


In [70]:
print(f'The training accuracy is :{best_rf.score(X_train_fe, y_train)}')
print(f'The testing accuracy is :{best_rf.score(X_test_fe, y_test)}')

The training accuracy is :0.8095491657791978
The testing accuracy is :0.7934705464868701


## Final Results

In [72]:
results = df.copy()
results = results.drop(columns=['Churn Value'])
results = feature_eng(results)
results1 = results.copy()
results['y_pred'] = best_rf.predict(results1)
results['y_real'] = df['Churn Value']
results['Probability of Churning'] = np.round(best_rf.predict_proba(results1)[:,1],3)

results.head()

Unnamed: 0_level_0,City Number,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,...,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Additional charges,y_pred,y_real,Probability of Churning
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3668-QPYBK,562.0,1,0,0,0,2,1,0,0,1,...,0,0,1,0,53.85,108.15,0.45,0,1,0.459
9237-HQITU,562.0,0,0,0,1,2,1,0,1,0,...,0,0,1,1,70.7,151.65,10.25,1,1,0.598
9305-CDSKC,562.0,0,0,0,1,8,1,1,1,0,...,1,0,1,1,99.65,820.5,23.3,1,1,0.519
7892-POOKP,562.0,0,0,1,1,28,1,1,1,0,...,1,0,1,1,104.8,3046.05,111.65,0,1,0.261
0280-XJGEX,562.0,1,0,0,1,49,1,1,1,0,...,1,0,1,2,103.7,5036.3,-45.0,0,1,0.249


In [73]:
worst_500 = results[results['y_real']==0][['Probability of Churning']].sort_values(by = 'Probability of Churning', ascending = False).head(500)
worst_500

Unnamed: 0_level_0,Probability of Churning
CustomerID,Unnamed: 1_level_1
7439-DKZTW,0.763
4912-PIGUY,0.760
7577-SWIFR,0.755
1452-VOQCH,0.736
0021-IKXGC,0.734
...,...
3771-PZOBW,0.451
6693-FRIRW,0.451
5144-TVGLP,0.451
1730-ZMAME,0.450


In [74]:
All_Customers=results[['Probability of Churning']].sort_values(by='Probability of Churning',ascending= False)
All_Customers

Unnamed: 0_level_0,Probability of Churning
CustomerID,Unnamed: 1_level_1
5178-LMXOP,0.776
2725-KXXWT,0.775
7216-EWTRS,0.775
0295-PPHDO,0.773
0318-QUUOB,0.771
...,...
0480-KYJVA,0.007
6621-NRZAK,0.007
8917-SZTTJ,0.007
2050-ONYDQ,0.007


In [None]:
worst_500.to_csv('Most likely to churn 500.csv')
All_Customers.to_csv('Probability of Churning for All Customers.csv')
feat_import.to_csv('List of Feature Importances Ranked.csv', index=False)