# Swan Teleco Project 

## Importing the Libraries
**Purpose:** Import necessary libraries for data manipulation, visualization, and machine learning.

In [3]:
# Importing the main packages
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# packages for splitting and encoding the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Packages for creating the model
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

#Packages for evaluating the strength of our model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Loading the Dataset
**Purpose:** Load the project data from a CSV file into a pandas DataFrame.

In [5]:
#Importing the data
dataset = "1_-_Project_Data.csv"
df = pd.read_csv(dataset)

In [6]:
# Previewing the Data
df.head() 

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


## Explore Data Structure
**Purpose:** Examine the dataset’s columns, shape, null values, and data types.

In [8]:
df.columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value',
       'Churn Reason'],
      dtype='object')

In [9]:
df.shape

(7043, 31)

In [10]:
df.info() #checking for data type and nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

In [11]:
# Set the ‘CustomerID’ column as the index of the DataFrame for easier data manipulation. 
# We want to maintain the customer label to attach probabilities to at the end but don't want the model to 'learn' from it
df = df.set_index('CustomerID')
df.head(2)

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,No,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,No,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved


## Preparing Data for Modeling

### Train Test Splitting

**Purpose:** Separate the target variable (‘Churn Value’) from the features and split the data into training and testing sets.

In [13]:
y = df['Churn Value']  # Extract the target column
X = df.drop(columns=['Churn Value'])  # Drop the target column from the DataFrame

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.head(2)

Unnamed: 0_level_0,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Reason
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9571-EDEBV,1,United States,California,San Diego,92119,"32.802959, -117.027095",32.802959,-117.027095,Male,No,...,No,Yes,Yes,One year,Yes,Credit card (automatic),98.65,6962.85,No,
7562-GSUHK,1,United States,California,Jamestown,95327,"37.84771, -120.486589",37.84771,-120.486589,Female,No,...,No,No,Yes,Month-to-month,Yes,Credit card (automatic),93.2,1573.7,Yes,Competitor made better offer


In [15]:
y.head()

CustomerID
3668-QPYBK    1
9237-HQITU    1
9305-CDSKC    1
7892-POOKP    1
0280-XJGEX    1
Name: Churn Value, dtype: int64

## Feature engineering 
**Purpose:** Encode categorical variables and perform other feature engineering steps to prepare the data for modeling.

In [17]:
def train_encode(df):
    # Create encoder
    oe = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
    # Fit on training data
    oe.fit(df[['City']])
    return oe

In [18]:
oe=train_encode(df)

In [19]:
def feature_eng(df):
    df= df.copy()
    # Label encode 'City'
    df['City Number'] = oe.transform(df[['City']])
    
    # Mapping categorical variables to numerical values
    df['Gender'] = df['Gender'].map({'Female':0, 'Male':1})
    df['Senior Citizen'] = df['Senior Citizen'].map({'No':0, 'Yes':1})
    df['Partner'] = df['Partner'].map({'No':0, 'Yes':1})
    df['Dependents'] = df['Dependents'].map({'No':0, 'Yes':1})
    df['Phone Service'] = df['Phone Service'].map({'No':0, 'Yes':1})
    df['Multiple Lines'] = df['Multiple Lines'].map({'No':0, 'Yes':1, 'No phone service':2})
    df['Internet Service'] = df['Internet Service'].map({'DSL':0, 'Fiber optic':1, 'No':2})
    df['Online Security'] = df['Online Security'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Online Backup'] = df['Online Backup'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Device Protection'] = df['Device Protection'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Tech Support'] = df['Tech Support'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Streaming TV'] = df['Streaming TV'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Streaming Movies'] = df['Streaming Movies'].map({'No':0, 'Yes':1, 'No internet service':2})
    df['Contract'] = df['Contract'].map({'Month-to-month':0, 'Two year':1, 'One year':2})
    df['Paperless Billing'] = df['Paperless Billing'].map({'No':0, 'Yes':1})
    df['Payment Method'] = df['Payment Method'].map({'Mailed check':0, 'Electronic check':1, 'Bank transfer (automatic)':2, 'Credit card (automatic)':3})
    
    # Convert 'Total Charges' column to float
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
    
    # Fill null values in 'Total Charges' with 0
    df['Total Charges'].fillna(0, inplace=True)
    
    # Add additional charges column 
    df['Additional charges'] = df['Total Charges'] - (df['Monthly Charges'] * df['Tenure Months'])
    
    # Reorder columns
    cols = ['City Number', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months',
            'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security',
            'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
            'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
            'Monthly Charges', 'Total Charges', 'Additional charges']
    df = df[cols]
    
    return df

In [20]:
X_train_fe = feature_eng(X_train)

In [21]:
X_test_fe = feature_eng(X_test)

## Building and Tuning the Model
### Random Forest Model 
**Purpose:** Initialize the Random Forest model, select hyperparameters, and use GridSearchCV to find the best model configuration.

In [23]:
rf = RF() # Random Forests

#Step 1: Select parameters
rf_params = {
    'n_estimators': [10,20,50], 
    'max_depth': [3, 4, 5]
}

# Create a grid search object
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, verbose = 1) 

# Fit grid search on the training set
gs.fit(X_train_fe, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


## Evaluating the Model
**Purpose:** Output the best score from GridSearchCV, feature importances, and evaluate the model’s accuracy on both training and testing sets.

In [25]:
# Save my best model as a random forest model
best_rf = gs.best_estimator_
best_rf

In [26]:
# print score given by best estimator
print(gs.best_score_)

0.8003224581206332


In [43]:
y_pred_train = best_rf.predict(X_train_fe)
y_pred_test = best_rf.predict(X_test_fe)

In [45]:
def get_results(actual, predicted):
    print("The confusion matrix for your predictions is:")
    print(metrics.confusion_matrix(actual, predicted), "\n")
    print(f'The accuracy of your model is: {metrics.accuracy_score(actual, predicted)}')
    print(f'The recall of your model is: {metrics.recall_score(actual, predicted)}')
    print(f'The precision of your model is: {metrics.precision_score(actual, predicted)}')
    print(f'The F1-score of your model is: {metrics.f1_score(actual, predicted)}')

In [57]:
print('The training results are :')
get_results(y_train, y_pred_train)

The training results are :
The confusion matrix for your predictions is:
[[3861  304]
 [ 757  712]] 

The accuracy of your model is: 0.8116790912318069
The recall of your model is: 0.4846834581347856
The precision of your model is: 0.7007874015748031
The F1-score of your model is: 0.5730382293762576


In [55]:
print('The testing results are :')
get_results(y_test, y_pred_test)

The testing results are :
The confusion matrix for your predictions is:
[[924  85]
 [205 195]] 

The accuracy of your model is: 0.794180269694819
The recall of your model is: 0.4875
The precision of your model is: 0.6964285714285714
The F1-score of your model is: 0.5735294117647058


These scores are really similar and shows that we have avoided overfitting. We will continue our analysis using this model.

In [103]:
feat_import = pd.DataFrame(zip(X_train_fe.columns, np.round(100*(best_rf.feature_importances_),1)), columns = ['Feature', 'Importance(%)'])
feat_import=feat_import.sort_values(by ='Importance(%)', ascending=False)
feat_import=feat_import.reset_index(drop=True)
feat_import

Unnamed: 0,Feature,Importance(%)
0,Contract,22.4
1,Tenure Months,13.9
2,Total Charges,11.9
3,Online Security,7.5
4,Dependents,7.1
5,Tech Support,6.8
6,Internet Service,6.7
7,Monthly Charges,5.5
8,Online Backup,4.2
9,Payment Method,2.8


## Final Results
**Purpose:** Here, the probabilities of churning for each customer have been added and the csv files have been generated.

In [69]:
results = df.copy()
results = results.drop(columns=['Churn Value'])
results = feature_eng(results)
results1 = results.copy()
results['y_pred'] = best_rf.predict(results1)
results['y_real'] = df['Churn Value']
results['Probability of Churning'] = np.round(best_rf.predict_proba(results1)[:,1],3)

results.head()

Unnamed: 0_level_0,City Number,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,...,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Additional charges,y_pred,y_real,Probability of Churning
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3668-QPYBK,562.0,1,0,0,0,2,1,0,0,1,...,0,0,1,0,53.85,108.15,0.45,1,1,0.501
9237-HQITU,562.0,0,0,0,1,2,1,0,1,0,...,0,0,1,1,70.7,151.65,10.25,1,1,0.606
9305-CDSKC,562.0,0,0,0,1,8,1,1,1,0,...,1,0,1,1,99.65,820.5,23.3,0,1,0.412
7892-POOKP,562.0,0,0,1,1,28,1,1,1,0,...,1,0,1,1,104.8,3046.05,111.65,0,1,0.244
0280-XJGEX,562.0,1,0,0,1,49,1,1,1,0,...,1,0,1,2,103.7,5036.3,-45.0,0,1,0.178


In [71]:
worst_500 = results[results['y_real']==0][['Probability of Churning']].sort_values(by = 'Probability of Churning', ascending = False).head(500)
worst_500

Unnamed: 0_level_0,Probability of Churning
CustomerID,Unnamed: 1_level_1
7577-SWIFR,0.762
7439-DKZTW,0.754
4912-PIGUY,0.747
7465-ZZRVX,0.730
9603-OAIHC,0.730
...,...
8212-CRQXP,0.472
5019-GQVCR,0.472
2507-QZPQS,0.472
5222-IMUKT,0.471


In [73]:
All_Customers=results[['Probability of Churning']].sort_values(by='Probability of Churning',ascending= False)
All_Customers

Unnamed: 0_level_0,Probability of Churning
CustomerID,Unnamed: 1_level_1
3027-ZTDHO,0.776
2725-KXXWT,0.776
7274-RTAPZ,0.776
9725-SCPZG,0.776
0970-ETWGE,0.776
...,...
6340-DACFT,0.005
1830-IPXVJ,0.005
7064-JHXCE,0.005
2050-ONYDQ,0.005


In [75]:
worst_500.to_csv('Most likely to churn 500.csv')
All_Customers.to_csv('Probability of Churning for All Customers.csv')
feat_import.to_csv('List of Feature Importances Ranked.csv', index=False)

## Further Considerations

**Purpose:** To see if there was any merit in changing to a different model for better metric scores

In [78]:

def log_eng(df):
    df = df.copy()
    # Mapping categorical variables to numerical values
    df['Gender'] = df['Gender'].map({'Female':0, 'Male':1})
    df['Senior Citizen'] = df['Senior Citizen'].map({'No':0, 'Yes':1})
    df['Partner'] = df['Partner'].map({'No':0, 'Yes':1})
    df['Dependents'] = df['Dependents'].map({'No':0, 'Yes':1})
    df['Phone Service'] = df['Phone Service'].map({'No':0, 'Yes':1})
    df['Paperless Billing'] = df['Paperless Billing'].map({'No':0, 'Yes':1})
    
    #One Hot Encoding the features that have more than one value
    dum_cols = ['Multiple Lines', 'Internet Service', 'Online Security',
            'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
            'Streaming Movies', 'Contract', 'Payment Method']
    
    for x in dum_cols:
        df = pd.get_dummies(data = df, columns = [x], prefix=f'{x}', drop_first=True, dtype= int)

    # Convert 'Total Charges' column to float
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
    
    # Fill null values in 'Total Charges' with 0
    df['Total Charges'].fillna(0, inplace=True)
    
    # Add additional charges column 
    df['Additional charges'] = df['Total Charges'] - (df['Monthly Charges'] * df['Tenure Months'])
    
    # Select columns
    cols = ['Gender', 'Senior Citizen', 'Partner', 'Dependents',
       'Tenure Months', 
        'Phone Service', 'Paperless Billing',
       'Monthly Charges',
        'Total Charges',
        'Multiple Lines_No phone service', 'Multiple Lines_Yes',
       'Internet Service_Fiber optic', 'Internet Service_No',
       'Online Security_No internet service', 'Online Security_Yes',
       'Online Backup_No internet service', 'Online Backup_Yes',
       'Device Protection_No internet service', 'Device Protection_Yes',
       'Tech Support_No internet service', 'Tech Support_Yes',
       'Streaming TV_No internet service', 'Streaming TV_Yes',
       'Streaming Movies_No internet service', 'Streaming Movies_Yes',
       'Contract_One year', 'Contract_Two year',
       'Payment Method_Credit card (automatic)',
       'Payment Method_Electronic check', 'Payment Method_Mailed check'
        ,'Additional charges']
    df = df[cols]
    
    return df

In [79]:
df_l =log_eng(df)

In [81]:
df_l.head()

Unnamed: 0_level_0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Paperless Billing,Monthly Charges,Total Charges,Multiple Lines_No phone service,...,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Additional charges
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3668-QPYBK,1,0,0,0,2,1,1,53.85,108.15,0,...,0,0,0,0,0,0,0,0,1,0.45
9237-HQITU,0,0,0,1,2,1,1,70.7,151.65,0,...,0,0,0,0,0,0,0,1,0,10.25
9305-CDSKC,0,0,0,1,8,1,1,99.65,820.5,0,...,0,1,0,1,0,0,0,1,0,23.3
7892-POOKP,0,0,1,1,28,1,1,104.8,3046.05,0,...,0,1,0,1,0,0,0,1,0,111.65
0280-XJGEX,1,0,0,1,49,1,1,103.7,5036.3,0,...,0,1,0,1,0,0,0,0,0,-45.0


In [83]:
X_train_lr = log_eng(X_train)
X_test_lr = log_eng(X_test)

In [85]:
from sklearn.linear_model import LogisticRegression
logreg= LogisticRegression(max_iter = 350)
logreg.fit(X_train_lr,y_train)


In [86]:
y_pred_train_lr = logreg.predict(X_train_lr)

In [87]:
get_results(y_train,y_pred_train_lr)

The confusion matrix for your predictions is:
[[3756  409]
 [ 627  842]] 

The accuracy of your model is: 0.8161164359247426
The recall of your model is: 0.5731790333560245
The precision of your model is: 0.6730615507593924
The F1-score of your model is: 0.6191176470588236


In [89]:
y_pred_test_lr = logreg.predict(X_test_lr)

In [90]:
get_results(y_test,y_pred_test_lr)

The confusion matrix for your predictions is:
[[901 108]
 [173 227]] 

The accuracy of your model is: 0.8005677785663591
The recall of your model is: 0.5675
The precision of your model is: 0.6776119402985075
The F1-score of your model is: 0.6176870748299319


You can see from these numbers that the scores produced by this model aren't much different from the previous model. Due to time constraints this was by no means a perfect model but it shows us that our random forest model is valid and yeilds good information on the likelhood of customers churning.