#Kaggle:Telco Customer Churn
"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs."

In [2]:
# File location and type
file_location = "/FileStore/tables/WA_Fn_UseC__Telco_Customer_Churn-89c80.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
# Reference:https://stackoverflow.com/questions/44296484/how-to-replace-null-nan-or-infinite-values-to-default-value-in-spark-scala
# Reference:https://stackoverflow.com/questions/46439410/spark-treating-null-values-in-csv-column-as-null-datatype
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .option('nanValue',' ')\
  .option('nulValue',' ')\
  .load(file_location)

display(df)

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [3]:
# Create a view or table

temp_table_name = "TelcoChurnAnalysis"

df.createOrReplaceTempView(temp_table_name)

In [4]:
%sql

/* Query the created temp table in a SQL cell */

select * from `TelcoChurnAnalysis`


customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [5]:
# Let us first understand the dataset:
# We have the customer unique identifier
# Following that - categorical features such as gender, partner, dependents, multiplelLines, InternetService etc which provide info on the customers and the services that they have opted for. We also have the tenure which talks about how long the customer has been with telecom service provider.

# We also have info on monthly charges which the customer pays and the total charges paid by the customertill date. Finally we have our #dependent variable which is Churn which needs to be converted to numerical before we feed it to the dataset.


In [6]:
df.printSchema()

In [7]:
# If we look at the above schema, most of the attributes are of string type excepting Senior Citizen which is of integer type 
# Monthly Charges and Total Charges are of type double


In [8]:
# Checking out the no. of observations in the data
df.count()


In [9]:
# Finding number of nulls in each column
# reference: https://stackoverflow.com/questions/44627386/how-to-find-count-of-null-and-nan-values-for-each-column-in-a-pyspark-dataframe
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(d) | col(d).isNull(), d)).alias(d) for d in df.columns]).show()


In [10]:
# Let us find the distinct values in the PaymentMethod available for the customers
df.select('PaymentMethod').distinct().show()

In [11]:
# Exploring the unique levels in Contract type for customers
df.select('Contract').distinct().show()

In [12]:
# Let us find the distinct values in the PaymentMethod available for the customers
df.select('InternetService').distinct().show()

In [13]:
df.groupby('churn').count().show()
# By looking at at the below table
# There are more non churned users
# It is a slightly imbalanced dataset in terms of the target variable

In [14]:
df.select('tenure','TotalCharges','MonthlyCharges').describe().show()
# Checking the descriptive statistics of all the three numeric columns

In [15]:
df.stat.crosstab("SeniorCitizen","InternetService").show()
# We can see that most of the senior citizens are using fiber optics. This might mean a higher correlation, however there  might not be any causation as well.

In [16]:
df.stat.freqItems(["PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"],0.6).collect()
# Checking for frequent features with respect to certain columns by setting the threshold to 60 percentage*/
# From the analysis, it is evident that Phone Services, FiberOptic internet services & Online Backup are used by many users
# Tech Support, Device Protection, Online Security & Movie Streaming services are not used by most of the customers in the dataset

In [17]:
# Creating a Pandas dataframe to use matplotlib & Seaborn for visualizations
testquer = sqlContext.sql("select * from TelcoChurnAnalysis")
test_df = testquer.toPandas()

In [18]:
# Create the index and import libraries necessary to manipulate data and generate visualizations
test_df_indexed = test_df.reset_index()
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [19]:
test_df.info()

In [20]:
# Checking if gender has any impact on customer churn
%matplotlib inline
sns.countplot(x = "gender", hue = "Churn", data = test_df, palette="Set1")

# From the barchart, it is evident that equal amount of customers have churned in both genders & the customer base is also split evenly between both the genders. 

In [21]:
# Assessing the impact of being a senior citizen on churn rate
sns.countplot(y = "SeniorCitizen", hue = "Churn", data = test_df, palette="Set2")

# The proportion of senior citizens that are churning is higher than those in the not churning category meaning that they are to be watched out for. 

In [22]:
# Plotting churn across tenure
display(spark.sql("select cast(tenure as int),churn,count(churn) from TelcoChurnAnalysis group by tenure,churn order by cast(tenure as int)"))

#From the plot, it is evident that the highest churn happens within the first 5 - 8 months, which is probably true as most telcos offer a bonus that the customer becomes eligible for if they stay with them for >3 months. Given that the bonus / cashback takes a a while to get credited / disbursed - it is likely that people leave after that. 

tenure,churn,count(churn)
0,No,11
1,No,233
1,Yes,380
2,No,115
2,Yes,123
3,No,106
3,Yes,94
4,No,93
4,Yes,83
5,No,69


In [23]:
# Plotting the churn with respect to monthly charges
display(spark.sql("select cast(monthlycharges as int),churn,count(churn) from TelcoChurnAnalysis group by monthlycharges,churn order by cast(monthlycharges as int)"))

# From the plot, it is apparent that the proportion of customers that churn are high in the $40-$50 monthly range (probably users that are having only a single line in their plan) and for those with a monthly bill greater than $65.

monthlycharges,churn,count(churn)
18,No,1
18,No,2
18,Yes,1
18,No,4
18,No,7
18,No,1
18,No,1
18,Yes,1
18,No,1
18,No,5


In [24]:
# Visualizing the levels of internet services opted for by users & plotting churn against it.
# Reference for creating stacked bar graph : https://stackoverflow.com/questions/50319614/count-plot-with-stacked-bars-per-hue

df_plot_isp = test_df.groupby(['Churn', 'InternetService']).size().reset_index().pivot(columns='Churn', index='InternetService', values=0)
df_plot_isp.plot(kind = "bar", stacked = True, cmap = 'Pastel2')

# Close to half the customers that have opted for the fiber optic internet service are churning which indicates that the tech savvy group are not satisfied with the price / offering ratio. Given that Fiber Optic is inherently expensive compared to DSL, customers probably expect to get the best when paying a premium for it. 

#On the other hand, customers without internet are not a concern for churn at all. 

In [25]:
# Creating a visualization to check if the type of contract has an effect on churn 

display(spark.sql("select Contract, churn, count(*) as Count from TelcoChurnAnalysis group by Contract, churn"))

# The customers opting for a short term contract are the ones churning the most. 

Contract,churn,Count
Month-to-month,No,2220
Month-to-month,Yes,1655
Two year,Yes,48
One year,No,1307
One year,Yes,166
Two year,No,1647


In [26]:
# Plotting the monthly charges of customers that churn across diferent contract types
sns.catplot(x="Contract", y="MonthlyCharges", hue="Churn", kind="box", data=test_df)

# From the plot, it is evident that the median monthly charges of the customers that churn is way higher than that of those who do not churn across all types of contracts. MonthlyCharges might be a very good indicator of customer churn in this regard. 

In [27]:
# Checking the impact of MonthlyCharges & TotalCharges while simultaneously comparing it with tenure
fig, ax = plt.subplots(1,3)
sns.scatterplot(x = "MonthlyCharges", y = "tenure", hue = "Churn", style = "Churn", data = test_df, palette="Set3", ax=ax[0])
sns.scatterplot(x = "MonthlyCharges", y = "TotalCharges", hue = "Churn", style = "Churn", data = test_df, palette="Set2", ax=ax[1])
sns.scatterplot(x = "TotalCharges", y = "tenure", hue = "Churn", style = "Churn", data = test_df, palette = "Set1", ax=ax[2])
plt.tight_layout()

# From the plots, it is pretty evident that the higher monthly charges lead the customer to churn, and it even more prominent in the initial stages of the tenure. Once again, this points in the direction that tenure & MonthlyCharges could be important predictors of churn. 

In [28]:
# Calculating and plotting the correlation between the independent variables & the target variable 
# Factorizing categorical variables in order to be able to create a correlation plot - https://stackoverflow.com/questions/48035381/correlation-among-multiple-categorical-variables-pandas

corr_matrix = test_df.apply(lambda x: pd.factorize(x)[0]).corr()
sns.heatmap(corr_matrix, xticklabels=corr_matrix.columns, yticklabels=corr_matrix.columns, linewidths=1.5, cmap='YlOrBr')

In [29]:
df.drop('customerID')

In [30]:

churn_df=df
(train_data,test_data)=churn_df.randomSplit([0.7,0.3],24)


print("Records for training:"+str(train_data.count()))

print("Records for test:"+str(test_data.count()))


In [31]:
# Appyling string indexer transformer to target value
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator,StringIndexer,VectorAssembler

catColumns=["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
           "StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]

In [32]:
stages=[]

for catCol in catColumns:
  
  stringIndexer=StringIndexer(inputCol=catCol,outputCol=catCol+"Index")
  
  encoder=OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],outputCols=[catCol+"catvec"])
  
  stages+=[stringIndexer,encoder]
 

In [33]:
stages

In [34]:
label_Idx= StringIndexer(inputCol="Churn", outputCol="label")
stages += [label_Idx]

#Appyling string indexer transformer to target value

In [35]:
from pyspark.ml.feature import Imputer
imputer=Imputer(inputCols=["TotalCharges"],outputCols=["TotalChargesimputed"])
stages += [imputer]

# Totalcharges was having missing values
# Handling Missing values

In [36]:
stages

In [37]:
from pyspark.ml.feature import QuantileDiscretizer
tenure_bin=QuantileDiscretizer(numBuckets=3,inputCol="tenure",outputCol="tenure_bins")
stages += [tenure_bin]

In [38]:
stages

In [39]:
numericCols=["tenure_bins","MonthlyCharges"]
assembleInputs=assemblerInputs=[c+ "catvec" for c in catColumns]+ numericCols
assembler=VectorAssembler(inputCols=assembleInputs,outputCol="features")
stages+=[assembler]

In [40]:

pipeline=Pipeline().setStages(stages)

pipelineModel=pipeline.fit(train_data)


#setting up the various stages in a pipeline and the fitting our model with train data

In [41]:
trainprepDF=pipelineModel.transform(train_data)

In [42]:
testprepDF=pipelineModel.transform(test_data)

In [43]:
trainprepDF.head(1)
# Let us view the first column of data after all the tranformers have been applied

In [44]:
trainprepDF.select("tenure_bins").show()

In [45]:
from pyspark.ml.classification import LogisticRegression

# Creating the initial LogisticRegression Model
lr=LogisticRegression(labelCol="label",featuresCol="features",maxIter=10)


# Train model with trainingData
lrModel=lr.fit(trainprepDF)

In [46]:
print("Coefficients:" +str(lrModel.coefficients))
print("Intercept:"+str(lrModel.intercept))

In [47]:
summary=lrModel.summary

In [48]:
# Printing the metrics of the model
accuracy=summary.accuracy
falsePositiveRate=summary.weightedFalsePositiveRate
truePositiveRate=summary.weightedTruePositiveRate
fMeasure=summary.weightedFMeasure()
precision=summary.weightedPrecision
recall=summary.weightedRecall

In [49]:
display(lrModel,trainprepDF,"ROC")

False Positive Rate,True Positive Rate,Threshold
0.0,0.0,0.8176586948452697
0.0,0.0384615384615384,0.8176586948452697
0.0153846153846153,0.0384615384615384,0.7814509812634578
0.0153846153846153,0.0769230769230769,0.7570397576748995
0.0307692307692307,0.0769230769230769,0.6924007034365477
0.0307692307692307,0.1153846153846153,0.6903355759910935
0.0307692307692307,0.1538461538461538,0.67355287401764
0.0461538461538461,0.1538461538461538,0.646646368280868
0.0461538461538461,0.1923076923076923,0.6388276602933302
0.0461538461538461,0.2307692307692307,0.6333630701936241


In [50]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions=lrModel.transform(testprepDF)
evaluatorLR=BinaryClassificationEvaluator(rawPredictionCol="prediction")
area_under_curve = evaluatorLR.evaluate(predictions)

print("areunderROC=%g" %area_under_curve)

evaluatorLR.getMetricName()

In [51]:
 #Analysis:We can see that Area under ROC from train to test has come down from 84 to 72

In [52]:
results=predictions.select(['prediction','label'])
count=predictions.count()

correct=results.filter(results.prediction==results.label).count()
wrong=results.filter(results.prediction!=results.label).count()
tp=results.filter(results.prediction==1.0).filter(results.prediction==results.label).count()
fp=results.filter(results.prediction==1.0).filter(results.prediction!=results.label).count()
tn=results.filter(results.prediction==0.0).filter(results.prediction==results.label).count()
fn=results.filter(results.prediction==0.0).filter(results.prediction!=results.label).count()

accuracy=(tp+tn)/count

precision_lr=tp/(tp+fp)
recall_lr=tp/(tp+fn)
f1_score_lr=2*((precision_lr*recall_lr)/(precision_lr+recall_lr))

print("correct: %s\nWrong: %s\ntp: %s\nfp: %s\ntn: %s\nfn: %s\nAccuracy: %s\nprecision: %s\nrecall: %s" %(correct,wrong,tp,fp,tn,fn,accuracy,precision_lr,recall_lr))
print("F1 score:",f1_score_lr)

In [53]:
from pyspark.ml.classification import RandomForestClassifier

rf=RandomForestClassifier(labelCol="label",
                         featuresCol="features").setImpurity("gini").setMaxDepth(6).setNumTrees(50).setFeatureSubsetStrategy("auto").setSeed(10)

rfModel=rf.fit(trainprepDF)

In [54]:

predictionss=rfModel.transform(testprepDF)

In [55]:
results=predictionss.select(['prediction','label'])
count=predictionss.count()

correct=results.filter(results.prediction==results.label).count()
wrong=results.filter(results.prediction!=results.label).count()
tp=results.filter(results.prediction==1.0).filter(results.prediction==results.label).count()
fp=results.filter(results.prediction==1.0).filter(results.prediction!=results.label).count()
tn=results.filter(results.prediction==0.0).filter(results.prediction==results.label).count()
fn=results.filter(results.prediction==0.0).filter(results.prediction!=results.label).count()

accuracy=(tp+tn)/count

precision_rf=tp/(tp+fp) 
recall_rf=tp/(tp+fn) 
f1_score_rf=2*((precision_rf*recall_rf)/(precision_rf+recall_rf))


print("correct: %s\nWrong: %s\ntp: %s\nfp: %s\ntn: %s\nfn: %s\nAccuracy: %s\nprecision: %s\nrecall: %s" %(correct,wrong,tp,fp,tn,fn,accuracy,precision_rf,recall_rf))
print("F1 score:",f1_score_rf)

In [56]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(trainprepDF)
predictions_DecisionTree = dtModel.transform(testprepDF)

In [57]:
results=predictions_DecisionTree.select(['prediction','label'])
count=predictions_DecisionTree.count()

correct=results.filter(results.prediction==results.label).count()
wrong=results.filter(results.prediction!=results.label).count()
tp=results.filter(results.prediction==1.0).filter(results.prediction==results.label).count()
fp=results.filter(results.prediction==1.0).filter(results.prediction!=results.label).count()
tn=results.filter(results.prediction==0.0).filter(results.prediction==results.label).count()
fn=results.filter(results.prediction==0.0).filter(results.prediction!=results.label).count()


accuracy=(tp+tn)/count

precision_dt=tp/(tp+fp) 
recall_dt=tp/(tp+fn) 
f1_score_dt=2*((precision_dt*recall_dt)/(precision_dt+recall_dt))
print("correct: %s\nWrong: %s\ntp: %s\nfp: %s\ntn: %s\nfn: %s\nAccuracy: %s\nprecision: %s\nrecall: %s" %(correct,wrong,tp,fp,tn,fn,accuracy,precision_dt,recall_dt))
print("F1 score:",f1_score_dt)
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

In [58]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(trainprepDF)
predictions_gbt = gbtModel.transform(testprepDF)

In [59]:
results=predictions_gbt.select(['prediction','label'])
count=predictions_gbt.count()

correct=results.filter(results.prediction==results.label).count()
wrong=results.filter(results.prediction!=results.label).count()
tp=results.filter(results.prediction==1.0).filter(results.prediction==results.label).count()
fp=results.filter(results.prediction==1.0).filter(results.prediction!=results.label).count()
tn=results.filter(results.prediction==0.0).filter(results.prediction==results.label).count()
fn=results.filter(results.prediction==0.0).filter(results.prediction!=results.label).count()

accuracy=(tp+tn)/count

precision_gbt=tp/(tp+fp) 
recall_gbt=tp/(tp+fn) 
f1_score_gbt=2*((precision_gbt*recall_gbt)/(precision_gbt+recall_gbt))
print("correct: %s\nWrong: %s\ntp: %s\nfp: %s\ntn: %s\nfn: %s\nAccuracy: %s\nprecision: %s\nrecall: %s" %(correct,wrong,tp,fp,tn,fn,accuracy,precision_gbt,recall_gbt))
print("F1 score:",f1_score_gbt)
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions_gbt, {evaluator.metricName: "areaUnderROC"})))

In [60]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvModel = cv.fit(trainprepDF)
predictions_cv = cvModel.transform(testprepDF)
evaluator.evaluate(predictions_cv)

In [61]:
results=predictions_cv.select(['prediction','label'])
count=predictions_cv.count()

correct=results.filter(results.prediction==results.label).count()
wrong=results.filter(results.prediction!=results.label).count()
tp=results.filter(results.prediction==1.0).filter(results.prediction==results.label).count()
fp=results.filter(results.prediction==1.0).filter(results.prediction!=results.label).count()
tn=results.filter(results.prediction==0.0).filter(results.prediction==results.label).count()
fn=results.filter(results.prediction==0.0).filter(results.prediction!=results.label).count()


accuracy=(tp+tn)/count

precision_gbtcv=tp/(tp+fp) 
recall_gbtcv=tp/(tp+fn) 
f1_score_gbtcv=2*((precision_gbtcv*recall_gbtcv)/(precision_gbtcv+recall_gbtcv))
print("correct: %s\nWrong: %s\ntp: %s\nfp: %s\ntn: %s\nfn: %s\nAccuracy: %s\nprecision: %s\nrecall: %s" %(correct,wrong,tp,fp,tn,fn,accuracy,precision_gbtcv,recall_gbtcv))
print("F1 score:",f1_score_gbtcv)
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions_cv, {evaluator.metricName: "areaUnderROC"})))




In [62]:
#Based on the above two models, the prediction of churn by the two models seem to be reasonalble, though not good.


#In Logistic regression precison is 61% recall is 57%
#In RandomForest Regressor precision is 63% and recall is 50%


#1)It all depends on our business objective whether to use precision or recall as our classification metrics
#2)what precision is basically what percentage of the churned user did our model predict it accuratly of the total user we identified as churn
#3)recal is Out of number of users really churned how many did we identify correctly
#4)To predict the Churn more accurately, we would need more features that have more influence on the churn and are critical in determining the churn rate.


In [63]:
print("Model","                   ","Precision","               ","Recall","         ","F1 score")
print("Logistic Regression","    ",precision_lr," ",recall_lr," ",f1_score_lr)
print("Random Forest","          ",precision_rf," ",recall_rf," ",f1_score_rf)
print("Decision_Tree","          ",precision_dt," ",recall_dt,"",f1_score_dt)
print("Gradient_BoostedTree","   ",precision_gbt,"  ",recall_gbt," ",f1_score_gbt)

print("Gradient_BoostedTree(cv)",precision_gbtcv," ",recall_gbtcv,"  ",f1_score_gbtcv)