In [1]:
# Read the whole churn_modelling table and remove the RowNumber, CustomerId and
# Surname columns in one expression, so `df` only ever refers to the trimmed frame.
# NOTE(review): `sqlContext` is not defined in this notebook; presumably it is
# injected by the hosting Spark environment — confirm.
df = (
    sqlContext
    .sql("select * from churn_modelling")
    .drop('RowNumber', 'CustomerId', 'Surname')
)

# Show the remaining columns and their types.
df.printSchema()

In [2]:
from pyspark.ml.feature import VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [3]:
# Categorical preprocessing for the two string columns.
# Step 1: map each string column to a numeric index column.
geography_indexer = StringIndexer(
    inputCol='Geography',
    outputCol='GeographyInd',
)
gender_indexer = StringIndexer(
    inputCol='Gender',
    outputCol='GenderInd',
)

# Step 2: one-hot encode the index columns produced above.
geography_encoder = OneHotEncoder(
    inputCol='GeographyInd',
    outputCol='GeographyEnc',
)
gender_encoder = OneHotEncoder(
    inputCol='GenderInd',
    outputCol='GenderEnc',
)

In [4]:
# Input columns, in the order they are packed into the 'features' vector.
# 'GeographyEnc' and 'GenderEnc' are the one-hot encoder outputs; the rest are
# read directly from the DataFrame.
feature_columns = [
    'CreditScore',
    'GeographyEnc',
    'GenderEnc',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [5]:
# Sweep the forest size (20, 40, ..., 100 trees) and print test-set AUC and
# accuracy for each setting.

# Draw the train/test split ONCE, with a fixed seed, so every numTrees value is
# trained and scored on the same rows. Re-splitting inside the loop would mix
# split-to-split variance into the comparison and make reruns irreproducible.
train_data, test_data = df.randomSplit([0.7, .3], seed=42)

# The evaluators do not depend on the loop variable, so build them once.
# AUC must be computed from the classifier's scores ('rawPrediction'), not from
# the thresholded 0/1 'prediction' column: the latter reduces the ROC curve to a
# single operating point and misreports the ranking quality of the model.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Exited')
my_acc = MulticlassClassificationEvaluator(labelCol="Exited", predictionCol="prediction", metricName="accuracy")

for i in range(20, 120, 20):
  # Explicit seed so reruns build the same forests for a given numTrees.
  churn_model = RandomForestClassifier(featuresCol='features', labelCol='Exited', numTrees=i, seed=42)
  pipeline = Pipeline(stages=[geography_indexer, geography_encoder, gender_indexer, gender_encoder, assembler, churn_model])

  # Fit on the training rows only, then score the held-out rows.
  fit_model = pipeline.fit(train_data)
  results = fit_model.transform(test_data)

  AUC = my_eval.evaluate(results)
  Accuracy = my_acc.evaluate(results)

  # Output layout: numTrees, AUC, accuracy, blank separator.
  print(i)
  print(AUC)
  print(Accuracy)
  print('\n')

In [6]:
# Final classifier: a 60-tree random forest predicting 'Exited' from 'features'.
# The seed is pinned explicitly so that re-running the notebook trains the same
# forest on the same input rows.
churn_model = RandomForestClassifier(
    featuresCol='features',
    labelCol='Exited',
    numTrees=60,
    seed=42,
)

In [7]:
# Full pipeline: categorical preprocessing, feature assembly, then the classifier.
# Order matters: each encoder reads the column written by its indexer, and the
# assembler reads both encoder outputs.
pipeline = Pipeline(
    stages=[
        geography_indexer,
        geography_encoder,
        gender_indexer,
        gender_encoder,
        assembler,
        churn_model,
    ]
)

In [8]:
# 70/30 train/test split. Without a seed, randomSplit draws a new random
# partition on every execution, so the metrics reported below would change from
# run to run; pinning it keeps the notebook reproducible.
train_data, test_data = df.randomSplit([0.7, .3], seed=42)

In [9]:
# Fit every pipeline stage on the training rows only.
fit_model = pipeline.fit(dataset=train_data)

# Apply the fitted pipeline to the held-out rows to obtain the scored DataFrame.
results = fit_model.transform(dataset=test_data)

In [10]:
# Area under the ROC curve on the test set.
# The evaluator must read the classifier's scores ('rawPrediction'). Pointing it
# at the thresholded 0/1 'prediction' column collapses the ROC curve to a single
# operating point, so the number it returns is not the model's true AUC.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Exited',
)
AUC = my_eval.evaluate(results)
AUC

In [11]:
# Accuracy on the test set: compares the 'prediction' column against the
# 'Exited' label column.
my_acc = MulticlassClassificationEvaluator(
    labelCol="Exited",
    predictionCol="prediction",
    metricName="accuracy",
)
Accuracy = my_acc.evaluate(results)
Accuracy