In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark = SparkSession.builder.master("local").\
    appName("CNPJ").\
        config("spark.executer.memory","1gb").\
            getOrCreate()

In [3]:
churn = spark.read.csv("D:\downloads\Churn.csv", inferSchema=True, header=True, sep=";")

In [5]:
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+-------

In [7]:
formula = RFormula(formula="Exited ~ .",featuresCol="features", labelCol="label",handleInvalid="skip")

In [8]:
churn_trans = formula.fit(churn).transform(churn).select("features","label")

In [9]:
churn_trans.show(truncate=False)

+----------------------------------------------------------------+-----+
|features                                                        |label|
+----------------------------------------------------------------+-----+
|[619.0,1.0,0.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,1.0134888E7]        |1.0  |
|[608.0,0.0,0.0,0.0,41.0,1.0,8380786.0,1.0,0.0,1.0,1.1254258E7]  |0.0  |
|[502.0,1.0,0.0,0.0,42.0,8.0,1596608.0,3.0,1.0,0.0,1.1393157E7]  |1.0  |
|(11,[0,1,4,5,7,10],[699.0,1.0,39.0,1.0,2.0,9382663.0])          |0.0  |
|[850.0,0.0,0.0,0.0,43.0,2.0,1.2551082E7,1.0,1.0,1.0,790841.0]   |0.0  |
|[645.0,0.0,0.0,1.0,44.0,8.0,1.1375578E7,2.0,1.0,0.0,1.4975671E7]|1.0  |
|[822.0,1.0,0.0,1.0,50.0,7.0,0.0,2.0,1.0,1.0,100628.0]           |0.0  |
|[376.0,0.0,1.0,0.0,29.0,4.0,1.1504674E7,4.0,1.0,0.0,1.1934688E7]|1.0  |
|[501.0,1.0,0.0,1.0,44.0,4.0,1.4205107E7,2.0,0.0,1.0,749405.0]   |0.0  |
|[684.0,1.0,0.0,1.0,27.0,2.0,1.3460388E7,1.0,1.0,1.0,7172573.0]  |0.0  |
|[528.0,1.0,0.0,1.0,31.0,6.0,1.0201672E7,2.0,0.0,0.

In [10]:
churnTreino, churnTeste = churn_trans.randomSplit([0.7,0.3])

In [11]:
churnTreino.count()

7014

In [12]:
churnTeste.count()

2986

In [13]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

In [14]:
modelo = dt.fit(churnTreino)

In [15]:
previsao = modelo.transform(churnTeste)

In [17]:
previsao.show(truncate=False)

+---------------------------------------------------------+-----+--------------+----------------------------------------+----------+
|features                                                 |label|rawPrediction |probability                             |prediction|
+---------------------------------------------------------+-----+--------------+----------------------------------------+----------+
|(11,[0,1,3,4,7,10],[668.0,1.0,1.0,46.0,2.0,2938802.0])   |0.0  |[136.0,33.0]  |[0.8047337278106509,0.1952662721893491] |0.0       |
|(11,[0,1,3,4,7,10],[748.0,1.0,1.0,40.0,1.0,6041676.0])   |0.0  |[4545.0,535.0]|[0.8946850393700787,0.10531496062992125]|0.0       |
|(11,[0,1,3,4,7,10],[794.0,1.0,1.0,33.0,2.0,1.7812271E7]) |0.0  |[4545.0,535.0]|[0.8946850393700787,0.10531496062992125]|0.0       |
|(11,[0,1,4,5,7,10],[449.0,1.0,21.0,7.0,2.0,1.7574392E7]) |0.0  |[4545.0,535.0]|[0.8946850393700787,0.10531496062992125]|0.0       |
|(11,[0,1,4,5,7,10],[474.0,1.0,30.0,9.0,2.0,6315822.0])   |0.0  |[454

In [20]:
#avaliando a preformance
avaliar = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label", metricName="areaUnderROC")

In [21]:
areaUnderRoc = avaliar.evaluate(previsao)

In [22]:
areaUnderRoc

0.6875091476841871