In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from spark_tree_plotting import plot_tree
from spark_tree_plotting import export_graphviz

In [3]:
spark = SparkSession.builder.appName("homeworktree").getOrCreate()

21/11/29 19:53:50 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.65.128 instead (on interface ens33)
21/11/29 19:53:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/11/29 19:53:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
data = spark.read.csv("fertility_Diagnosis.txt", inferSchema=True, header=True)
# data = spark.read.format("libsvm").load("audit_risk.csv")

In [5]:
data.show()

+-----+----+------------+------------------+-----------------------+------+--------------------------+----------+------------------+-----------+
|Epoca|Edad|Enfermedades|Accidente_o_trauma|Intervencion_quirurgica|Fiebre|Frecuencia_Consumo_Alcohol|Tabaquismo|num_Horas_Sentados|Diagnostico|
+-----+----+------------+------------------+-----------------------+------+--------------------------+----------+------------------+-----------+
|-0.33|0.69|           0|                 1|                      1|     0|                       0.8|         0|              0.88|          N|
|-0.33|0.94|           1|                 0|                      1|     0|                       0.8|         1|              0.31|          O|
|-0.33| 0.5|           1|                 0|                      0|     0|                       1.0|        -1|               0.5|          N|
|-0.33|0.75|           0|                 1|                      1|     0|                       1.0|        -1|              0.3

In [6]:
data.columns

['Epoca',
 'Edad',
 'Enfermedades',
 'Accidente_o_trauma',
 'Intervencion_quirurgica',
 'Fiebre',
 'Frecuencia_Consumo_Alcohol',
 'Tabaquismo',
 'num_Horas_Sentados',
 'Diagnostico']

### Vectorizamos para crear features y indexamos Diagnostico

In [7]:
Diagnistic_Indexed = StringIndexer(inputCol='Diagnostico',outputCol='Diagnostico_Indexed')

In [8]:
assembler = VectorAssembler(inputCols = [ "Edad",
                                          "Enfermedades",
                                          "Accidente_o_trauma",
                                          "Intervencion_quirurgica",
                                          "Frecuencia_Consumo_Alcohol",
                                          "Tabaquismo",
                                          "num_Horas_Sentados" 
],outputCol="features")

In [9]:
feature_Indexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures", maxCategories=4 )

In [10]:
training_data, test_data = data.randomSplit([0.7,0.3])

In [11]:
dt = DecisionTreeClassifier(labelCol="Diagnostico_Indexed",featuresCol="indexedFeatures")

In [12]:
pipeline = Pipeline(stages=[Diagnistic_Indexed,assembler,feature_Indexer,dt])

In [13]:
fit_model = pipeline.fit(training_data)

In [14]:
result = fit_model.transform(test_data)

In [15]:
result.select("prediction","Diagnostico_Indexed","features").show(5)

+----------+-------------------+--------------------+
|prediction|Diagnostico_Indexed|            features|
+----------+-------------------+--------------------+
|       0.0|                0.0|[0.5,1.0,0.0,0.0,...|
|       0.0|                0.0|[0.53,1.0,1.0,0.0...|
|       0.0|                0.0|[0.53,1.0,1.0,1.0...|
|       0.0|                0.0|[0.58,1.0,0.0,1.0...|
|       0.0|                0.0|[0.61,1.0,1.0,1.0...|
+----------+-------------------+--------------------+
only showing top 5 rows



In [16]:
evaluator = MulticlassClassificationEvaluator(
            labelCol = "Diagnostico_Indexed",
            predictionCol="prediction",
            metricName="accuracy"
)

In [17]:
accuracy = evaluator.evaluate(result)

In [18]:
print ("Test Error = %g " %(1.0-accuracy)) 

Test Error = 0.26087 


In [19]:
treeModel = fit_model.stages[3]

In [31]:
print(treeModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_6ac3f193ebfe, depth=5, numNodes=17, numClasses=2, numFeatures=7
  If (feature 0 <= 0.655)
   Predict: 0.0
  Else (feature 0 > 0.655)
   If (feature 0 <= 0.6799999999999999)
    If (feature 6 <= 0.28)
     Predict: 0.0
    Else (feature 6 > 0.28)
     If (feature 1 in {0.0})
      If (feature 3 in {0.0})
       Predict: 0.0
      Else (feature 3 not in {0.0})
       Predict: 1.0
     Else (feature 1 not in {0.0})
      Predict: 1.0
   Else (feature 0 > 0.6799999999999999)
    If (feature 6 <= 0.095)
     Predict: 0.0
    Else (feature 6 > 0.095)
     If (feature 6 <= 0.41000000000000003)
      Predict: 0.0
     Else (feature 6 > 0.41000000000000003)
      If (feature 6 <= 0.45499999999999996)
       Predict: 1.0
      Else (feature 6 > 0.45499999999999996)
       Predict: 0.0



In [None]:
fit_model.

### GRAFICA DEL MODELO

In [21]:
from PIL import Image
import io

In [32]:
Diagnistic_Indexed_model = Diagnistic_Indexed.fit(data)

In [37]:
list_name = ["Edad",
             "Enfermedades",
             "Accidente_o_trauma",
             "Intervencion_quirurgica",
              "Frecuencia_Consumo_Alcohol",
             "Tabaquismo",
             "num_Horas_Sentados"
            ]
tree_plot = plot_tree(treeModel,
                      featureNames= list_name,
                    
                      classNames=Diagnistic_Indexed_model.labels,
                      filled=True,
                      roundedCorners=True,
                      roundLeaves=True)

from IPython.display import Image

Image(tree_plot)

TypeError: 'JavaPackage' object is not callable

In [39]:
json_tree = data._jvm.com.vfive.spark.ml.SparkMLTree(treeModel._java_obj)

AttributeError: 'DataFrame' object has no attribute '_jvm'

In [22]:
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz

In [47]:
list_name = ["Edad",
             "Enfermedades",
             "Accidente_o_trauma",
             "Intervencion_quirurgica",
              "Frecuencia_Consumo_Alcohol",
             "Tabaquismo",
             "num_Horas_Sentados"
            ]
Target = ['1','2','3']
dot_data = export_graphviz(dt, out_file= "tree1.dot", class_names= Target,
                          feature_names= list_name,impurity=False, filled= True)


NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
~$ spark-shell/pyspark/spark-submit --packages julioasotodv:spark-tree-plotting:0.2


In [48]:
treeModel.numNodes

11

In [40]:
#newdata = data.selectExpr("-0.33 as epoca","0.69 as edad"
#data = data.select(col("-0.33").alias("epoca"),col("02").alias("e"),col("13").alias("d"),col("14").alias("a"),col("05").alias("b"),col("0.8").alias("c"),col("07").alias("e"),col("0.88").alias("f"),col("n").alias("g"))
#data.show(5)
from functools import reduce

oldColumns = data.schema.names
newColumns = ["epoca", "edad","enfermedades","accidente_o_trauma","intervencion_quirurgica","fiebre","frecuencia","tabaquismo","num_Horas_sentados","diagnostico"]

df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), data)
df.printSchema()
df.show()

root
 |-- epoca: double (nullable = true)
 |-- edad: double (nullable = true)
 |-- enfermedades: integer (nullable = true)
 |-- accidente_o_trauma: integer (nullable = true)
 |-- intervencion_quirurgica: integer (nullable = true)
 |-- fiebre: integer (nullable = true)
 |-- frecuencia: double (nullable = true)
 |-- tabaquismo: integer (nullable = true)
 |-- num_Horas_sentados: double (nullable = true)
 |-- diagnostico: string (nullable = true)

+-----+----+------------+------------------+-----------------------+------+----------+----------+------------------+-----------+
|epoca|edad|enfermedades|accidente_o_trauma|intervencion_quirurgica|fiebre|frecuencia|tabaquismo|num_Horas_sentados|diagnostico|
+-----+----+------------+------------------+-----------------------+------+----------+----------+------------------+-----------+
|-0.33|0.94|           1|                 0|                      1|     0|       0.8|         1|              0.31|          O|
|-0.33| 0.5|           1|           

21/11/24 20:59:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: -0.33, 0.69, 0, 1, 1, 0, 0.8, 0, 0.88, N
 Schema: -0.33, 0.69, 02, 13, 14, 05, 0.8, 07, 0.88, N
Expected: 02 but found: 0
CSV file: file:///home/angel/Documents/notebooks/fertility_Diagnosis.txt
