This notebook is meant as an example of how to handle PySpark-MySQL communication.

In [13]:
import os

from pyspark.sql import SparkSession

In [4]:
# Build (or reuse) a local Spark session with the MySQL Connector/J jar on
# the classpath so the JDBC data source can talk to MySQL.
spark = (
    SparkSession.builder
    .config("spark.jars", "/usr/share/java/mysql-connector-java-8.0.22.jar")
    .master("local")
    .appName("PySpark_MySQL_test")
    .getOrCreate()
)

In [26]:
# Load the `Wines` table from the local MySQL database `TestDB` over JDBC.
#
# Driver class: Connector/J 8.x renamed the driver to `com.mysql.cj.jdbc.Driver`;
# the old `com.mysql.jdbc.Driver` name is deprecated and only logs a warning
# before delegating to the new class.
#
# Credentials: taken from the MYSQL_USER / MYSQL_PASSWORD environment variables
# when set, falling back to the original demo values so the notebook still
# runs unchanged on the author's machine. Do not commit real credentials here.
wine_df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/TestDB")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("dbtable", "Wines")
    .option("user", os.environ.get("MYSQL_USER", "greg"))
    .option("password", os.environ.get("MYSQL_PASSWORD", "greg"))
    .load()
)

In [6]:
# 80/20 train/test split; the seed is fixed so the split can be repeated.
train_df, test_df = wine_df.randomSplit(weights=[0.8, 0.2], seed=12345)

In [14]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Columns that are packed into the model's input vector.
predictors = [
    "fixed_acidity",
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
    "chlorides",
    "free_so2",
    "total_so2",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Combine the predictor columns into a single vector column named "features".
vec_assembler = VectorAssembler(outputCol="features", inputCols=predictors)
vec_train_df = vec_assembler.transform(train_df)

# Preview the assembled vectors next to the is_red column.
vec_train_df.select(["features", "is_red"]).show(n=5)

+--------------------+------+
|            features|is_red|
+--------------------+------+
|[3.8,0.31,0.02,11...|     0|
|[3.9,0.225,0.4,4....|     0|
|[4.2,0.17,0.36,1....|     0|
|[4.2,0.215,0.23,5...|     0|
|[4.4,0.32,0.39,4....|     0|
+--------------------+------+
only showing top 5 rows



In [15]:
# Logistic regression predicting the is_red column from the assembled
# "features" vector; fitted on the vectorised training split.
lr = LogisticRegression(featuresCol="features", labelCol="is_red")
lr_model = lr.fit(dataset=vec_train_df)

In [16]:
# Apply the same feature assembly to the held-out split, then score it with
# the fitted logistic-regression model.
vec_test_df = vec_assembler.transform(dataset=test_df)
predictions = lr_model.transform(dataset=vec_test_df)

In [19]:
# Original author's note: do this once RF is working
from pyspark.ml import Pipeline

# Same two steps as the manual cells (feature assembly, then logistic
# regression), chained so a single fit/transform call runs both on the raw
# train/test DataFrames.
pipeline = Pipeline(stages=[vec_assembler, lr])
pipeline_model = pipeline.fit(dataset=train_df)
pipeline_predictions = pipeline_model.transform(dataset=test_df)

In [None]:
# `pandas_pred` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. Materialise the test-set predictions as a
# pandas DataFrame first.
# NOTE(review): assumes `pandas_pred` was meant to be the pandas copy of
# `predictions` -- confirm. toPandas() collects every row to the driver, so
# keep this to small result sets.
pandas_pred = predictions.toPandas()
len(pandas_pred)

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Score the manually assembled predictions against the is_red label.
# areaUnderROC is the evaluator's default metric; it is spelled out here so
# the number shown below is self-explanatory.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="is_red")
evaluator.evaluate(dataset=predictions)

0.9917049465379387

In [20]:
# Same evaluator applied to the Pipeline's predictions, for comparison with
# the score from the manually assembled run.
evaluator.evaluate(dataset=pipeline_predictions)

0.9917049465379387

In [None]:
# TODO: decide on next steps -- "Learning Spark", chapter 5, has some information on what to do.