In [0]:
from pyspark.sql import SparkSession

In [0]:
# Get (or create) the Spark session used throughout this notebook,
# registered under the application name 'wine_quality'.
spark = (
    SparkSession.builder
    .appName('wine_quality')
    .getOrCreate()
)

In [0]:
# Load the white- and red-wine CSVs with identical reader settings:
# ';' as the field separator, the first line used as the header, and
# column types inferred by Spark. The flags are passed as real booleans
# rather than as mixed-case strings ('True' / 'true') so both options are
# spelled the same way and cannot be mistyped into a silently-false value.
data_white = spark.read.csv(
    '/FileStore/tables/winequality_white.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
data_red = spark.read.csv(
    '/FileStore/tables/winequality_red.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first row of the white-wine DataFrame.
data_white.head(1)

In [0]:
# Preview the first row of the red-wine DataFrame.
data_red.head(1)

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names of the white-wine DataFrame.
data_white.columns

In [0]:
# List the column names of the red-wine DataFrame.
data_red.columns

In [0]:
# Input columns that the assembler combines into the single vector
# column 'features'. The same assembler is applied to both datasets.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)

In [0]:
# Run the assembler over the white-wine rows, adding the 'features' column.
output_white = assembler.transform(dataset=data_white)

In [0]:
# Run the assembler over the red-wine rows, adding the 'features' column.
output_red = assembler.transform(dataset=data_red)

In [0]:
# Print the schema of the assembled white-wine DataFrame; it should now
# list the 'features' column produced by the assembler.
output_white.printSchema()

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier

In [0]:
# Decision-tree estimator for the white-wine data: reads the 'features'
# vector column and uses 'quality' as the label.
dtc_white = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='quality',
)

In [0]:
# Decision-tree estimator for the red-wine data: reads the 'features'
# vector column and uses 'quality' as the label.
dtc_red = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='quality',
)

In [0]:
# Keep only the two columns the classifiers consume: the assembled
# feature vector and the 'quality' label.
final_data_white = output_white.select(['features', 'quality'])
final_data_red = output_red.select(['features', 'quality'])

In [0]:
# 70/30 train/test split of the white-wine data. The seed is fixed so that
# re-running the notebook draws the same split instead of a new random one,
# which keeps the reported score comparable between runs.
train_data_white, test_data_white = final_data_white.randomSplit([0.7, 0.3], seed=42)

In [0]:
# 70/30 train/test split of the red-wine data. The seed is fixed so that
# re-running the notebook draws the same split instead of a new random one,
# which keeps the reported score comparable between runs.
train_data_red, test_data_red = final_data_red.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Fit the white-wine model with the white-wine estimator. The original cell
# used dtc_red here, which left dtc_white unused and would silently tie the
# white model to any later change made to the red estimator's settings.
dtc_model_white = dtc_white.fit(train_data_white)

In [0]:
# Fit the red-wine decision tree on the red-wine training split.
dtc_model_red = dtc_red.fit(dataset=train_data_red)

In [0]:
# Score the white-wine test split with the model trained on white-wine data.
# The original cell used dtc_model_red, so the "white wine" result was really
# measuring the red-wine model on white-wine samples.
dtc_preds_white = dtc_model_white.transform(test_data_white)

In [0]:
# Score the red-wine test split with the red-wine model.
dtc_preds_red = dtc_model_red.transform(dataset=test_data_red)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# The label 'quality' is a score column, not a 0/1 flag (presumably an
# integer rating with several distinct values -- confirm against the data),
# so a binary evaluator does not produce a meaningful number for it.
# Use the multiclass evaluator and report plain accuracy instead.
# The variable names are kept as-is because the result cells below call them.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

my_binary_eval_white = MulticlassClassificationEvaluator(
    labelCol='quality',
    predictionCol='prediction',
    metricName='accuracy',
)
my_binary_eval_red = MulticlassClassificationEvaluator(
    labelCol='quality',
    predictionCol='prediction',
    metricName='accuracy',
)

In [0]:
# Report the evaluator's score for the white-wine test predictions.
# (Header spelling corrected: "Clasifier" -> "Classifier".)
print('Decision Tree Classifier Results - White Wine:')
print(my_binary_eval_white.evaluate(dtc_preds_white))

In [0]:
# Report the evaluator's score for the red-wine test predictions.
# (Header spelling corrected: "Clasifier" -> "Classifier".)
print('Decision Tree Classifier Results - Red Wine:')
print(my_binary_eval_red.evaluate(dtc_preds_red))