In [0]:
from pyspark.sql import SparkSession

In [0]:
# Session handle used by every cell below; the application is named 'wine_quality'.
spark = (
    SparkSession.builder
    .appName('wine_quality')
    .getOrCreate()
)

In [0]:
# Both files are read with the same options: ';' as the separator, first row
# as the header, and column types inferred from the data.
csv_options = dict(inferSchema='True', header='true', sep=';')
data_white = spark.read.csv('/FileStore/tables/winequality_white.csv', **csv_options)
data_red = spark.read.csv('/FileStore/tables/winequality_red.csv', **csv_options)

In [0]:
# Peek at the first white-wine row (returned as a one-element list of Row).
data_white.take(1)

In [0]:
# Peek at the first red-wine row (returned as a one-element list of Row).
data_red.take(1)

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Column names of the white-wine frame (taken from the CSV header row).
data_white.columns

In [0]:
# Column names of the red-wine frame (taken from the CSV header row).
data_red.columns

In [0]:
# Input columns that are packed into a single vector column named 'features'.
# 'quality' is deliberately absent: it is used as the label further down.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [0]:
# Add the assembled 'features' vector column to the white-wine frame.
output_white = assembler.transform(data_white)

In [0]:
# Add the assembled 'features' vector column to the red-wine frame.
output_red = assembler.transform(data_red)

In [0]:
# Inspect the white-wine schema after assembly; it should now list 'features'.
output_white.printSchema()

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
# Unfitted 300-tree random forest for white wine: label is 'quality',
# inputs come from the assembled 'features' column.
rfc_white = RandomForestClassifier(
    labelCol='quality',
    featuresCol='features',
    numTrees=300,
)

In [0]:
# Unfitted 300-tree random forest for red wine: label is 'quality',
# inputs come from the assembled 'features' column.
rfc_red = RandomForestClassifier(
    labelCol='quality',
    featuresCol='features',
    numTrees=300,
)

In [0]:
# Keep only the two columns the classifier reads: the feature vector and the label.
final_data_white = output_white.select('features','quality')

In [0]:
# Keep only the two columns the classifier reads: the feature vector and the label.
final_data_red = output_red.select('features','quality')

In [0]:
# Schema of the reduced (features, quality) white-wine frame.
final_data_white.printSchema()

In [0]:
# Preview the white-wine modelling frame.
final_data_white.show()

In [0]:
# Preview the red-wine modelling frame.
final_data_red.show()

In [0]:
# Split each dataset roughly 70/30 into training and held-out test rows.
# A fixed seed is passed so the same rows land in each split on every re-run;
# without it the reported accuracies change from run to run and cannot be
# compared or reproduced.
SPLIT_SEED = 42
train_data_white, test_data_white = final_data_white.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train_data_red, test_data_red = final_data_red.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [0]:
# Fit the white-wine forest on the training split only; the test split stays unseen.
rfc_model_white = rfc_white.fit(train_data_white)

In [0]:
# Fit the red-wine forest on the training split only. Fitting on the full
# final_data_red frame would let the model see the rows in test_data_red,
# which it is scored on afterwards, and so inflate the reported accuracy.
rfc_model_red = rfc_red.fit(train_data_red)

In [0]:
# Feature importances of the fitted white-wine forest; vector positions follow
# the order of the assembler's inputCols.
rfc_model_white.featureImportances

In [0]:
# Feature importances of the fitted red-wine forest; vector positions follow
# the order of the assembler's inputCols.
rfc_model_red.featureImportances

In [0]:
# Score each held-out test split with its fitted model; the resulting frames
# carry the model's prediction columns alongside 'features' and 'quality'.
rfc_preds_white = rfc_model_white.transform(test_data_white)
rfc_preds_red = rfc_model_red.transform(test_data_red)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluators that read 'quality' as the label column.
# NOTE(review): this notebook later filters on quality == 8, so 'quality' is
# evidently not a 0/1 label. BinaryClassificationEvaluator is meant for binary
# labels, so the score it yields here is probably not meaningful — confirm, and
# rely on the multiclass evaluator for reported results.
my_binary_eval_white = BinaryClassificationEvaluator(labelCol='quality')
my_binary_eval_red = BinaryClassificationEvaluator(labelCol='quality')

In [0]:
# Report the binary evaluator's score for the white-wine test predictions.
# The heading is printed first, then the score is computed and printed.
print('Random Forest Clasifier Results - White Wine:')
white_binary_score = my_binary_eval_white.evaluate(rfc_preds_white)
print(white_binary_score)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Accuracy evaluators comparing the predicted class against the 'quality' label.
my_multiclass_eval_white = MulticlassClassificationEvaluator(
    labelCol='quality',
    metricName='accuracy',
)
my_multiclass_eval_red = MulticlassClassificationEvaluator(
    labelCol='quality',
    metricName='accuracy',
)

In [0]:
# Accuracy of each forest on its own test-split predictions.
rfc_acc_white = my_multiclass_eval_white.evaluate(rfc_preds_white)
rfc_acc_red = my_multiclass_eval_red.evaluate(rfc_preds_red)

In [0]:
# Display the white-wine accuracy.
rfc_acc_white

In [0]:
# Display the red-wine accuracy.
rfc_acc_red

In [0]:
# Preview the white-wine predictions next to the true 'quality' labels.
rfc_preds_white.show()

In [0]:
# Show only the white-wine test rows whose true quality is 8, to see what the
# model predicts for them.
quality_is_8 = rfc_preds_white['quality'] == 8
rfc_preds_white.where(quality_is_8).show()

Predictions using Random Forest are quite good, and similar to those obtained with Linear Regression.