In [1]:
import os

# Point the process at the local Java 8 and Spark 2.4.0 installations.
# This has to happen before findspark.init() runs.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64/",
    "SPARK_HOME": "/opt/spark-2.4.0",
})

import findspark

# pyspark is imported only after findspark.init() has been called.
findspark.init()

from pyspark.sql import SparkSession

# Session shared by every cell below.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Both wine-quality files are semicolon-separated and carry a header row;
# column types are inferred while reading.
red_df = spark.read.csv("winequality-red.csv", sep=";", header=True, inferSchema=True)
white_df = spark.read.csv("winequality-white.csv", sep=";", header=True, inferSchema=True)

In [3]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

In [4]:
# Correlation.corr works on a single vector column, so pack every column of
# the red-wine frame (the quality label included) into one vector first.
vector_col = "corr_features"
assembler = VectorAssembler(outputCol=vector_col, inputCols=red_df.columns)
df_vector = assembler.transform(red_df).select(vector_col)
matrix = Correlation.corr(df_vector, vector_col, method="pearson")

In [5]:
# The correlation result is read from the first collected row, under the
# column named "pearson(<vector column>)"; display its values as a flat array.
corr_row = matrix.collect()[0]
corr_row["pearson({})".format(vector_col)].values

array([ 1.        , -0.25613089,  0.67170343,  0.11477672,  0.09370519,
       -0.15379419, -0.11318144,  0.66804729, -0.68297819,  0.18300566,
       -0.06166827,  0.12405165, -0.25613089,  1.        , -0.55249568,
        0.00191788,  0.06129777, -0.01050383,  0.07647   ,  0.02202623,
        0.23493729, -0.26098669, -0.20228803, -0.39055778,  0.67170343,
       -0.55249568,  1.        ,  0.14357716,  0.20382291, -0.06097813,
        0.03553302,  0.36494718, -0.54190414,  0.31277004,  0.10990325,
        0.22637251,  0.11477672,  0.00191788,  0.14357716,  1.        ,
        0.05560954,  0.187049  ,  0.20302788,  0.35528337, -0.08565242,
        0.00552712,  0.04207544,  0.01373164,  0.09370519,  0.06129777,
        0.20382291,  0.05560954,  1.        ,  0.00556215,  0.04740047,
        0.20063233, -0.26502613,  0.37126048, -0.22114054, -0.12890656,
       -0.15379419, -0.01050383, -0.06097813,  0.187049  ,  0.00556215,
        1.        ,  0.66766645, -0.02194583,  0.0703775 ,  0.05

In [6]:
# Every column except the 'quality' label is a predictor. Selecting by name
# (rather than dropping the last column) keeps the label out of the features
# even if the column order of the input file changes.
train_cols = [name for name in red_df.columns if name != 'quality']
vectorAssembler = VectorAssembler(inputCols=train_cols, outputCol='features')
v_df_red = vectorAssembler.transform(red_df).select(['features', 'quality'])
# Fixed seed so the 80/20 split, and every metric computed from it, can be
# reproduced when the notebook is re-run.
(train_df_red, test_df_red) = v_df_red.randomSplit([0.8, 0.2], seed=42)

In [7]:
from pyspark.ml.regression import LinearRegression

# Fit a linear regression of quality on the assembled red-wine features.
lr = LinearRegression(labelCol='quality', featuresCol='features')
lr_model_red = lr.fit(train_df_red)

# The figures below come from the model's own fit summary, i.e. they are
# errors on the training split, not on held-out data.
trainingSummary_red = lr_model_red.summary
print("RMSE: %f" % trainingSummary_red.rootMeanSquaredError)
print("MSE: %f" % trainingSummary_red.meanSquaredError)
print("MAE: %f" % trainingSummary_red.meanAbsoluteError)

RMSE: 0.643467
MSE: 0.414050
MAE: 0.499519


In [8]:
# Score the held-out red-wine rows and preview a few predictions next to
# the true quality values.
lr_predictions_red = lr_model_red.transform(test_df_red)
lr_predictions_red.select(["prediction", "quality", "features"]).show(n=5)

+------------------+-------+--------------------+
|        prediction|quality|            features|
+------------------+-------+--------------------+
|  6.55316957774243|      7|[5.1,0.42,0.0,1.8...|
|6.3149538874919084|      7|[5.3,0.57,0.01,1....|
|   5.5071496346784|      6|[5.4,0.58,0.08,1....|
| 5.693596489689618|      6|[5.4,0.74,0.0,1.2...|
| 5.540644770812189|      6|[5.4,0.74,0.09,1....|
+------------------+-------+--------------------+
only showing top 5 rows



In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# The evaluator is configured for mean absolute error ("mae"), so the printed
# label must say MAE; it previously said MSE, which misreported the metric.
lr_evaluator_red = RegressionEvaluator(
    predictionCol="prediction", labelCol="quality", metricName="mae"
)

print("MAE on test data = %g" % lr_evaluator_red.evaluate(lr_predictions_red))

MSE on test data = 0.512117


In [10]:
# Next: the same analysis for white wine

In [11]:
# Correlation.corr works on a single vector column, so pack every column of
# the white-wine frame (the quality label included) into one vector first.
vector_col = "corr_features"
assembler = VectorAssembler(outputCol=vector_col, inputCols=white_df.columns)
df_vector = assembler.transform(white_df).select(vector_col)
matrix = Correlation.corr(df_vector, vector_col, method="pearson")

# Read the matrix from the first collected row and display its values as a
# flat array.
corr_row = matrix.collect()[0]
corr_row["pearson({})".format(vector_col)].values

array([ 1.00000000e+00, -2.26972901e-02,  2.89180698e-01,  8.90207014e-02,
        2.30856437e-02, -4.93958591e-02,  9.10697562e-02,  2.65331014e-01,
       -4.25858291e-01, -1.71429850e-02, -1.20881123e-01, -1.13662831e-01,
       -2.26972901e-02,  1.00000000e+00, -1.49471811e-01,  6.42860601e-02,
        7.05115715e-02, -9.70119393e-02,  8.92605036e-02,  2.71138455e-02,
       -3.19153683e-02, -3.57281469e-02,  6.77179428e-02, -1.94722969e-01,
        2.89180698e-01, -1.49471811e-01,  1.00000000e+00,  9.42116243e-02,
        1.14364448e-01,  9.40772210e-02,  1.21130798e-01,  1.49502571e-01,
       -1.63748211e-01,  6.23309403e-02, -7.57287301e-02, -9.20909088e-03,
        8.90207014e-02,  6.42860601e-02,  9.42116243e-02,  1.00000000e+00,
        8.86845359e-02,  2.99098354e-01,  4.01439311e-01,  8.38966455e-01,
       -1.94133454e-01, -2.66643659e-02, -4.50631222e-01, -9.75768289e-02,
        2.30856437e-02,  7.05115715e-02,  1.14364448e-01,  8.86845359e-02,
        1.00000000e+00,  

In [12]:
# Pearson correlation between alcohol and quality in the white-wine data.
white_df.corr("alcohol", "quality", method="pearson")

0.4355747154613733

In [14]:
# Every column except the 'quality' label is a predictor. Selecting by name
# (rather than dropping the last column) keeps the label out of the features
# even if the column order of the input file changes.
train_cols_w = [name for name in white_df.columns if name != 'quality']
vectorAssembler = VectorAssembler(inputCols=train_cols_w, outputCol='features')
v_df_white = vectorAssembler.transform(white_df).select(['features', 'quality'])
# Fixed seed so the 80/20 split, and every metric computed from it, can be
# reproduced when the notebook is re-run.
(train_df_white, test_df_white) = v_df_white.randomSplit([0.8, 0.2], seed=42)

# Fit a linear regression of quality on the assembled white-wine features.
lr_white = LinearRegression(featuresCol='features', labelCol='quality')
lr_model_white = lr_white.fit(train_df_white)

# The figures below come from the model's own fit summary, i.e. they are
# errors on the training split, not on held-out data.
trainingSummary_white = lr_model_white.summary
print("RMSE: %f" % trainingSummary_white.rootMeanSquaredError)
print("MSE: %f" % trainingSummary_white.meanSquaredError)
print("MAE: %f" % trainingSummary_white.meanAbsoluteError)

RMSE: 0.759129
MSE: 0.576277
MAE: 0.589302


In [None]:
# Evaluate the white-wine model on its held-out split. This cell previously
# referenced `lr_model` and `test_df`, which are not defined in this notebook;
# the white-wine objects are `lr_model_white` and `test_df_white`.
lr_predictions = lr_model_white.transform(test_df_white)
lr_predictions.select("prediction", "quality", "features").show(5)

# The evaluator is configured for mean absolute error ("mae"), so the printed
# label says MAE rather than MSE.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction", labelCol="quality", metricName="mae"
)

print("MAE on test data = %g" % lr_evaluator.evaluate(lr_predictions))