## Linear Regression

선형 방정식을 푸는 방법은 크게 두 가지가 있다.
1. optimal한 값을 찾기 위해선 solve를 이용한다. (단, 해가 존재할 때)
   * SVD, 그람-슈미트 직교화, QR 분해 등을 이용한다.
2. regression을 활용한다. (정확한 해가 존재하지 않을 때)


아래에선 머신러닝 방법인 선형 회귀를 이용하여 선형 방정식의 근사해를 구한다.

선형 회귀를 푸는 방법은 크게 두 가지가 있다.
1. Ordinary Least Squares (Optimal)
2. Gradient Descent

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import col

# Create the Spark session named "chapter9" (or pick up one that already
# exists); every later cell reads and computes through this handle.
spark = (
    SparkSession.builder
    .appName("chapter9")
    .getOrCreate()
)

In [11]:
# Load the advertising CSV with Spark's built-in CSV reader: the first row is
# the header and column types are inferred. The options are passed once (the
# earlier version repeated them in both .options() and .load()). Only the
# TV, Radio, Newspaper and Sales columns are kept.
df = (
    spark.read
    .csv("Advertising.csv", header=True, inferSchema=True)
    .select("TV", "Radio", "Newspaper", "Sales")
)

In [6]:
# Preview the first five rows, then print the column names and types.
df.show(n=5, truncate=True)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [7]:
# Per-column summary statistics (count, mean, stddev, min, max).
df.describe().show()

25/09/21 10:42:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|               0.0|               0.3|               1.6|
|    max|            296.4|              49.6|             114.0|              27.0|
+-------+-----------------+------------------+------------------+------------------+



In [12]:
def get_dummy_supervised(df, indexCol, categoricalCols, continuousCols, labelCol):
    """
    Index and one-hot encode the categorical columns, assemble them together
    with the continuous columns into one ``features`` vector, and expose the
    target column as ``label`` for supervised learning.

    :param df: the input dataframe
    :param indexCol: name of the identifier column carried through to the output
    :param categoricalCols: names of the categorical columns to index and encode
    :param continuousCols: list of numerical column names appended after the
        encoded columns (it is concatenated to a list, so it must be a list)
    :param labelCol: name of the column copied into ``label``
    :return: dataframe with the columns ``indexCol``, ``features`` and ``label``
    """
    indexers = []
    encoders = []
    for name in categoricalCols:
        indexed_name = f"{name}_indexed"
        indexers.append(StringIndexer(inputCol=name, outputCol=indexed_name))
        # OneHotEncoder keeps its default setting: dropLast=True
        encoders.append(
            OneHotEncoder(inputCol=indexed_name, outputCol=f"{indexed_name}_encoded")
        )

    # Encoded categorical columns come first, continuous columns after them.
    feature_inputs = [enc.getOutputCol() for enc in encoders] + continuousCols
    assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")

    stages = indexers + encoders + [assembler]
    fitted = Pipeline(stages=stages).fit(df)

    labelled = fitted.transform(df).withColumn("label", col(labelCol))
    return labelled.select(indexCol, "features", "label")


In [13]:
def get_dummy_unsupervised(df, indexCol, categoricalCols, continuousCols):
    """
    Index and one-hot encode the categorical columns and assemble them together
    with the continuous columns into one ``features`` vector (no label column),
    for unsupervised learning.

    :param df: the input dataframe
    :param indexCol: name of the identifier column carried through to the output
    :param categoricalCols: names of the categorical columns to index and encode
    :param continuousCols: list of numerical column names appended after the
        encoded columns (it is concatenated to a list, so it must be a list)
    :return: dataframe with the columns ``indexCol`` and ``features``

    :author: Wenqiang Feng
    :email:  von198@gmail.com
    """
    indexers = []
    encoders = []
    for name in categoricalCols:
        indexed_name = f"{name}_indexed"
        indexers.append(StringIndexer(inputCol=name, outputCol=indexed_name))
        # OneHotEncoder keeps its default setting: dropLast=True
        encoders.append(
            OneHotEncoder(inputCol=indexed_name, outputCol=f"{indexed_name}_encoded")
        )

    # Encoded categorical columns come first, continuous columns after them.
    feature_inputs = [enc.getOutputCol() for enc in encoders] + continuousCols
    assembler = VectorAssembler(inputCols=feature_inputs, outputCol="features")

    stages = indexers + encoders + [assembler]
    fitted = Pipeline(stages=stages).fit(df)

    assembled = fitted.transform(df)
    return assembled.select(indexCol, "features")


In [22]:
from pyspark.ml.linalg import Vectors


def transform_data(data):
    """
    Reshape a dataframe into the two-column (features, label) layout.

    Every column except the last is packed into one dense ``features`` vector;
    the last column becomes ``label``.

    :param data: dataframe whose last column is the target
    :return: dataframe with the columns ``features`` and ``label``
    """

    def _to_features_and_label(row):
        return [Vectors.dense(row[:-1]), row[-1]]

    rows = data.rdd.map(_to_features_and_label)
    return rows.toDF(["features", "label"])


# Pack TV/Radio/Newspaper into a feature vector and use Sales (the last column) as the label.
data = transform_data(df)


In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer

# Fit a VectorIndexer on the whole dataset. Per the VectorIndexer API, features
# with at most `maxCategories` (4) distinct values are treated as categorical
# and re-indexed; the result is written to "indexedFeatures".
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4
).fit(data)

# `data` itself is left untransformed (the line below is disabled); the fitted
# indexer is used as the first stage of the pipelines built further down.
# data = featureIndexer.transform(data)

In [23]:
# Preview the first five (features, label) rows.
data.show(n=5, truncate=True)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



In [24]:
# Randomly split the rows with weights 0.6 (training) / 0.4 (testing),
# using a fixed seed.
trainingData, testData = data.randomSplit(weights=[0.6, 0.4], seed=12345)


In [26]:
# Preview the first five rows of each split.
trainingData.show(n=5, truncate=True)
testData.show(n=5, truncate=True)


+---------------+-----+
|       features|label|
+---------------+-----+
| [0.7,39.6,8.7]|  1.6|
| [4.1,11.6,5.7]|  3.2|
| [5.4,29.9,9.4]|  5.3|
|[7.3,28.1,41.4]|  5.5|
|[7.8,38.9,50.6]|  6.6|
+---------------+-----+
only showing top 5 rows

+----------------+-----+
|        features|label|
+----------------+-----+
|  [8.4,27.2,2.1]|  5.7|
|   [8.6,2.1,1.0]|  4.8|
|[17.2,45.9,69.3]|  9.3|
|[18.7,12.1,23.4]|  6.7|
|[19.4,16.0,22.3]|  6.6|
+----------------+-----+
only showing top 5 rows



In [37]:
from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression

from pyspark.ml.evaluation import RegressionEvaluator

# Two estimators trained on the same inputs: both read the "indexedFeatures"
# vector (the VectorIndexer output column) and the "label" column.
model = LinearRegression(featuresCol="indexedFeatures", labelCol="label")

dt = DecisionTreeRegressor(featuresCol="indexedFeatures", labelCol="label")

In [38]:
# One pipeline per estimator; each runs the already-fitted featureIndexer first
# so the estimator finds the "indexedFeatures" column it expects.
pipeline_lr = Pipeline(stages=[featureIndexer, model])
pipeline_dt = Pipeline(stages=[featureIndexer, dt])

# Fit both pipelines on the training split only.
m_lr = pipeline_lr.fit(trainingData)
m_dt = pipeline_dt.fit(trainingData)


25/09/21 11:04:21 WARN Instrumentation: [c6d77a22] regParam is zero, which might cause numerical instability and overfitting.


In [39]:
# Make predictions on the held-out test split with both fitted pipelines.
predictions_lr = m_lr.transform(testData)
predictions_dt = m_dt.transform(testData)

# Show the first five rows of each: linear regression first, then decision tree.
predictions_lr.select("features", "label", "prediction").show(5)
predictions_dt.select("features", "label", "prediction").show(5)

+----------------+-----+------------------+
|        features|label|        prediction|
+----------------+-----+------------------+
|  [8.4,27.2,2.1]|  5.7| 7.973557851423301|
|   [8.6,2.1,1.0]|  4.8| 3.561616044078974|
|[17.2,45.9,69.3]|  9.3|11.711653904464114|
|[18.7,12.1,23.4]|  6.7| 5.818699366218825|
|[19.4,16.0,22.3]|  6.6|6.5393465940091815|
+----------------+-----+------------------+
only showing top 5 rows

+----------------+-----+------------------+
|        features|label|        prediction|
+----------------+-----+------------------+
|  [8.4,27.2,2.1]|  5.7|1.6000000000000005|
|   [8.6,2.1,1.0]|  4.8|               3.2|
|[17.2,45.9,69.3]|  9.3| 8.633333333333333|
|[18.7,12.1,23.4]|  6.7| 7.266666666666668|
|[19.4,16.0,22.3]|  6.6| 7.266666666666668|
+----------------+-----+------------------+
only showing top 5 rows



In [40]:
# Evaluation
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator shared by both models: compares "prediction" against "label"
# using the root-mean-squared-error metric.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse"
)

# RMSE of the linear regression pipeline on the test predictions.
rmse_lr = evaluator.evaluate(predictions_lr)
print(f"Root Mean Squared Error (RMSE) of Linear Regression on test data = {rmse_lr}")

# RMSE of the decision tree pipeline on the test predictions.
rmse_dt = evaluator.evaluate(predictions_dt)
print(f"Root Mean Squared Error (RMSE) of Decision Tree on test data = {rmse_dt}")


Root Mean Squared Error (RMSE) of Linear Regression on test data = 1.5890878967986943
Root Mean Squared Error (RMSE) of Decision Tree on test data = 1.592461187516625


In [42]:
import sklearn.metrics

# Collect label and prediction in ONE pass so the two columns stay row-aligned.
# Two separate toPandas() calls run two independent Spark jobs, and nothing
# guarantees that both return the rows in the same order.
scored = predictions_lr.select("label", "prediction").toPandas()
y_true = scored[["label"]]
y_pred = scored[["prediction"]]

# Coefficient of determination of the linear regression on the test split.
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print(f"R2 Score of Linear Regression on test data = {r2_score}")


R2 Score of Linear Regression on test data = 0.8964471118845287


In [44]:
# Stage 1 of the fitted pipeline is the decision tree model (stage 0 is the
# feature indexer); show how much each of the three input features contributed.
m_dt.stages[1].featureImportances

SparseVector(3, {0: 0.6226, 1: 0.3502, 2: 0.0271})