In [None]:
ボストンの住宅価格予測のデモ

データの場所

1. Title: Boston Housing Data

2. Sources:
   (a) Origin:  This dataset was taken from the StatLib library which is
                maintained at Carnegie Mellon University.
   (b) Creator:  Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the 
                 demand for clean air', J. Environ. Economics & Management,
                 vol.5, 81-102, 1978.
   (c) Date: July 7, 1993

3. Past Usage:
   -   Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 
       1980.   N.B. Various transformations are used in the table on
       pages 244-261.
   -   Quinlan, R. (1993). Combining Instance-Based and Model-Based Learning.
       In Proceedings of the Tenth International Conference on Machine
       Learning, 236-243, University of Massachusetts, Amherst. Morgan
       Kaufmann.

4. Relevant Information:

   Concerns housing values in suburbs of Boston.

5. Number of Instances: 506

6. Number of Attributes: 13 continuous attributes (including "class"
                         attribute "MEDV"), 1 binary-valued attribute.

7. Attribute Information:

    1. CRIM      per capita crime rate by town
    2. ZN        proportion of residential land zoned for lots over 
                 25,000 sq.ft.
    3. INDUS     proportion of non-retail business acres per town
    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
                 river; 0 otherwise)
    5. NOX       nitric oxides concentration (parts per 10 million)
    6. RM        average number of rooms per dwelling
    7. AGE       proportion of owner-occupied units built prior to 1940
    8. DIS       weighted distances to five Boston employment centres
    9. RAD       index of accessibility to radial highways
    10. TAX      full-value property-tax rate per $10,000
    11. PTRATIO  pupil-teacher ratio by town
    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
                 by town
    13. LSTAT    % lower status of the population
    14. MEDV     Median value of owner-occupied homes in $1000's

8. Missing Attribute Values:  None.

In [None]:
from pyspark import SparkContext

# Read the raw housing file from HDFS as an RDD of text lines.
data = sc.textFile("hdfs:///user/hadoop/housing.data")


def _parse_record(line):
    """Split one whitespace-delimited text line into a list of floats."""
    return [float(token) for token in line.strip().split()]


# Each record becomes a list of floats, one entry per whitespace-separated field.
data2 = data.map(_parse_record)

# Preview the parsed records as a DataFrame, using the attribute names as headers.
housing_columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]
housing_preview = spark.createDataFrame(data2, housing_columns)
housing_preview.show()

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Build (label, features) rows from the parsed records.
# The label is the last field of each record (MEDV); the features are every
# field before it. The slice [:-1] stops exactly one short of the end, so the
# field immediately preceding the label (LSTAT) is kept in the feature vector.
data3 = data2.map(lambda x: [x[-1], Vectors.dense(x[:-1])])
data4 = spark.createDataFrame(data3, ["label", "features"])
data4.show(5, truncate=False)

In [None]:
# Pass-through stage: copies the existing "features" vector into "assembled".
# No later stage reads "assembled"; the stage is kept so the fitted pipeline
# retains its three-stage layout (the regression model stays at stages[2]).
assembler = VectorAssembler(inputCols=["features"], outputCol="assembled")

# Add interaction terms via polynomial expansion of the raw feature vector.
pe = PolynomialExpansion().setInputCol("features").setOutputCol("polyfeatures")

regressor = LinearRegression().setStandardization(False).setSolver("l-bfgs").setLabelCol("label")

# Hyper-parameter search space.
# featuresCol is registered exactly once: ParamGridBuilder.addGrid replaces any
# values registered earlier for the same param, and the column name has to
# match the expansion stage's output column ("polyfeatures") exactly.
paramGrid = (ParamGridBuilder()
        .addGrid(pe.degree, [2,3])
        .addGrid(regressor.maxIter, [10,25,50])
        .addGrid(regressor.regParam, [0.0, 0.01, 0.1])
        .addGrid(regressor.featuresCol, ["polyfeatures"])
        .build())

# Use RMSE as the evaluation metric.
evaluator = RegressionEvaluator(metricName="rmse")

# Fit the pipeline with 5-fold cross-validation over the parameter grid.
model = (CrossValidator()
    .setEstimator(Pipeline(stages=[assembler, pe, regressor]))
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(5)
    .fit(data4))


In [None]:
# The evaluator reports RMSE, so the smallest cross-validated average is the best score.
best_rmse = min(model.avgMetrics)
print("best metrics is", best_rmse)

In [None]:
# Apply the best pipeline found by cross-validation to the input rows and preview the output.
best_pipeline = model.bestModel
best_pipeline.transform(data4).show()

In [None]:
# Show the coefficients of the third stage of the best pipeline (the fitted regressor).
fitted_stages = model.bestModel.stages
fitted_stages[2].coefficients