In [2]:
from pyspark.sql import SparkSession

# Single Spark session shared by every cell below; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("mllib")
    .getOrCreate()
)

In [3]:
# Load heart.csv: the first row supplies the column names, and Spark is asked
# to infer each column's type from the data rather than reading all strings.
heart = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/claastraineryubdc_corestack/data/heart.csv")
)

In [4]:
# Peek at the first two rows to sanity-check the header and the parsed values.
# NOTE(review): the output shows a trailing `_c14` column that is null in both
# rows - presumably a trailing delimiter in the file; confirm and drop if unused.
heart.show(n=2)

+---+---+---------+---+----------+---+-----------+--------------+---------------+-------------+--------+--------------------+----+-------------+----+
|age|sex|pain type| BP|cholestrol|fbs|resting ecg|max heart rate|exercise angina|ST depression|ST slope|flouroscopy coloured|thal|heart disease|_c14|
+---+---+---------+---+----------+---+-----------+--------------+---------------+-------------+--------+--------------------+----+-------------+----+
| 70|  1|        4|130|       322|  0|          2|           109|              0|          2.4|       2|                   3|   3|            2|null|
| 67|  0|        3|115|       564|  0|          2|           160|              0|          1.6|       2|                   0|   7|            1|null|
+---+---+---------+---+----------+---+-----------+--------------+---------------+-------------+--------+--------------------+----+-------------+----+
only showing top 2 rows



In [5]:
# Import the functions module under an alias instead of pulling in `min`/`max`
# by name: `from pyspark.sql.functions import min, max` rebinds Python's
# built-in min()/max() for every later cell executed in this kernel.
from pyspark.sql import functions as F

# Smallest and largest age in the dataset; the observed range is what the
# bucket borders used later in the notebook have to cover.
heart.select(F.min("age"), F.max("age")).show()

+--------+--------+
|min(age)|max(age)|
+--------+--------+
|      29|      77|
+--------+--------+



In [12]:
# Keep only the age column; this single-column DataFrame is what the
# Bucketizer is applied to in the following cells.
heart_age = heart.select('age')

In [14]:
from pyspark.ml.feature import Bucketizer

# Six borders define five age buckets, indexed 0.0 through 4.0 in the output.
# A value equal to a border falls into the bucket that starts at it: the
# displayed rows put age 40 in bucket 1.0 and age 60 in bucket 3.0.
bucketBorders = [29, 40, 50, 60, 70, 80]
bucketer = Bucketizer(
    splits=bucketBorders,
    inputCol="age",
    outputCol="Bucket",
)
bucketer.transform(heart_age).show()

+---+------+
|age|Bucket|
+---+------+
| 70|   4.0|
| 67|   3.0|
| 57|   2.0|
| 64|   3.0|
| 74|   4.0|
| 65|   3.0|
| 56|   2.0|
| 59|   2.0|
| 60|   3.0|
| 63|   3.0|
| 59|   2.0|
| 53|   2.0|
| 44|   1.0|
| 61|   3.0|
| 57|   2.0|
| 71|   4.0|
| 46|   1.0|
| 53|   2.0|
| 64|   3.0|
| 40|   1.0|
+---+------+
only showing top 20 rows



In [17]:
# Keep the bucketed result (columns: age, Bucket) so later cells can
# aggregate on it instead of re-running the transform inline.
age_bucket = bucketer.transform(heart_age)

In [20]:
# Quick look at the first five bucketed rows.
age_bucket.show(n=5)

+---+------+
|age|Bucket|
+---+------+
| 70|   4.0|
| 67|   3.0|
| 57|   2.0|
| 64|   3.0|
| 74|   4.0|
+---+------+
only showing top 5 rows



In [24]:
# Count how many rows fall into each age bucket. The grouped result is not
# sorted, so the Bucket values appear in arbitrary order in the output.
(
    age_bucket
    .select("Bucket")
    .groupBy("Bucket")
    .agg({"Bucket": "count"})
    .show()
)

+------+-------------+
|Bucket|count(Bucket)|
+------+-------------+
|   0.0|           12|
|   1.0|           67|
|   4.0|           10|
|   3.0|           74|
|   2.0|          107|
+------+-------------+



In [56]:
# Load Advertising.csv with a header row and inferred column types; the
# preview output shows the columns TV, Radio, Newspaper and Sales.
ad = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/claastraineryubdc_corestack/data/Advertising.csv")
)

In [57]:
# Preview the first two advertising rows to confirm the columns parsed as numbers.
ad.show(n=2)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
+-----+-----+---------+-----+
only showing top 2 rows



In [58]:
from pyspark.ml.feature import VectorAssembler

# Pack the three predictor columns into one vector column named 'features',
# which is the input layout the regression estimator below is configured for.
vectorAssembler = VectorAssembler(inputCols=['TV', 'Radio', 'Newspaper'], outputCol='features')

# Assemble from `ad` and keep only the feature vector and the label column.
# This previously selected from `vhouse_df`, a name that is never defined in
# this notebook, so the cell raised NameError on a fresh kernel. Building
# ad_df in a single expression also makes the cell safe to re-run.
ad_df = vectorAssembler.transform(ad).select(['features', 'Sales'])
ad_df.show(3)

+-----------------+-----+
|         features|Sales|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
+-----------------+-----+
only showing top 3 rows



In [60]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression of Sales on the assembled feature vector.
# The model is fitted on all of ad_df; no train/test split is made here.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Sales',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(ad_df)

# Coefficients follow the assembler's input order (TV, Radio, Newspaper).
# The printed result has 0.0 for the third one - presumably the L1 part of
# the elastic-net penalty removed Newspaper; verify before interpreting.
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.04262382354938965,0.1708116439330275,0.0]
Intercept: 3.78122434128


In [61]:
# Shut the Spark session down at the end of the notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()