In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.types import DoubleType,IntegerType
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import SparseVector

In [2]:
spark = SparkSession.builder.appName("Regression").getOrCreate()

In [4]:
df = spark.read.format("csv").option("header", True)\
.option("inferSchema", True).option("delimiter", ",")\
.load("imports-85.data")
data = df.withColumnRenamed("wheel-base", "label").select("label", "length", "width", "height")

In [6]:
data.show()

+-----+------+-----+------+
|label|length|width|height|
+-----+------+-----+------+
| 88.6| 168.8| 64.1|  48.8|
| 88.6| 168.8| 64.1|  48.8|
| 94.5| 171.2| 65.5|  52.4|
| 99.8| 176.6| 66.2|  54.3|
| 99.4| 176.6| 66.4|  54.3|
| 99.8| 177.3| 66.3|  53.1|
|105.8| 192.7| 71.4|  55.7|
|105.8| 192.7| 71.4|  55.7|
|105.8| 192.7| 71.4|  55.9|
| 99.5| 178.2| 67.9|  52.0|
|101.2| 176.8| 64.8|  54.3|
|101.2| 176.8| 64.8|  54.3|
|101.2| 176.8| 64.8|  54.3|
|101.2| 176.8| 64.8|  54.3|
|103.5| 189.0| 66.9|  55.7|
|103.5| 189.0| 66.9|  55.7|
|103.5| 193.8| 67.9|  53.7|
|110.0| 197.0| 70.9|  56.3|
| 88.4| 141.1| 60.3|  53.2|
| 94.5| 155.9| 63.6|  52.0|
+-----+------+-----+------+
only showing top 20 rows



In [8]:
from pyspark.ml.regression import LinearRegression

assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
y = assembler.transform(data)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(y)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.22836801258821893,0.8223218915856468,0.580595102043434]
Intercept: -26.380531957157498
numIterations: 11
objectiveHistory: [0.5, 0.38579526656819896, 0.13000842393266873, 0.12985504772567413, 0.12963704261349218, 0.12947103310674205, 0.1294164378448031, 0.1294050846483987, 0.12940508261516015, 0.1294050824628613, 0.12940508245526855]
+--------------------+
|           residuals|
+--------------------+
|  -4.611862798093398|
|  -4.611862798093398|
|  -2.501339043881387|
|-0.11328232985025011|
| -0.6777467081673763|
|  0.3413419946315486|
|  -2.878914311626758|
|  -2.878914311626758|
| -2.9950333320354474|
| -0.8412496309870932|
|  2.3922947158520174|
|  2.3922947158520174|
|  2.3922947158520174|
|  2.3922947158520174|
| -0.6335041529149237|
| -0.6335041529149237|
| -1.3908023008371515|
|  0.4019071188106693|
|   2.084135889634638|
|   2.787341183548463|
+--------------------+
only showing top 20 rows

RMSE: 2.517190
r2: 0.824407


In [10]:
from pyspark.sql.functions import col, when
logistic_df = df.withColumn("label", when(col("num-of-doors") == "four", 1).otherwise(0)).select("label", "length", "width", "height")

In [11]:
from pyspark.ml.classification import LogisticRegression
assembler = VectorAssembler(inputCols=logistic_df.columns[1:], outputCol="features")
z = assembler.transform(logistic_df)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(z)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

Coefficients: [0.0,0.0,0.000100509510875788]
Intercept: 0.22531532410664368


In [12]:
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlr_model = mlr.fit(z)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlr_model.coefficientMatrix))
print("Multinomial intercepts: " + str(mlr_model.interceptVector))

Multinomial coefficients: DenseMatrix([[ 0.00000000e+00,  0.00000000e+00, -7.35292649e-05],
             [ 0.00000000e+00,  0.00000000e+00,  7.35292649e-05]])
Multinomial intercepts: [-0.11156262444620539,0.11156262444620539]
