In [1]:
# Import os (to read SPARK_HOME) and the findspark module
import os

import findspark

# Locate the Spark installation. An existing, non-empty SPARK_HOME environment
# variable takes precedence; otherwise fall back to the original default path,
# so the notebook also runs on machines where Spark is installed elsewhere.
findspark.init(os.environ.get("SPARK_HOME") or "/usr/local/spark/")

In [2]:
# Import the SparkSession module
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession: local master, a descriptive application
# name, and the executor memory setting.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Linear Regression Model")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext; the RDD API is used below.
sc = spark.sparkContext

In [3]:
# Load the housing records as an RDD of text lines. The path is relative and
# carries no scheme (such as hdfs://), so where it resolves depends on the
# Spark/Hadoop configuration.
housing_data_path = './data/cal_housing.data'
rdd = sc.textFile(housing_data_path)

# Load the companion .domain file the same way (presumably the attribute
# descriptions for the data file -- verify against the dataset).
housing_domain_path = './data/cal_housing.domain'
header = sc.textFile(housing_domain_path)

In [4]:
# Turn every raw text line into a list of its comma-separated fields.
# NOTE: this rebinds `rdd`, so re-running this cell without first re-running
# the load cell would try to split lists instead of strings.
rdd = rdd.map(lambda record: record.split(","))

In [5]:
# Import the spark sql and sql.types, and sql.functions modules 
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build one Row per parsed record, naming each positional field after the
# attribute it holds, then let toDF() turn the resulting RDD into a DataFrame.
# The values are still strings at this point.
def _fields_to_row(fields):
    """Wrap the first nine fields of a parsed record in a named Row."""
    return Row(
        longitude=fields[0],
        latitude=fields[1],
        housingMedianAge=fields[2],
        totalRooms=fields[3],
        totalBedRooms=fields[4],
        population=fields[5],
        households=fields[6],
        medianIncome=fields[7],
        medianHouseValue=fields[8],
    )


df = rdd.map(_fields_to_row).toDF()

In [6]:
def convertColumn(df, names, newType):
    """Cast the listed columns of a DataFrame to a single type.

    Args:
        df: DataFrame-like object supporting ``df[name]`` and ``withColumn``.
        names: Iterable of column names to convert.
        newType: Target type handed to each column's ``cast``.

    Returns:
        The DataFrame obtained after replacing every listed column with its
        cast version; ``df`` itself when ``names`` is empty.
    """
    converted = df
    for column_name in names:
        casted = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, casted)
    return converted

# Names of all nine columns of the DataFrame
columns = [
    'households',
    'housingMedianAge',
    'latitude',
    'longitude',
    'medianHouseValue',
    'medianIncome',
    'population',
    'totalBedRooms',
    'totalRooms',
]

# Cast every listed column to FloatType
df = convertColumn(df, columns, FloatType())

#df.printSchema()

In [7]:
# Deliberately not dividing by 100,000: no scaling is needed here and the
# values look more plausible this way.
income_per_household = col("medianIncome") * 1000 / col("households")
value_per_person = col("medianHouseValue") / col("population")
value_per_age_year = col("medianHouseValue") / col("housingMedianAge")

df_1 = (
    df
    .withColumn("medianIncomePerHousehold", income_per_household)
    .withColumn("medianHouseValuePerPopulation", value_per_person)
    .withColumn("medianHouseValuePerHousingMedianAge", value_per_age_year)
)

# Note: this rebinds `columns` to the three derived column names.
columns = [
    'medianIncomePerHousehold',
    'medianHouseValuePerPopulation',
    'medianHouseValuePerHousingMedianAge',
]
df_1 = convertColumn(df_1, columns, FloatType())

# Print the first five rows of the three derived columns.
df_1.select(*columns).show(5)

+------------------------+-----------------------------+-----------------------------------+
|medianIncomePerHousehold|medianHouseValuePerPopulation|medianHouseValuePerHousingMedianAge|
+------------------------+-----------------------------+-----------------------------------+
|                66.07302|                    1405.5901|                          11039.024|
|                7.294728|                    149.31279|                          17071.428|
|                41.00226|                      709.879|                           6771.154|
|               25.767578|                    611.64874|                          6563.4614|
|               14.850193|                     605.6637|                           6580.769|
+------------------------+-----------------------------+-----------------------------------+
only showing top 5 rows



In [8]:
# Add three ratio features to the main DataFrame: rooms and population per
# household, and the share of bedrooms among all rooms.
df = (
    df
    .withColumn("roomsPerHousehold", col("totalRooms") / col("households"))
    .withColumn("populationPerHousehold", col("population") / col("households"))
    .withColumn("bedroomsPerRoom", col("totalBedRooms") / col("totalRooms"))
)
# Uncomment to print the new columns for the first row.
#df.select("roomsPerHousehold","populationPerHousehold","bedroomsPerRoom").show(1)

In [9]:
# Reorder the columns so that medianHouseValue comes first, followed by the
# remaining columns. The next step treats the first column as the label.
ordered_columns = [
    "medianHouseValue",
    "longitude",
    "latitude",
    "housingMedianAge",
    "totalBedRooms",
    "totalRooms",
    "population",
    "households",
    "roomsPerHousehold",
    "populationPerHousehold",
    "bedroomsPerRoom",
    "medianIncome",
]
df = df.select(*ordered_columns)
#df.show(5)

In [10]:
# Import the DenseVector Module
from pyspark.ml.linalg import DenseVector

# Split each row into a (label, features) pair: the first column is the label
# and the remaining columns are packed into a dense vector of feature values.
input_data = df.rdd.map(lambda row: (row[0], DenseVector(row[1:])))

# Turn the RDD of pairs back into a DataFrame with two named columns.
df_ml = spark.createDataFrame(input_data, schema=["label", "features"])

# Return the first five rows as a list of Row objects.
df_ml.take(5)

[Row(label=452600.0, features=DenseVector([-122.23, 37.88, 41.0, 129.0, 880.0, 322.0, 126.0, 6.9841, 2.5556, 0.1466, 8.3252])),
 Row(label=358500.0, features=DenseVector([-122.22, 37.86, 21.0, 1106.0, 7099.0, 2401.0, 1138.0, 6.2381, 2.1098, 0.1558, 8.3014])),
 Row(label=352100.0, features=DenseVector([-122.24, 37.85, 52.0, 190.0, 1467.0, 496.0, 177.0, 8.2881, 2.8023, 0.1295, 7.2574])),
 Row(label=341300.0, features=DenseVector([-122.25, 37.85, 52.0, 235.0, 1274.0, 558.0, 219.0, 5.8174, 2.5479, 0.1845, 5.6431])),
 Row(label=342200.0, features=DenseVector([-122.25, 37.85, 52.0, 280.0, 1627.0, 565.0, 259.0, 6.2819, 2.1815, 0.1721, 3.8462]))]

In [11]:
# Import the StandardScaler Module
from pyspark.ml.feature import StandardScaler

# Set up feature scaling based on column summary statistics of the fitted data.
# NOTE(review): the estimator is created with its default options, and the
# scaled longitude shown in the output stays strongly negative, so the mean
# does not appear to be subtracted (scaling to unit standard deviation only)
# -- confirm against the withMean / withStd defaults.
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Learn the scaling statistics from the data, then apply them.
scaler = standardScaler.fit(df_ml)
scaled_df = scaler.transform(df_ml)

# Keep only the label and the scaled vector, exposing the latter under the
# name "features".
# (Author's note: using scaled or non-scaled values doesn't have an effect here.)
scaled_df = scaled_df.select("label", col("features_scaled").alias("features"))

# Print the first row to the console.
scaled_df.show(1)

+--------+--------------------+
|   label|            features|
+--------+--------------------+
|452600.0|[-61.007270679927...|
+--------+--------------------+
only showing top 1 row



In [12]:
# Randomly partition the rows into training and test sets using 0.8 / 0.2
# weights; the fixed seed keeps the split repeatable.
split_weights = [.8, .2]
train_data, test_data = scaled_df.randomSplit(split_weights, seed=1234)

In [13]:
# Peek at the first five training rows (returned as a list of Row objects).
train_data.take(5)

[Row(label=14999.0, features=DenseVector([-61.4764, 18.8721, 2.8604, 0.0665, 0.0449, 0.0159, 0.0209, 4.9511, 0.2166, 4.9241, 0.2821])),
 Row(label=14999.0, features=DenseVector([-58.8261, 16.0303, 4.1317, 0.6338, 0.3681, 0.5545, 0.5885, 1.4425, 0.2687, 5.7305, 2.2072])),
 Row(label=14999.0, features=DenseVector([-58.4069, 17.0416, 1.5097, 0.5674, 0.2837, 0.4327, 0.4289, 1.5255, 0.2877, 6.6543, 1.1054])),
 Row(label=22500.0, features=DenseVector([-61.0522, 17.7579, 2.6221, 0.1733, 0.1357, 0.1907, 0.1648, 1.899, 0.3301, 4.2504, 1.408])),
 Row(label=22500.0, features=DenseVector([-60.5381, 17.7673, 4.1317, 0.1875, 0.049, 0.1475, 0.1386, 0.816, 0.3034, 12.7245, 0.4167]))]

In [14]:
# Import the LinearRegression Module
from pyspark.ml.regression import LinearRegression
# Configure the estimator: use the "label" column as target and allow up to
# 10000 iterations.
# NOTE(review): with regParam=0 no penalty should be applied, so the
# elasticNetParam mix presumably has no effect -- confirm.
lr = LinearRegression(
    labelCol="label",
    maxIter=10000,
    regParam=0,
    elasticNetParam=1,
)

# Fit the model on the training split.
linearModel = lr.fit(train_data)

In [15]:
# Apply the trained model to the test split; the result carries the model's
# output in an added "prediction" column.
predicted = linearModel.transform(test_data)

In [16]:
# Print the column names and types of the prediction DataFrame.
predicted.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [17]:
# Display labels, feature vectors and predictions side by side.
predicted.select("label","features","prediction").show()

+-------+--------------------+------------------+
|  label|            features|        prediction|
+-------+--------------------+------------------+
|14999.0|[-61.261818434728...| 81395.28159154393|
|17500.0|[-59.060707297721...| 195855.7107780762|
|33200.0|[-59.385135162751...| 93288.21803263761|
|34600.0|[-60.328469051178...|-75413.89191184286|
|35000.0|[-58.581553878836...|29673.063382339198|
|36600.0|[-59.390123607217...|63185.855945119634|
|36700.0|[-61.815840408308...| 43623.47798000416|
|39400.0|[-60.662877613113...| 29124.33374533709|
|40900.0|[-59.459999909473...|  74330.6296063033|
|41300.0|[-61.501397048157...|158324.32299417863|
|42100.0|[-59.674621068680...| 99524.94548265962|
|42500.0|[-60.238627547139...|119665.66694469936|
|42500.0|[-59.499930313040...| 72223.08001787867|
|42500.0|[-59.395115859656...| 85448.20409646537|
|42600.0|[-59.390123607217...| 54742.86744293338|
|43000.0|[-60.553070907291...| 80267.83391900081|
|43300.0|[-59.784427774503...| 64571.75558787072|


In [18]:
# Project the prediction and label columns once and turn every row into a
# plain (prediction, label) tuple. Pairing happens within each row, so it does
# not rely on two separately derived RDDs having identical partitioning (which
# RDD.zip() requires), and the projection is only defined once.
prediction_label_rdd = (
    predicted
    .select("prediction", "label")
    .rdd
    .map(lambda row: (row[0], row[1]))
)

# Individual RDDs, kept so that code relying on these names keeps working.
predictions = prediction_label_rdd.map(lambda pair: pair[0])
labels = prediction_label_rdd.map(lambda pair: pair[1])

# Bring the pairs to the driver as a list of (prediction, label) tuples.
predictionAndLabel = prediction_label_rdd.collect()

# Show the first 5 instances of `predictionAndLabel`.
predictionAndLabel[:5]

[(81395.28159154393, 14999.0),
 (195855.7107780762, 17500.0),
 (93288.21803263761, 33200.0),
 (-75413.89191184286, 34600.0),
 (29673.063382339198, 35000.0)]

In [19]:
# Fitted model coefficients (a DenseVector; presumably one weight per entry of
# the feature vector, in the same order -- confirm).
linearModel.coefficients

DenseVector([-82691.3385, -87375.313, 14365.4742, 10394.8402, 1688.6633, -46549.7439, 39245.7148, 7001.0789, -331.5516, 16960.1748, 80932.8868])

In [20]:
# Fitted model intercept (constant term of the regression).
linearModel.intercept

-3554020.494834979

In [21]:
# Root mean squared error (the square root of the mean squared error), read
# from the model's `summary`.
# NOTE(review): `summary` is not computed from the `predicted` test-set frame;
# it should describe the fit on the training data -- confirm. For held-out
# metrics, linearModel.evaluate(test_data) is presumably the right call.
linearModel.summary.rootMeanSquaredError

68245.64450390877

In [22]:
# R^2, the coefficient of determination, read from the model's `summary`.
# More about this here: https://www.ncl.ac.uk/webtemplate/ask-assets/external/maths-resources/statistics/regression-and-correlation/coefficient-of-determination-r-squared.html
# NOTE(review): like the RMSE, this value is not computed from the `predicted`
# test-set frame; it should describe the fit on the training data -- confirm.
linearModel.summary.r2

0.6502674985280331