# CSCIE63 - Linear Regression Example

### Our Task - Build a Linear Regression Model to Predict HorsePower given Displacement

In [None]:
import os
import pyspark.mllib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark

from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from IPython.display import display, HTML
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD 
from pyspark.sql.functions import *
from pyspark.sql import SQLContext

In [None]:
# Configure the Spark application: local master using all cores ("local[*]"),
# the application name, and the executor memory setting.
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("e63CarsLinear")
conf.set("spark.executor.memory", "2g")

# getOrCreate hands back an already-running SparkContext if there is one, so
# evaluating this cell several times does not create multiple contexts.
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)

In [None]:
# Load the raw CSV and split every line into its comma-separated fields.

AUTO_MPG_CSV = "auto_mpg_original.csv"


def split_csv_line(line):
    """Split one text line on commas (plain split, no quote handling)."""
    return line.split(",")


# use_unicode=False asks Spark to hand the lines back without decoding them
# to unicode strings.
auto = sc.textFile(AUTO_MPG_CSV, use_unicode=False)
autordd = auto.map(split_csv_line)
num_data = autordd.count()
print('Total Count:%s' % num_data)

In [None]:
# Attach a column name to each positional CSV field so the RDD can be turned
# into a DataFrame for clean-up and preliminary data analysis.

def to_named_row(fields):
    """Wrap the first nine fields of a split CSV line in a named Row.

    The values are stored exactly as read (no type conversion happens here).
    """
    return Row(
        mpg=fields[0],
        cylinders=fields[1],
        displacement=fields[2],
        horsepower=fields[3],
        weight=fields[4],
        acceleration=fields[5],
        my=fields[6],  # presumably the model year -- confirm against the data set
        origin=fields[7],
        name=fields[8],
    )


mappedautordd = autordd.map(to_named_row)

In [None]:
# Peek at the first five mapped Row objects to check the field-to-name mapping.
mappedautordd.take(5)

In [None]:
# Clean the data: keep only the rows whose mpg and horsepower are both greater
# than zero. This is the step meant to remove the N/A entries.

df = mappedautordd.toDF()
for column in ('mpg', 'horsepower'):
    df = df.filter(df[column] > 0)

# Row count after cleaning; num_data now refers to the cleaned data set.
num_data = df.count()
print('Total Count:%s' % num_data)
df.take(5)

In [None]:
# Summary of our input data.

# Keep only the two columns used by the model, cast to float.
numeric_columns = [df[name].cast('float') for name in ('displacement', 'horsepower')]
df2 = df.select(*numeric_columns)

# Not the last expression of the cell, so this result is not displayed.
df2.take(2)

# count / mean / stddev / min / max for both columns.
df2.describe('displacement', 'horsepower').show()

# NOTE(review): myvar pulls every row to the driver and is not referenced by
# any of the visible cells -- confirm whether it is still needed.
myvar = df2.collect()

In [None]:
# Look at the distribution of our Data 

%pylab inline
targets = df.select(df.horsepower.cast('float')).rdd.flatMap(lambda x: x).collect()
hist(targets, bins=40, color='lightblue', normed=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(8, 5)

In [None]:
# Set up some error-metric helper functions that we use later.
# An alternative is to use Spark's built-in RegressionMetrics.

def squared_error(actual, pred):
    """Return the squared difference between `pred` and `actual`."""
    residual = pred - actual
    return residual ** 2
def abs_error(actual, pred):
    """Return the absolute difference between `pred` and `actual`."""
    residual = pred - actual
    return np.absolute(residual)
def squared_log_error(pred, actual):
    """Return the squared difference of log(pred + 1) and log(actual + 1).

    Note that the parameter order here is (pred, actual). The result is
    symmetric in the two arguments, so swapping them gives the same value.
    """
    log_pred = np.log(pred + 1)
    log_actual = np.log(actual + 1)
    return (log_pred - log_actual) ** 2

In [None]:
# Create our training and test data.

# LabeledPoint takes (label, features). The task is to predict horsepower from
# displacement, so horsepower is the label and displacement is the single
# feature. The columns are picked by name so the mapping does not depend on
# the column order of df2.
transformed = df2.rdd.map(lambda r: LabeledPoint(r['horsepower'], [r['displacement']]))

print("---------------------------------")
print("How does Labeled Point Look like ")
print("---------------------------------")
print(" ")
print(transformed.take(5))
print(" ")

# Split our data 80% / 20%; the fixed seed keeps the split reproducible.
print("---------------------------------")
print("Splitting our Data")
print("---------------------------------")

trainingData, testingData = transformed.randomSplit([.8, .2], seed=1234)

# Mark both splits as cached before the first action, so the counts below
# populate the cache and later cells reuse it instead of recomputing.
trainingData = trainingData.cache()
testingData = testingData.cache()

train_size = trainingData.count()
test_size = testingData.count()
print("Training data size: %d" % train_size)
print("Test data size: %d" % test_size)
print("Total data size: %d " % num_data)
print("Train + Test size : %d" % (train_size + test_size))

print("")
print("---------------------")
print(" VIEW OUR TESTDATA ")
print("---------------------")

print(testingData.take(5))

In [None]:
# We train our model now.
# Please note: use LinearRegressionWithSGD ONLY.

linearModel = LinearRegressionWithSGD.train(trainingData, iterations=1000, step=.0001)

print("Linear Model Info:" + str(linearModel))

# We validate our model against the held-out test data: each element of
# true_vs_predicted is a (true label, predicted label) pair.
print("---------------------------------")
print("Predictions")
print("---------------------------------")
print("")

true_vs_predicted = testingData.map(lambda p: (p.label, linearModel.predict(p.features)))
print("Linear Model predictions: " + str(true_vs_predicted.take(10)))
print("")

# Gather metrics. The pair is indexed explicitly because tuple-unpacking
# lambda parameters (`lambda (t, p): ...`) are a SyntaxError on Python 3.
mse = true_vs_predicted.map(lambda tp: squared_error(tp[0], tp[1])).mean()
mae = true_vs_predicted.map(lambda tp: abs_error(tp[0], tp[1])).mean()
rmsle = np.sqrt(true_vs_predicted.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())

print("---------------------------------")
print("Model Metrics")
print("---------------------------------")
print("")

print("Linear Model - Mean Squared Error: %2.4f" % mse)
print("Linear Model - Mean Absolute Error: %2.4f" % mae)
print("Linear Model - Root Mean Squared Log Error: %2.4f" % rmsle)



In [None]:
def test_point_to_tuple(p):
    """Return (label, model prediction, first feature) of a point as floats."""
    return (float(p.label), float(linearModel.predict(p.features)), float(p.features[0]))


# Bring the test points to the driver as a pandas DataFrame. The column names
# assume the label is horsepower and the first feature is displacement.
tvp = testingData.map(test_point_to_tuple).toDF().toPandas()
tvp.columns = ['horsepower', 'predicted horsepower', 'displacement']

# Plot for the 20% test split: actual values in blue, predictions in red.
test_displacement = tvp['displacement']
test_predicted_hp = tvp['predicted horsepower']

plt.figure(1, figsize=(10, 6))
plt.scatter(test_displacement, tvp['horsepower'], c='b', label='hp actual')
plt.plot(test_displacement, test_predicted_hp, c='r', label='predicted hp')
plt.scatter(test_displacement, test_predicted_hp, c='r', label='predicted hp values')
plt.xlabel('displacement')
plt.ylabel('horsepower')
plt.title('Horsepower - Actual vs. Predictions 20% Test Data')
plt.legend()
plt.show()

In [None]:
# Plot all data: the 20% test points (taken from tvp, which must already
# exist) together with the 80% training points and the model's predictions.

# Same (label, prediction, first feature) layout as tvp, for the training split.
training_tuples = trainingData.map(
    lambda p: (float(p.label), float(linearModel.predict(p.features)), float(p.features[0]))
)
tvp2 = training_tuples.toDF().toPandas()
tvp2.columns = ['horsepower', 'predicted horsepower', 'displacement']

plt.figure(2, figsize=(12, 8))

# Actual values: test split in blue, training split in green.
plt.scatter(tvp['displacement'], tvp['horsepower'], c='b', label='hp .2 sample data')
plt.scatter(tvp2['displacement'], tvp2['horsepower'], c='g', label='hp . 8 training data')

# Predictions: a line through the training predictions, plus the individual
# test-split predictions in magenta.
plt.plot(tvp2['displacement'], tvp2['predicted horsepower'], c='r', label='predicted hp')
plt.scatter(tvp['displacement'], tvp['predicted horsepower'], c='m', label='predicted hp values')

plt.xlabel('displacement')
plt.ylabel('horsepower')
plt.legend()
plt.title('Horsepower - Actual vs. Predictions - All Data')
plt.show()