In [11]:
#SVMWithSGD

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    """Convert one comma-separated text row into a LabeledPoint.

    Every field is parsed as a float. Column 11 becomes the label and
    columns 0 through 9 become the feature vector.

    NOTE(review): the slice stops before column 10, so that column is used
    neither as a feature nor as the label -- confirm this is intended.
    """
    fields = list(map(float, line.split(',')))
    label = fields[11]
    features = fields[:10]
    return LabeledPoint(label, features)

# Load the CSV and parse every row into a LabeledPoint. The parsed RDD is
# cached because it is scanned many times below: by each training run and
# again by every evaluation pass.
data = sc.textFile("winequality-white.csv")
parsedData = data.map(parsePoint).cache()

# Both error rates share the same denominator, so count the rows only once.
numPoints = float(parsedData.count())

# Build the model with L1 penalty
# NOTE(review): this run overrides `step` while the L2 run below keeps the
# library default, so the two models differ in more than the penalty type.
model1 = SVMWithSGD.train(parsedData, regType='l1', step=0.0001)

# Build the model with L2 penalty
model2 = SVMWithSGD.train(parsedData, regType='l2')

# Evaluating the model with L1 on training data: (true label, prediction) pairs
labelsAndPreds1 = parsedData.map(lambda p: (p.label, model1.predict(p.features)))

# Evaluating the model with L2 on training data: (true label, prediction) pairs
labelsAndPreds2 = parsedData.map(lambda p: (p.label, model2.predict(p.features)))

# Training error = fraction of rows whose prediction differs from the label.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.

# Calculating training error for model with L1
trainErr1 = labelsAndPreds1.filter(lambda vp: vp[0] != vp[1]).count() / numPoints
print("Training Error for SVMWithSGD - L1 = " + str(trainErr1))

# Calculating training error for model with L2
trainErr2 = labelsAndPreds2.filter(lambda vp: vp[0] != vp[1]).count() / numPoints
print("Training Error for SVMWithSGD - L2 = " + str(trainErr2))



Training Error for SVMWithSGD - L1 = 0.0367496937526
Training Error for SVMWithSGD - L2 = 0.0367496937526


In [56]:
#LogisticRegressionWithLBFGS

from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.feature import Normalizer

# Load and parse the data
def parsePoint(line):
    """Parse one comma-separated row into a LabeledPoint.

    All fields are converted to floats; the label is read from column 11 and
    the features from columns 0 through 9.

    NOTE(review): column 10 is skipped by the feature slice and is not the
    label either -- confirm that dropping it is intended.
    """
    numbers = [float(token) for token in line.split(',')]
    target = numbers[11]
    return LabeledPoint(target, numbers[:10])

# Load the CSV and parse every row into a LabeledPoint. The parsed RDD is
# cached because the training runs and the evaluation passes below all
# rescan it.
# NOTE: the features are used unscaled. If normalization is added later, it
# has to be applied to the parsed feature vectors, not to the raw text lines.
data = sc.textFile("winequality-white.csv")
parsedData = data.map(parsePoint).cache()

# Both error rates share the same denominator, so count the rows only once.
numPoints = float(parsedData.count())

# Build the model with L1 penalty
# NOTE(review): the MLlib optimization guide says the L1 updater is designed
# for gradient descent and does not work with L-BFGS -- verify that this
# model really is L1-regularized before comparing it with the L2 model.
model1 = LogisticRegressionWithLBFGS.train(parsedData, regType='l1')

# Build the model with L2 penalty
model2 = LogisticRegressionWithLBFGS.train(parsedData, regType='l2')

# Evaluating the model with L1 on training data: (true label, prediction) pairs
labelsAndPreds1 = parsedData.map(lambda p: (p.label, model1.predict(p.features)))

# Evaluating the model with L2 on training data: (true label, prediction) pairs
labelsAndPreds2 = parsedData.map(lambda p: (p.label, model2.predict(p.features)))

# Training error = fraction of rows whose prediction differs from the label.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.

# Calculating training error for model with L1
trainErr1 = labelsAndPreds1.filter(lambda vp: vp[0] != vp[1]).count() / numPoints
print("Training Error for LogisticRegressionWithLBFGS - L1 = " + str(trainErr1))

# Calculating training error for model with L2
trainErr2 = labelsAndPreds2.filter(lambda vp: vp[0] != vp[1]).count() / numPoints
print("Training Error for LogisticRegressionWithLBFGS - L2 = " + str(trainErr2))




Training Error for LogisticRegressionWithLBFGS - L1 = 0.0367496937526
Training Error for LogisticRegressionWithLBFGS - L2 = 0.0367496937526


In [54]:
#LogisticRegressionWithSGD

from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.feature import Normalizer

# Load and parse the data
def parsePoint(line):
    """Build a LabeledPoint from one comma-separated row of floats.

    Label: column 11. Features: columns 0 through 9.

    NOTE(review): the feature slice ends before column 10, so that column is
    unused -- confirm this is intended.
    """
    row = []
    for cell in line.split(','):
        row.append(float(cell))
    return LabeledPoint(row[11], row[:10])

# Load the CSV and parse every row into a LabeledPoint. The parsed RDD is
# cached because the SGD training runs and the evaluation passes below all
# rescan it.
# NOTE: the features are used unscaled. If normalization is added later, it
# has to be applied to the parsed feature vectors, not to the raw text lines.
data = sc.textFile("winequality-white.csv")
parsedData = data.map(parsePoint).cache()

# Both error rates share the same denominator, so count the rows only once.
numPoints = float(parsedData.count())

# Build the model with L1 penalty
# NOTE(review): this run overrides `step` and `regParam` while the L2 run
# below keeps the library defaults, so the two reported errors reflect
# different hyperparameters as well as different penalty types.
model1 = LogisticRegressionWithSGD.train(
    parsedData, regType='l1', iterations=100, step=0.0001, regParam=0.00001)

# Build the model with L2 penalty
model2 = LogisticRegressionWithSGD.train(parsedData, regType='l2')

# Evaluating the model with L1 on training data: (true label, prediction) pairs
labelsAndPreds1 = parsedData.map(lambda p: (p.label, model1.predict(p.features)))

# Evaluating the model with L2 on training data: (true label, prediction) pairs
labelsAndPreds2 = parsedData.map(lambda p: (p.label, model2.predict(p.features)))

# Training error = fraction of rows whose prediction differs from the label.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.

# Calculating training error for model with L1
trainErr1 = labelsAndPreds1.filter(lambda vp: vp[0] != vp[1]).count() / numPoints
print("Training Error for LogisticRegressionWithSGD - L1 = " + str(trainErr1))

# Calculating training error for model with L2
trainErr2 = labelsAndPreds2.filter(lambda vp: vp[0] != vp[1]).count() / numPoints
print("Training Error for LogisticRegressionWithSGD - L2 = " + str(trainErr2))




Training Error for LogisticRegressionWithSGD - L1 = 0.0367496937526
Training Error for LogisticRegressionWithSGD - L2 = 0.0467537770519


In [2]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD 
from numpy import array

# Load and parse the data
def parsePoint(line):
    """Turn one text row into a LabeledPoint for regression.

    Commas are treated like spaces, the row is split on single spaces, and
    every token is parsed as a float. The first value is the label; all
    remaining values are the features.

    NOTE(review): the classification cells in this notebook read their label
    from column 11 of the same file; here that column ends up among the
    features and column 0 is the target -- confirm this is intended.
    """
    tokens = line.replace(',', ' ').split(' ')
    numbers = list(map(float, tokens))
    return LabeledPoint(numbers[0], numbers[1:])

# Load the CSV and parse every row into a LabeledPoint. The parsed RDD is
# cached because the SGD training run and the evaluation below rescan it.
data = sc.textFile("winequality-white.csv")
parsedData = data.map(parsePoint).cache()

# Build the model. The hyperparameters are passed by keyword so that the
# meaning of 100 and 0.0001 is visible at the call site.
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.0001)

# Evaluate the model on training data: (true value, prediction) pairs
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Mean squared error = sum of squared residuals / number of rows.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.
squaredErrors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
MSE = squaredErrors.reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error for LinearRegressionWithSGD = " + str(MSE))

Mean Squared Error for LinearRegressionWithSGD = 4.2568240886


In [7]:
from pyspark.mllib.regression import LabeledPoint, RidgeRegressionWithSGD 
from numpy import array

# Load and parse the data
def parsePoint(line):
    """Parse one row (comma- and/or space-separated floats) into a LabeledPoint.

    The first value is used as the label and every following value as a
    feature.

    NOTE(review): the classification cells in this notebook take their label
    from column 11 of the same file; here column 0 is the target and column
    11 is fed in as a feature -- confirm this is intended.
    """
    normalized = line.replace(',', ' ')
    numbers = [float(piece) for piece in normalized.split(' ')]
    target = numbers[0]
    predictors = numbers[1:]
    return LabeledPoint(target, predictors)

# Load the CSV and parse every row into a LabeledPoint. The parsed RDD is
# cached because the SGD training run and the evaluation below rescan it.
data = sc.textFile("winequality-white.csv")
parsedData = data.map(parsePoint).cache()

# Build the model. The hyperparameters are passed by keyword so that the
# meaning of 100 and 0.0001 is visible at the call site.
model = RidgeRegressionWithSGD.train(parsedData, iterations=100, step=0.0001)

# Evaluate the model on training data: (true value, prediction) pairs
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Mean squared error = sum of squared residuals / number of rows.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.
squaredErrors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
MSE = squaredErrors.reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error for RidgeRegressionWithSGD = " + str(MSE))

Mean Squared Error for RidgeRegressionWithSGD = 4.25682589184


In [4]:
from pyspark.mllib.regression import LabeledPoint, LassoWithSGD 
from numpy import array

# Load and parse the data
def parsePoint(line):
    """Create a LabeledPoint from one row of comma- and/or space-separated floats.

    Label: the first value. Features: all values after the first.

    NOTE(review): the classification cells in this notebook use column 11 of
    the same file as their label; in this parser that column is part of the
    feature vector and column 0 is the target -- confirm this is intended.
    """
    numbers = []
    for token in line.replace(',', ' ').split(' '):
        numbers.append(float(token))
    return LabeledPoint(numbers[0], numbers[1:])

# Load the CSV and parse every row into a LabeledPoint. The parsed RDD is
# cached because the SGD training run and the evaluation below rescan it.
data = sc.textFile("winequality-white.csv")
parsedData = data.map(parsePoint).cache()

# Build the model. Both hyperparameters are passed by keyword so that the
# meaning of 100 is visible at the call site.
model = LassoWithSGD.train(parsedData, iterations=100, step=0.0001)

# Evaluate the model on training data: (true value, prediction) pairs
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Mean squared error = sum of squared residuals / number of rows.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters (`lambda (v, p): ...`) are a SyntaxError on Python 3.
squaredErrors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
MSE = squaredErrors.reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error for LassoWithSGD = " + str(MSE))

Mean Squared Error for LassoWithSGD = 4.25710987027


In [44]:
# Stop the SparkContext held in `sc`.
# NOTE(review): the recorded execution counts are out of order (this cell is
# In [44], while cells above it are In [54] and In [56]) -- confirm the
# intended run order, since cells that use `sc` need a live context.
sc.stop()

In [60]:
# Bare expression: shows the object currently bound to `sc` as the cell output.
sc

<pyspark.context.SparkContext at 0x10d6c18d0>