In [None]:
import pyspark
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second context in the same process raises
# "Cannot run multiple SparkContexts at once". A fresh context still uses
# the local[*] master, as before.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
from pyspark.sql import SQLContext
# Build a SQLContext on top of the SparkContext created above.
# NOTE(review): sqlc is not referenced by any of the visible cells, and newer
# Spark releases appear to favor SparkSession -- confirm this is still needed.
sqlc = SQLContext(sc)

## Linear Methods

### Logistic Regression

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

# Download the sample dataset into the working directory. -nc (--no-clobber)
# skips the fetch when the file is already present, so re-running this cell
# does not accumulate "sample_svm_data.txt.1", ".2", ... copies.
!wget -nc https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_svm_data.txt

In [None]:
def parsePoint(line):
    """Convert one whitespace-delimited text record into a LabeledPoint.

    The first number on the line is used as the label; every remaining
    number becomes a feature.

    Args:
        line: One text record, e.g. "1 0 2.5 0".

    Returns:
        LabeledPoint(label, features) built from the parsed floats.

    Raises:
        ValueError: If the line is blank or a token is not a valid float.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace. split(' ') would produce '' tokens in those
    # cases and float('') raises ValueError.
    values = [float(token) for token in line.split()]
    if not values:
        # Keep signalling blank records with ValueError, as float('') did.
        raise ValueError("empty record: expected a label followed by features")
    return LabeledPoint(values[0], values[1:])

# Load the downloaded file through the SparkContext, one element per text line.
data = sc.textFile("sample_svm_data.txt")

# Map every line through parsePoint to obtain LabeledPoint records.
parsedData = data.map(parsePoint)

In [None]:
# Display the first parsed record as a quick sanity check of the parsing step.
parsedData.take(1)

In [None]:
# Fit a logistic-regression classifier on the parsed records.
model = LogisticRegressionWithLBFGS.train(parsedData)

# Pair each record's true label with the label predicted by the model.
labelsAndPreds = parsedData.map(
    lambda point: (point.label, model.predict(point.features))
)

# Training error = misclassified records / all records (evaluated on the
# same data the model was trained on).
trainErr = (
    labelsAndPreds.filter(lambda pair: pair[0] != pair[1]).count()
    / float(parsedData.count())
)
print("Training Error = " + str(trainErr))

### Linear Regression

In [None]:
from pyspark.mllib.regression import LinearRegressionWithSGD, LinearRegressionModel

# Download the regression dataset into the working directory. -nc
# (--no-clobber) skips the fetch when the file is already present, so
# re-running this cell does not accumulate "lpsa.data.1", ".2", ... copies.
!wget -nc https://raw.githubusercontent.com/apache/spark/master/data/mllib/ridge-data/lpsa.data

In [None]:
def parsePoint(line):
    """Convert one "label,feat feat ..." text record into a LabeledPoint.

    Commas are treated like whitespace, so the first number on the line is
    the label and every remaining number is a feature. Purely
    whitespace-separated records are accepted as well.

    NOTE: this rebinds the notebook-global parsePoint defined in the
    logistic-regression section.

    Args:
        line: One text record, e.g. "-0.43,-1.6 -2.0 0.5".

    Returns:
        LabeledPoint(label, features) built from the parsed floats.

    Raises:
        ValueError: If the line is blank or a token is not a valid float.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace. split(' ') would produce '' tokens in those
    # cases (e.g. after "1, 2" becomes "1  2") and float('') raises ValueError.
    values = [float(token) for token in line.replace(',', ' ').split()]
    if not values:
        # Keep signalling blank records with ValueError, as float('') did.
        raise ValueError("empty record: expected a label followed by features")
    return LabeledPoint(values[0], values[1:])

# Load the downloaded file through the SparkContext, one element per text line,
# then map every line through parsePoint to obtain LabeledPoint records.
# Both names rebind the variables used in the logistic-regression section.
data = sc.textFile("lpsa.data")
parsedData = data.map(parsePoint)

In [None]:
# Display the first parsed record as a quick sanity check of the parsing step.
parsedData.take(1)

In [None]:
# Fit a linear-regression model with SGD (100 iterations, step size 1e-08).
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=1e-08)

# Pair each record's true value with the value predicted by the model.
valuesAndPreds = parsedData.map(
    lambda point: (point.label, model.predict(point.features))
)

# Mean squared error: sum of squared residuals divided by the record count
# (evaluated on the same data the model was trained on).
MSE = (
    valuesAndPreds.map(lambda pair: (pair[0] - pair[1]) ** 2).reduce(lambda acc, sq: acc + sq)
    / valuesAndPreds.count()
)

print("Mean Squared Error = " + str(MSE))

In [None]:
# Stop the SparkContext created in the first cell now that the notebook is done.
sc.stop()