In [1]:
# Display the SparkContext handle. `sc` is not created in any cell visible here;
# presumably it is injected by the PySpark kernel/shell at startup (TODO confirm).
sc

<pyspark.context.SparkContext at 0x7f3292ebf790>

In [2]:
from pyspark.sql import SQLContext
# Build a SQLContext on top of the existing SparkContext `sc`.
# NOTE(review): `sqlc` is not referenced by any of the cells visible in this section.
sqlc = SQLContext(sc)

## Linear Methods

### Logistic Regression

In [5]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_svm_data.txt

--2016-09-26 08:55:31--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_svm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.12.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.12.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39474 (39K) [text/plain]
Saving to: ‘sample_svm_data.txt.1’


2016-09-26 08:55:31 (670 KB/s) - ‘sample_svm_data.txt.1’ saved [39474/39474]



In [6]:
def parsePoint(line):
    """Parse one whitespace-separated text line into a LabeledPoint.

    The first number on the line becomes the label; the remaining numbers
    become the feature vector.

    Args:
        line: a string of numbers separated by whitespace.

    Returns:
        LabeledPoint(label, features) built from the parsed floats.

    Raises:
        ValueError: if the line is blank or a token is not a valid float.
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so a doubled or trailing space no longer
    # yields an empty token that float() rejects.
    values = [float(x) for x in line.split()]
    if not values:
        # Keep the exception type callers saw before (float('') -> ValueError).
        raise ValueError("cannot parse an empty line into a LabeledPoint")
    return LabeledPoint(values[0], values[1:])

# Load the downloaded file through the SparkContext (textFile reads it line by line).
data = sc.textFile("sample_svm_data.txt")

# Turn every text line into a LabeledPoint using parsePoint defined above.
parsedData = data.map(parsePoint)

In [7]:
# Sanity check: show the first parsed record to confirm the label/feature split.
parsedData.take(1)

[LabeledPoint(1.0, [0.0,2.52078447202,0.0,0.0,0.0,2.00468443649,2.00034729927,0.0,2.22838704274,2.22838704274,0.0,0.0,0.0,0.0,0.0,0.0])]

In [9]:
# Fit a logistic-regression model on the full parsed data set.
model = LogisticRegressionWithLBFGS.train(parsedData)

# Pair every true label with the model's prediction for the same point.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Training error = fraction of points whose prediction differs from the label.
# The pair is indexed explicitly instead of using `lambda (v, p): ...`:
# tuple-parameter unpacking is a SyntaxError on Python 3, while indexing
# behaves the same on Python 2 and 3.
# float(...) keeps the division from truncating to an integer on Python 2.
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

Training Error = 0.366459627329


### Linear Regression

In [10]:
from pyspark.mllib.regression import LinearRegressionWithSGD, LinearRegressionModel

!wget https://raw.githubusercontent.com/apache/spark/master/data/mllib/ridge-data/lpsa.data

--2016-09-26 08:58:53--  https://raw.githubusercontent.com/apache/spark/master/data/mllib/ridge-data/lpsa.data
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.12.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.12.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10395 (10K) [text/plain]
Saving to: ‘lpsa.data’


2016-09-26 08:58:54 (11,1 MB/s) - ‘lpsa.data’ saved [10395/10395]



In [11]:
def parsePoint(line):
    """Parse one comma/whitespace-separated text line into a LabeledPoint.

    Commas are treated like whitespace, so "label,f1 f2 f3" and
    "label f1 f2 f3" parse the same way. The first number becomes the label;
    the remaining numbers become the feature vector.

    Args:
        line: a string of numbers separated by commas and/or whitespace.

    Returns:
        LabeledPoint(label, features) built from the parsed floats.

    Raises:
        ValueError: if the line is blank or a token is not a valid float.
    """
    # Normalise commas to spaces, then split on any run of whitespace.
    # split() (no argument) drops the empty tokens that split(' ') would
    # produce for ", " sequences, doubled spaces, or trailing whitespace.
    values = [float(x) for x in line.replace(',', ' ').split()]
    if not values:
        # Keep the exception type callers saw before (float('') -> ValueError).
        raise ValueError("cannot parse an empty line into a LabeledPoint")
    return LabeledPoint(values[0], values[1:])

# Load the downloaded lpsa file through the SparkContext (textFile reads it line by line).
# NOTE: this rebinds `data` and `parsedData` from the logistic-regression section above.
data = sc.textFile("lpsa.data")
# Turn every text line into a LabeledPoint using the comma-aware parsePoint defined above.
parsedData = data.map(parsePoint)

In [12]:
# Sanity check: show the first parsed record to confirm the label/feature split.
parsedData.take(1)

[LabeledPoint(-0.4307829, [-1.63735562648,-2.00621178481,-1.86242597251,-1.02470580167,-0.522940888712,-0.863171185426,-1.04215728919,-0.864466507337])]

In [13]:
# Fit a linear-regression model with SGD on the full parsed data set.
# NOTE(review): the step size 0.00000001 is kept exactly as in the original;
# why it is this small is not evident from this notebook (confirm before tuning).
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

# Pair every true target value with the model's prediction for the same point.
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

# Mean squared error = sum of squared residuals / number of points.
# The pair is indexed explicitly instead of using `lambda (v, p): ...`:
# tuple-parameter unpacking is a SyntaxError on Python 3, while indexing
# behaves the same on Python 2 and 3.
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()

print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 7.4510328101


