In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [2]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt").toDF()
data = MLUtils.convertVectorColumnsToML(data)

In [4]:
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(692,[127,128,129...|  0.0|
|(692,[158,159,160...|  1.0|
|(692,[124,125,126...|  1.0|
|(692,[152,153,154...|  1.0|
|(692,[151,152,153...|  1.0|
+--------------------+-----+
only showing top 5 rows



## Linear Methods

### Logistic Regression

In [5]:
from pyspark.ml.classification import LogisticRegression

trainingData, testData = data.randomSplit([0.7, 0.3])

logr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

logrModel = logr.fit(trainingData)

print("Weights: %s Intercept: %s" % (logrModel.coefficients, logrModel.intercept))

Weights: (692,[235,262,263,272,300,350,351,373,377,378,379,400,401,405,406,407,427,428,433,434,435,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-5.66941646038e-05,-6.61845214531e-06,-0.000237392190065,-0.000121649196521,-0.000304540153515,0.000121044597822,3.735238165e-05,-5.92146746419e-05,4.04943643292e-05,0.000287982124862,8.9623466956e-05,-6.2270115717e-05,-1.59460323245e-05,0.000145475272444,0.000806553732949,0.000180116937072,-7.46633797934e-05,-0.000282876242809,0.000693812382515,0.000878107367771,1.57925836842e-05,-0.000268542635485,-0.000302600933472,0.000243716306416,0.000821951640224,-0.00040257937391,-0.000261238404013,0.00017830935341,0.00014042635226,-0.000275846816202,-0.000779339943587,-0.00034440691843,0.00018497199115,-0.000676529463912,-0.000497910504305,-0.000259180469562]) Intercept: 0.2133909175163545


In [6]:
trainingSummaryLR = logrModel.summary

In [7]:
trainingSummaryLR.roc.show()

+---+--------------------+
|FPR|                 TPR|
+---+--------------------+
|0.0|                 0.0|
|0.0|0.023809523809523808|
|0.0|0.047619047619047616|
|0.0| 0.07142857142857142|
|0.0| 0.09523809523809523|
|0.0| 0.11904761904761904|
|0.0| 0.14285714285714285|
|0.0| 0.16666666666666666|
|0.0| 0.19047619047619047|
|0.0| 0.21428571428571427|
|0.0| 0.23809523809523808|
|0.0|  0.2619047619047619|
|0.0|  0.2857142857142857|
|0.0| 0.30952380952380953|
|0.0|  0.3333333333333333|
|0.0| 0.35714285714285715|
|0.0| 0.38095238095238093|
|0.0| 0.40476190476190477|
|0.0| 0.42857142857142855|
|0.0|  0.4523809523809524|
+---+--------------------+
only showing top 20 rows



In [8]:
trainingSummaryLR.areaUnderROC

1.0

### Linear Regression

In [9]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

lrModel = lr.fit(trainingData)

print("Weights: %s Intercept: %s" % (lrModel.coefficients, lrModel.intercept))

Weights: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.

In [10]:
trainingSummaryLLS = lrModel.summary

print("numIterations: %s" % trainingSummaryLLS.totalIterations)

numIterations: 11


In [11]:
trainingSummaryLLS.residuals.show(5)

+--------------------+
|           residuals|
+--------------------+
| 0.23293845870885632|
|-0.29813292115965656|
|  0.5617896330603364|
| 0.30972897604056415|
| -0.2963071615339964|
+--------------------+
only showing top 5 rows



In [12]:
sc.stop()