In [21]:
from pyspark.ml.linalg import Vectors

# Build a dense local vector from an explicit list of components and display it.
denseVec = Vectors.dense([1.0, 2.0, 3.0])
denseVec

DenseVector([1.0, 2.0, 3.0])

In [24]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# One-row demo frame mixing scalar columns with an already-vectorised column.
dataset = spark.createDataFrame(
    data=[(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    schema=["id", "hour", "mobile", "userFeatures", "clicked"],
)

# VectorAssembler concatenates the listed input columns, in order, into a
# single vector column named by outputCol.
assembler = (
    VectorAssembler()
    .setInputCols(["hour", "mobile", "userFeatures"])
    .setOutputCol("features")
)

output = assembler.transform(dataset)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)

Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'
+-----------------------+-------+
|features               |clicked|
+-----------------------+-------+
|[18.0,1.0,0.0,10.0,0.5]|1.0    |
+-----------------------+-------+



In [23]:
size = 3              # total length of the vector
idx = [1, 2]          # positions of the non-zero entries
values = [2.0, 3.0]   # the non-zero entries themselves
# Pair each index with its value and build the sparse vector from that mapping.
sparseVec = Vectors.sparse(size, dict(zip(idx, values)))
sparseVec

SparseVector(3, {1: 2.0, 2: 3.0})

In [7]:

# MLlib in action: create the SparkSession, or reuse the one already running.
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [16]:
# Load the sample JSON dataset and preview its first two rows.
df = spark.read.format("json").load('/Users/yanghao/github/data/spark-data/ml.json')
df.show(n=2)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
+-----+----+------+------------------+
only showing top 2 rows



In [25]:
from pyspark.ml.feature import RFormula
# R-style model formula: `lab` is the label; the right-hand side takes every
# other column plus the color:value1 and color:value2 interaction terms.
supervised = RFormula().setFormula("lab ~ . + color:value1 + color:value2")

In [26]:
# Fit the formula on df, then use the fitted model to transform the same frame
# and preview the result without truncating the wide columns.
fitted = supervised.fit(dataset=df)
preparedDF = fitted.transform(dataset=df)
preparedDF.show(n=2, truncate=False)

+-----+----+------+------------------+--------------------------------------------------------------------+-----+
|color|lab |value1|value2            |features                                                            |label|
+-----+----+------+------------------+--------------------------------------------------------------------+-----+
|green|good|1     |14.386294994851129|(10,[1,2,3,5,8],[1.0,1.0,14.386294994851129,1.0,14.386294994851129])|1.0  |
|blue |bad |8     |14.386294994851129|(10,[2,3,6,9],[8.0,14.386294994851129,8.0,14.386294994851129])      |0.0  |
+-----+----+------+------------------+--------------------------------------------------------------------+-----+
only showing top 2 rows



In [27]:
# Hold out roughly 30% of the prepared rows as a test set. The fixed seed keeps
# the split, and every result computed from it, reproducible across runs.
train, test = preparedDF.randomSplit([0.7, 0.3], seed=42)

In [28]:
# Build the model: a logistic regression reading the `label` and `features` columns.
from pyspark.ml.classification import LogisticRegression

lr = (
    LogisticRegression()
    .setLabelCol('label')
    .setFeaturesCol('features')
)

In [32]:
# Train on the training split, then preview predictions for the held-out rows.
fittedLR = lr.fit(dataset=train)
fittedLR.transform(dataset=test).show(n=2)

+-----+---+------+------------------+--------------------+-----+--------------------+--------------------+----------+
|color|lab|value1|            value2|            features|label|       rawPrediction|         probability|prediction|
+-----+---+------+------------------+--------------------+-----+--------------------+--------------------+----------+
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|[129.889408614214...|[1.0,3.8881819620...|       0.0|
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|[129.889408614214...|[1.0,3.8881819620...|       0.0|
+-----+---+------+------------------+--------------------+-----+--------------------+--------------------+----------+
only showing top 2 rows



In [23]:
# Pipeline demo: split the raw frame `df` (not the pre-transformed one), since
# RFormula is applied as a pipeline stage. The fixed seed keeps the split
# reproducible across runs.
train, test = df.randomSplit([0.7, 0.3], seed=42)
rForm = RFormula()  # no formula set here; candidates come from the parameter grid
LR = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

In [24]:
from pyspark.ml import Pipeline
# Chain feature preparation and the classifier into a single estimator.
stages = [rForm, LR]
pipeline = Pipeline(stages=stages)
pipeline

Pipeline_7397f9e4b5ca

In [25]:
from pyspark.ml.tuning import ParamGridBuilder
# Hyper-parameter grid: 2 formulas x 3 elastic-net mixes x 2 regularisation
# strengths = 12 candidate settings.
# The grid entries must be the Param objects of `LR`, the LogisticRegression
# instance that is actually a stage of the pipeline. Params are matched by
# their owning instance, so entries taken from a different estimator (`lr`)
# would never be applied to the pipeline's stage.
params = ParamGridBuilder()\
    .addGrid(rForm.formula, [
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"])\
    .addGrid(LR.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(LR.regParam, [0.1, 2.0])\
    .build()

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve is a ranking metric, so evaluate the model's
# continuous `rawPrediction` scores. Feeding it the hard 0/1 `prediction`
# column reduces the curve to a single operating point.
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol('rawPrediction')\
    .setLabelCol("label")

In [27]:
from pyspark.ml.tuning import TrainValidationSplit
# Hold-out tuning: 75% of the rows fit each candidate, the rest validate it.
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    trainRatio=0.75,
)

In [28]:
# Try every parameter combination in the grid and keep the best-scoring model.
tvsFitted = tvs.fit(dataset=train)

In [29]:
# Score the tuned model on the held-out test rows.
evaluator.evaluate(dataset=tvsFitted.transform(dataset=test))

1.0

In [1]:
from pyspark.ml.feature import Tokenizer

In [2]:
# IPython help lookup (interactive use only): shows Tokenizer's signature and docstring.
Tokenizer?

Init signature: Tokenizer(inputCol=None, outputCol=None)
Docstring:     
A tokenizer that converts the input string to lowercase and then
splits it by white spaces.

>>> df = spark.createDataFrame([("a b c",)], ["text"])
>>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
>>> tokenizer.transform(df).head()
Row(text='a b c', words=['a', 'b', 'c'])
>>> # Change a parameter.
>>> tokenizer.setParams(outputCol="tokens").transform(df).head()
Row(text='a b c', tokens=['a', 'b', 'c'])
>>> # Temporarily modify a parameter.
>>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()
Row(text='a b c', words=['a', 'b', 'c'])
>>> tokenizer.transform(df).head()
Row(text='a b c', tokens=['a', 'b', 'c'])
>>> # Must use keyword arguments to specify params.
>>> tokenizer.setParams("text")
Traceback (most recent call last):
    ...
TypeError

In [5]:
# Create the SparkSession, or reuse the one already running in this kernel.
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [6]:
from pyspark.ml.classification import RandomForestClassifier
# Random forest classifier created with no arguments, i.e. all parameters left at their defaults.
rfClassfier = RandomForestClassifier()

## Graph

In [12]:
# Load the bike-share station and trip tables; the first row of each CSV is
# treated as the header.
bikeStations = spark.read.csv(
    "/Users/yanghao//github/data/spark-data/201508_station_data.csv",
    header=True,
)
tripData = spark.read.csv(
    "/Users/yanghao//github/data/spark-data/201508_trip_data.csv",
    header=True,
)

In [13]:
# Preview two station rows without truncating long cell values.
bikeStations.show(n=2, truncate=False)

+----------+---------------------------------+---------+-----------+---------+--------+------------+
|station_id|name                             |lat      |long       |dockcount|landmark|installation|
+----------+---------------------------------+---------+-----------+---------+--------+------------+
|2         |San Jose Diridon Caltrain Station|37.329732|-121.901782|27       |San Jose|8/6/2013    |
|3         |San Jose Civic Center            |37.330698|-121.888979|15       |San Jose|8/5/2013    |
+----------+---------------------------------+---------+-----------+---------+--------+------------+
only showing top 2 rows



In [15]:
# Print the column names and types of the trip table.
tripData.printSchema()

root
 |-- Trip ID: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Start Date: string (nullable = true)
 |-- Start Station: string (nullable = true)
 |-- Start Terminal: string (nullable = true)
 |-- End Date: string (nullable = true)
 |-- End Station: string (nullable = true)
 |-- End Terminal: string (nullable = true)
 |-- Bike #: string (nullable = true)
 |-- Subscriber Type: string (nullable = true)
 |-- Zip Code: string (nullable = true)



In [17]:
# Build the graph inputs: the station name becomes the vertex `id`, and each
# trip's start/end station become the edge `src`/`dst`.
stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
# Use the exact column names from the trip schema ("Start Station", not
# "start Station"). withColumnRenamed is a silent no-op when the name does not
# match, so the rename must not depend on case-insensitive column resolution.
tripEdges = tripData\
    .withColumnRenamed("Start Station", "src")\
    .withColumnRenamed("End Station", "dst")

In [None]:
from graphframes import GraphFrame
# Assemble the graph (stations as vertices, trips as edges) and cache it.
stationGraph = GraphFrame(v=stationVertices, e=tripEdges)
stationGraph.cache()

## MLlib中的pipeline操作

主要概念:

+ DataFrame: MLlib使用Spark SQL的DataFrame作为数据集，其中的列可以存放文本、特征向量、标签和预测值等不同类型的数据

+ Transformer: 将一个DataFrame转化为另一个DataFrame。ML model也是一个Transformer，因为它将一个DataFrame转化为另一个包含prediction的DataFrame

+ Estimator: 在DataFrame上调用fit，产生一个Transformer。比如learning algorithm是一个Estimator，它在一个DataFrame上训练并产生一个model

+ Pipeline: 包含多个Transformer和Estimator

+ Parameter: 所有的Transformer和Estimator使用统一的API来指定参数

### pipeline的原理

Pipeline由一系列按顺序执行的stage（sequence of stages）组成，每个stage是一个Transformer或Estimator。

Pipeline是一个Estimator，pipeline.fit返回一个transformer，类型是PipelineModel。

In [2]:
# Create the SparkSession, or reuse the one already running in this kernel.
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Four labelled training rows: (label, dense feature vector).
training = spark.createDataFrame(
    data=[
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    schema=['label', 'features'],
)

In [4]:
# Display the training rows to confirm the label/features layout.
training.show()

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0| [0.0,1.1,0.1]|
|  0.0|[2.0,1.0,-1.0]|
|  0.0| [2.0,1.3,1.0]|
|  1.0|[0.0,1.2,-0.5]|
+-----+--------------+



In [5]:
# LogisticRegression is an Estimator; configure it through its setters.
lr = LogisticRegression().setMaxIter(10).setRegParam(0.01)
# explainParams() returns the documentation of every parameter as one string.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

In [7]:
# Fit using the parameters currently stored on `lr`.
model1 = lr.fit(dataset=training)

# Show the parameter map held by the fitted model.
print("model 1 was fit using parameters:")
print(model1.extractParamMap())

model 1 was fit using parameters:
{Param(parent='LogisticRegression_de17b80fcfc0', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2, Param(parent='LogisticRegression_de17b80fcfc0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0, Param(parent='LogisticRegression_de17b80fcfc0', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial.'): 'auto', Param(parent='LogisticRegression_de17b80fcfc0', name='featuresCol', doc='features column name'): 'features', Param(parent='LogisticRegression_de17b80fcfc0', name='fitIntercept', doc='whether to fit an intercept term'): True, Param(parent='LogisticRegression_de17b80fcfc0', name='labelCol', doc='label column name'): 'label', Param(parent='LogisticRegression_de17b80fcfc0', name='maxIter', doc='maximum num

In [10]:
# Specify parameters with a Python dict keyed by Param objects.
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # overwrite a single entry
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # set several entries at once

# Param maps are ordinary dicts, so two of them can be merged into one.
paramMap2 = {lr.probabilityCol: "myProbability"}  # rename the probability output column
paramMapCombined = {**paramMap, **paramMap2}

# Fit again, this time passing the combined map alongside the training data.
model2 = lr.fit(training, params=paramMapCombined)
model2.extractParamMap()

{Param(parent='LogisticRegression_de17b80fcfc0', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LogisticRegression_de17b80fcfc0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0,
 Param(parent='LogisticRegression_de17b80fcfc0', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial.'): 'auto',
 Param(parent='LogisticRegression_de17b80fcfc0', name='featuresCol', doc='features column name'): 'features',
 Param(parent='LogisticRegression_de17b80fcfc0', name='fitIntercept', doc='whether to fit an intercept term'): True,
 Param(parent='LogisticRegression_de17b80fcfc0', name='labelCol', doc='label column name'): 'label',
 Param(parent='LogisticRegression_de17b80fcfc0', name='maxIter', doc='maximum number of iterations (>= 0)'): 

In [11]:
# Three labelled test rows with the same (label, features) layout as the training data.
test = spark.createDataFrame(
    data=[
        (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
        (0.0, Vectors.dense([3.0, 2.0, -0.1])),
        (1.0, Vectors.dense([0.0, 2.2, -1.5])),
    ],
    schema=["label", "features"],
)


In [14]:
# Apply model2 to the test rows and show every output column in full.
prediction = model2.transform(dataset=test)
prediction.show(truncate=False)

+-----+--------------+----------------------------------------+----------------------------------------+----------+
|label|features      |rawPrediction                           |myProbability                           |prediction|
+-----+--------------+----------------------------------------+----------------------------------------+----------+
|1.0  |[-1.0,1.5,1.3]|[-2.80465694187465,2.80465694187465]    |[0.05707304171033982,0.9429269582896601]|1.0       |
|0.0  |[3.0,2.0,-0.1]|[2.4958763566420368,-2.4958763566420368]|[0.9238522311704088,0.07614776882959111]|0.0       |
|1.0  |[0.0,2.2,-1.5]|[-2.0935249027914122,2.0935249027914122]|[0.1097277611477915,0.8902722388522085] |1.0       |
+-----+--------------+----------------------------------------+----------------------------------------+----------+



In [15]:
# Bring the columns of interest back to the driver as a list of Row objects.
result = (
    prediction
    .select("features", "label", "myProbability", "prediction")
    .collect()
)

# Print one summary line per test row.
for row in result:
    print("features=%s, label=%s -> prob=%s, prediction=%s"
          % (row["features"], row["label"], row["myProbability"], row["prediction"]))

features=[-1.0,1.5,1.3], label=1.0 -> prob=[0.05707304171033982,0.9429269582896601], prediction=1.0
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.9238522311704088,0.07614776882959111], prediction=0.0
features=[0.0,2.2,-1.5], label=1.0 -> prob=[0.1097277611477915,0.8902722388522085], prediction=1.0


## MLlib中的数据类型

### local vector



In [5]:
import numpy as np
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

# The same dense vector in two forms: a plain Python list and a NumPy array.
dv2 = [1.0, 0.0, 3.0]
dv1 = np.array(dv2)

# Sparse vector of length 3 with non-zero entries at positions 0 and 2.
sv1 = Vectors.sparse(3, {0: 1.0, 2: 3.0})

# The same sparse vector as a single-column SciPy CSC matrix, built from the
# raw (data, indices, indptr) triple.
sv2 = sps.csc_matrix(
    (
        np.array([1.0, 3.0]),  # data: the stored values
        np.array([0, 2]),      # indices: row index of each stored value
        np.array([0, 2]),      # indptr: column 0 spans data[0:2]
    ),
    shape=(3, 1),
)

In [6]:
# Display both sparse representations side by side.
sv1, sv2

(SparseVector(3, {0: 1.0, 2: 3.0}),
 <3x1 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Column format>)

In [7]:
# Record the installed networkx version.
import networkx
networkx.__version__

'2.1'

In [8]:
# Record the installed pyvis version.
import pyvis
pyvis.__version__

'0.1.6.0'

In [33]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Labelled training documents as (id, text, label) rows.
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

# Three-stage pipeline: tokenizer -> hashingTF -> lr.
# hashingTF reads whatever column the tokenizer writes.
tokenizer = Tokenizer().setInputCol("text").setOutputCol("words")
hashingTF = HashingTF().setInputCol(tokenizer.getOutputCol()).setOutputCol("features")
lr = LogisticRegression().setMaxIter(10).setRegParam(0.001)
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

# Fit all stages on the training documents.
model = pipeline.fit(training)

# Unlabelled test documents as (id, text) rows.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    # Unpack into `predicted_label` rather than `prediction`, so the loop does
    # not overwrite the `prediction` DataFrame assigned above with a float.
    rid, text, prob, predicted_label = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))

(4, spark i j k) --> prob=[0.15964077387874753,0.8403592261212525], prediction=1.000000
(5, l m n) --> prob=[0.8378325685476744,0.16216743145232562], prediction=0.000000
(6, spark hadoop spark) --> prob=[0.06926633132976037,0.9307336686702395], prediction=1.000000
(7, apache hadoop) --> prob=[0.982157533344422,0.017842466655578065], prediction=0.000000
