In [1]:
#Set the Spark Context
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import HiveContext
import matplotlib.pyplot as plt
from pylab import *

def getSparkContext():
    """
    Gets the Spark Context
    """
    conf = (SparkConf()
         .setMaster("local") # run on local
         .setAppName("Logistic Regression") # Name of App
         .set("spark.executor.memory", "1g")) # Set 1 gig of memory
    sc = SparkContext(conf = conf) 
    return sc

sc = getSparkContext()

In [25]:
hiveContext = HiveContext(sc)

test1 = hiveContext.sql("""
    select a.model, a.serial, a.location, a.rating, cast(b.period - a.inceptionperiod as string) as age, 1.0 as label
    from hvt as a left join hvt_operatingo as b on a.serial = b.serialo
""")
test2 = hiveContext.sql("""
    select a.model, a.serial, a.location, a.rating, cast(b.period - a.inceptionperiod as string) as age, 1.0 as label
    from hvt as a left join hvt_failuref as b on a.serial = b.serialf
""")
test = test1.unionAll(test2)
test.first()


Row(model=1, serial=31, location=5, rating=900, age=u'0', label=1.0)

In [None]:
# Convert Spark DataFrame to Panda DataFrame (PDF)
testf = test.toPandas()

In [27]:
%matplotlib inline

x = testf.age
y = testf.rating
plt.plot(x,y)
#z = hvt_PDF.hightempdays
#z1 = pandas_df.criticality
#fig = plt.figure()


AttributeError: 'function' object has no attribute 'toPandas'

In [9]:
from pyspark.sql import Row, SQLContext

# Prepare training documents, which are labeled.
LabeledDocument = Row("model", "serial", "location", "age", "label")
training = test.map(lambda x: LabeledDocument(*x)).toDF()
training.show()

training.toPandas()

AttributeError: 'function' object has no attribute 'map'

In [7]:
#from pyspark.ml.feature import VectorAssembler
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="age", outputCol="age1")
#vectorAssembler = VectorAssembler(inputCols=["model", "serial", "location", "age", "label"], outputCol="vector") 
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)

In [8]:
print model

PipelineModel-b6df23e5


In [9]:
# Prepare test documents, which are unlabeled.
Document = Row("model", "serial", "location", "age")
test = sc.parallelize([(1, 31, 5, "10"),
                      (1, 31, 5, "50"),
                      (1, 31, 5, "100")]) \
    .map(lambda x: Document(*x)).toDF()

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("model", "serial", "location", "age", "prediction")
for row in selected.collect():
    print row

sc.stop()

Row(model=1, serial=31, location=5, age=u'10', prediction=1.0)
Row(model=1, serial=31, location=5, age=u'50', prediction=1.0)
Row(model=1, serial=31, location=5, age=u'100', prediction=0.0)
