In [1]:
from pyspark.ml import linalg as ml_linalg
from pyspark.mllib.regression import LabeledPoint

def as_mllib(v):
    """Convert a ``pyspark.ml`` vector into the matching ``pyspark.mllib`` vector.

    Args:
        v: A ``pyspark.ml.linalg.SparseVector`` or ``DenseVector``.

    Returns:
        The ``pyspark.mllib.linalg`` sparse or dense vector built from the
        same size/indices/values (sparse) or the same array (dense).

    Raises:
        TypeError: If ``v`` is neither of the two supported ``pyspark.ml``
            vector types.
    """
    # Imported locally so the helper is self-contained: the notebook only
    # brings ``MLLibVectors`` into the global namespace in a later cell, and
    # calling this function before that cell ran would raise NameError.
    from pyspark.mllib.linalg import Vectors as MLLibVectors

    if isinstance(v, ml_linalg.SparseVector):
        return MLLibVectors.sparse(v.size, v.indices, v.values)
    elif isinstance(v, ml_linalg.DenseVector):
        return MLLibVectors.dense(v.toArray())
    else:
        raise TypeError("Unsupported type: {0}".format(type(v)))

In [2]:
# Load the dataset
from pyspark.sql import DataFrame

# Input file; it is read as CSV with its first line treated as the header.
data_file = '/home/user/elicon/data/Test_Data/California_Quake/unlabelled/retweeted/processed/napa_quake.txt'

# Read the file into a DataFrame, then register it under the name "tweets"
# so it can also be queried through Spark SQL.
df = spark.read.csv(data_file, header=True)
df.createOrReplaceTempView("tweets")

In [3]:
from pyspark.mllib.classification import SVMModel
from pyspark.ml import PipelineModel

In [4]:
# Load the fitted pipeline model persisted at this path (relative to the
# current working directory); it is applied to the tweets further down.
pipeModel = PipelineModel.load("target/tmp/HashingTF_Binary_model")

In [5]:
from pyspark.ml.linalg import Vector as MLVector, Vectors as MLVectors
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

# Extract features using the trained pipeline: run the loaded pipeline over
# the raw DataFrame. The "features" column of the result is what the
# classification step selects later on.
tfidfData = pipeModel.transform(df)


In [6]:
# Build MLlib LabeledPoints from the pipeline's "features" column
def _to_labeled_point(row):
    """Wrap the first column of ``row`` as an MLlib LabeledPoint with label 0."""
    return LabeledPoint(0, as_mllib(row[0]))

feats = tfidfData.select("features").rdd
feats_vec = feats.map(_to_labeled_point)
# DataFrame built from the same labeled points
feats_vector = spark.createDataFrame(feats_vec)

In [7]:
# Trained model: load the persisted SVM model through the SparkContext `sc`.
# It is used below to score each feature vector.
sameModel = SVMModel.load(sc, "target/tmp/SVMWithSGD_Binary_model")

In [8]:
# Score every labeled point with the loaded SVM, producing
# (label, prediction) tuples.
labeled_data = feats_vec.map(
    lambda point: (point.label, sameModel.predict(point.features))
)
# Turn the tuples into a DataFrame (columns _1 and _2), then give the
# columns readable names.
doo_label = spark.createDataFrame(labeled_data)
fin = doo_label.selectExpr("_1 as label", "_2 as prediction")

In [12]:
# Keep only the predicted class; the label column is not carried forward.
f_preds = fin.select("prediction")

In [13]:
# Preview the predictions (the output below reports only the top 20 rows).
f_preds.show()

+----------+
|prediction|
+----------+
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         1|
|         1|
|         0|
|         1|
|         1|
|         1|
|         1|
|         0|
|         0|
|         1|
|         1|
|         1|
|         1|
|         0|
+----------+
only showing top 20 rows



In [15]:
from pyspark.sql import *

In [17]:
# Persist the predictions as CSV with a header row. coalesce(1) collapses the
# data to one partition so a single part file is produced; note that Spark
# creates a directory at the target path, not a plain file.
# mode("overwrite") makes the cell re-runnable: without it the writer's
# default save mode raises an error once the output path already exists.
f_preds.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").mode("overwrite").save("/home/user/elicon/data/Test_Data/California_Quake/unlabelled/retweeted/processed/napa_quake_test.txt")