## Exercise 51 - Logistic Regression on Text Data

In [52]:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.sql.types import *

In [17]:
trainingDataPath = "/data/students/bigdata-01QYD/ex_data/Ex51/data/trainingData.csv"
unlabeledDataPath = "/data/students/bigdata-01QYD/ex_data/Ex51/data/unlabeledData.csv"
outputPath = "res_ex51/"

In [18]:
trainingData = spark.read.load(trainingDataPath, format="csv", header=True, inferSchema=True)
unlabeledData = spark.read.load(unlabeledDataPath, format="csv", header=True, inferSchema=True)
trainingData.show()
unlabeledData.show()

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    1|The Spark system ...|
|    1|Spark is a new di...|
|    0|Turin is a beauti...|
|    0|Turin is in the n...|
+-----+--------------------+

+-----+--------------------+
|label|                text|
+-----+--------------------+
| null|Spark performs be...|
| null|Comparison betwee...|
| null|Turin is in Piedmont|
+-----+--------------------+



In [41]:
# use two predictors : Presence of word "Spark" (bool) and the number of words
spark.udf.register("checkSpark", lambda text: "SPARK" in text.upper(), BooleanType())
spark.udf.register("getNumWords", lambda text: len(text.split(" ")), IntegerType())

<function __main__.<lambda>(text)>

In [42]:
updatedTrainingData = trainingData.selectExpr("label","checkSpark(text) as spark", "getNumWords(text) as numWords", "text")
updatedTrainingData.show()

+-----+-----+--------+--------------------+
|label|spark|numWords|                text|
+-----+-----+--------+--------------------+
|    1| true|       7|The Spark system ...|
|    1| true|       6|Spark is a new di...|
|    0|false|       5|Turin is a beauti...|
|    0|false|       8|Turin is in the n...|
+-----+-----+--------+--------------------+



In [43]:
updatedUnlabeledData.printSchema()

root
 |-- label: string (nullable = true)
 |-- spark: integer (nullable = true)
 |-- numWords: integer (nullable = true)
 |-- text: string (nullable = true)



In [44]:
updatedUnlabeledData = unlabeledData.selectExpr("label","checkSpark(text) as spark", "getNumWords(text) as numWords", "text")
updatedUnlabeledData.show()

+-----+-----+--------+--------------------+
|label|spark|numWords|                text|
+-----+-----+--------+--------------------+
| null| true|       5|Spark performs be...|
| null| true|       5|Comparison betwee...|
| null|false|       4|Turin is in Piedmont|
+-----+-----+--------+--------------------+



In [45]:
# convert the features with the VectorAssembler into a single column called "features"
assembler = VectorAssembler(inputCols=['spark','numWords'], outputCol='features')

# define the model
lr = LogisticRegression().setMaxIter(10).setRegParam(0.01)

# define the pipeline with both assembler and model
pipeline = Pipeline().setStages([assembler,lr])
classificationModel = pipeline.fit(updatedTrainingData)

predictions = classificationModel.transform(updatedUnlabeledData)

In [46]:
predictions.show()

+-----+-----+--------+--------------------+---------+--------------------+--------------------+----------+
|label|spark|numWords|                text| features|       rawPrediction|         probability|prediction|
+-----+-----+--------+--------------------+---------+--------------------+--------------------+----------+
| null| true|       5|Spark performs be...|[1.0,5.0]|[-3.1272480248757...|[0.04199718899423...|       1.0|
| null| true|       5|Comparison betwee...|[1.0,5.0]|[-3.1272480248757...|[0.04199718899423...|       1.0|
| null|false|       4|Turin is in Piedmont|[0.0,4.0]|[3.19966999960026...|[0.96082185681571...|       0.0|
+-----+-----+--------+--------------------+---------+--------------------+--------------------+----------+



In [50]:
# select just the most relevant information
selctedPredictions = predictions.selectExpr("text","prediction")
selctedPredictions.show()

+--------------------+----------+
|                text|prediction|
+--------------------+----------+
|Spark performs be...|       1.0|
|Comparison betwee...|       1.0|
|Turin is in Piedmont|       0.0|
+--------------------+----------+



In [51]:
selctedPredictions.write.csv(outputPath, header=True)