In [1]:
# initialise findspark first; the pyspark imports in later cells come after this call
import findspark

findspark.init()

In [3]:
# create (or reuse) the SparkSession for this notebook
import pyspark
from pyspark.sql import SparkSession

# SparkConf with nothing set explicitly in this cell
conf = pyspark.SparkConf()

spark = (
    SparkSession.builder
    .appName("project")
    .config(conf=conf)
    .getOrCreate()
)

# No bare `spark` expression here: only the last expression of a cell is
# displayed, and this cell ends with an assignment, so it had no effect.
sc = spark.sparkContext

In [4]:
# load the merged dataset (Parquet on HDFS) into a Spark DataFrame
DF_PARQUET_PATH = "hdfs://ip-172-31-74-188.ec2.internal:8020/user/hadoop/df"
df = spark.read.parquet(DF_PARQUET_PATH)

In [6]:
# convert controversiality to double: it is the label column and is either 0 or 1
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# build the cast expression first, then overwrite the column in place
label_as_double = col('controversiality').cast(DoubleType())
df = df.withColumn('controversiality', label_as_double)

In [7]:
# split data into training (70%) and testing (30%)
# A fixed seed makes the split repeatable across runs; without it every
# re-run trains and evaluates on different rows, so metrics are not comparable.
# NOTE(review): repeatability also assumes the input DataFrame is read with
# the same partitioning/ordering each time - confirm for this dataset.
SPLIT_SEED = 42

split_data = df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train_data, test_data = split_data

In [9]:
# logistic regression pipeline that predicts controversiality
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# predictor columns that are packed into the single "features" vector column
# NOTE(review): the *index columns look like integer-encoded categories; if so,
# they are being used here as plain numeric features - confirm this is intended.
predictor_cols = [
    "authorindex",
    "subredditindex",
    "parentindex",
    "score",
    "timeofday",
]
log_features = VectorAssembler(inputCols=predictor_cols, outputCol="features")

# classifier: label is controversiality, input is the assembled vector
log_mod = LogisticRegression(featuresCol="features", labelCol="controversiality")

# stage order matters: assemble the vector first, then fit the classifier on it
pipeline_log = Pipeline(stages=[log_features, log_mod])

# fit both stages on the training split
log_fit = pipeline_log.fit(train_data)

In [17]:
# the last pipeline stage is the fitted logistic regression; show its training summary
fitted_log_mod = log_fit.stages[-1]
fitted_log_mod.summary

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x7f6c9c2d8d30>

In [10]:
# run the fitted pipeline over the test split to get the logistic regression predictions
logpredictions = log_fit.transform(test_data)

In [11]:
# print the true label next to the predicted label (show() prints the top 20 rows by default)
label_vs_prediction = logpredictions.select("controversiality", "prediction")
label_vs_prediction.show()

+----------------+----------+
|controversiality|prediction|
+----------------+----------+
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             1.0|       0.0|
|             0.0|       0.0|
|             1.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
+----------------+----------+
only showing top 20 rows



In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test-set predictions by area under the ROC curve.
# The metric is named explicitly rather than queried with a mid-cell
# getMetricName() call, whose result was never displayed.
evaluator = BinaryClassificationEvaluator(
    labelCol='controversiality',
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# last expression of the cell, so the AUC value is displayed as the cell output
evaluator.evaluate(logpredictions)

0.8699466965706979

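The BinaryClassificationEvaluator gives us the area under the ROC curve (AUC) for our logistic regression model on the test data. An AUC of approximately 0.87 tells us that the model is good at distinguishing controversial comments from non-controversial ones. Note that AUC measures how well the model ranks comments across all decision thresholds, not its accuracy at the default 0.5 cutoff: in the sample of predictions shown above, both controversial comments were still predicted as 0.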