In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, unix_timestamp, when, lit, length, array_sort
)

from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Tokenizer, SQLTransformer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

import os

In [None]:
# Obtain the SparkSession for this notebook; getOrCreate() reuses an already
# running session when one exists instead of starting a second one.
session_builder = SparkSession.builder.appName('Predictive Analytics I')
spark = session_builder.getOrCreate()

In [None]:
# Input locations are resolved relative to the project root, which is taken
# to be two directory levels above the notebook's working directory.
base_path = os.getcwd()

# os.path.dirname is separator-aware, so this also works where the path
# separator is not '/', and it never collapses a shallow path to ''.
project_path = os.path.dirname(os.path.dirname(base_path))

# Answers dataset, read as-is from the project's data directory.
answers_input_path = os.path.join(project_path, 'data/answers')

# Questions dataset, read from the project's output directory.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

In [None]:
# Load both inputs. No format is set on the reader, so Spark's configured
# default data source is used for each path.
answersDF = spark.read.load(answers_input_path)

questionsDF = spark.read.load(questions_input_path)

In [None]:
# Join each question to its accepted answer (default inner join, so questions
# without a matching accepted answer are dropped).
accepted_answer_match = questionsDF['accepted_answer_id'] == answersDF['answer_id']

# Seconds elapsed between the question and its accepted answer.
seconds_to_answer = unix_timestamp('answer_time') - unix_timestamp('question_time')

# Label is 1 when the accepted answer arrived within 7200 seconds (two hours),
# otherwise 0 -- including rows where the response time is null.
answered_in_time = when(col('response_time') <= 7200, lit(1)).otherwise(0)

data_with_label = (
    questionsDF.alias('questions')
    .join(answersDF.alias('answers'), accepted_answer_match)
    .select(
        col('questions.tags'),
        col('questions.creation_date').alias('question_time'),
        col('questions.title'),
        col('questions.body').alias('message'),
        col('answers.creation_date').alias('answer_time'),
    )
    .withColumn('response_time', seconds_to_answer)
    .withColumn('label', answered_in_time)
)

<b>Take a look at the distribution of classes</b>

In [None]:
# Row count per label value, printed as a table.
data_with_label.groupBy('label').count().show()

In [None]:
# Print the column names and types of the labelled dataset.
data_with_label.printSchema()

In [None]:
# First hand-crafted feature: the length of the `title` column.
title_length = length('title')
data_with_basic_features = data_with_label.withColumn('title_complexity', title_length)

In [None]:
# 70% train / 30% test, with a fixed seed for the random split.
split_weights = [0.7, 0.3]
train_data, test_data = data_with_basic_features.randomSplit(split_weights, seed=24)

In [None]:
# Baseline model: a random forest trained on the title length alone.
features = ['title_complexity']

# Pack the feature columns into the single vector column the classifier reads.
assembler = VectorAssembler(outputCol='features', inputCols=features)

# Classifier (seeded).
rf = RandomForestClassifier(seed=42, featuresCol='features', labelCol='label')

pipeline = Pipeline(stages=[assembler, rf])

# Fit every stage on the training split only.
rf_model = pipeline.fit(train_data)

In [None]:
# Score the fitted baseline pipeline on the held-out split; the cell's output
# is the area under the ROC curve.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol='label')

predictions = rf_model.transform(test_data)

evaluator.evaluate(predictions)

In [None]:
# Exploratory step: add `tags_sorted`, the `tags` array sorted in ascending
# order. The result is not assigned, so later cells are unaffected; the cell
# simply evaluates to the resulting DataFrame.
(
    data_with_basic_features
    .withColumn('tags_sorted', array_sort('tags'))
)

In [None]:
# Print the column names and types of the training split.
train_data.printSchema()

In [None]:
# SQL stage that keeps every input column and appends `message_size`, the
# number of elements in the `words` column. `__THIS__` is the placeholder for
# the DataFrame being transformed.
message_size_statement = "SELECT *, size(words) AS message_size FROM __THIS__"
sqlTrans = SQLTransformer(statement=message_size_statement)

In [None]:
# Second model: title length plus the token count of the message body.
features = ['title_complexity', 'message_size']

# Tokenize `message` into `words`; sqlTrans then derives `message_size` from it.
tokenizer = Tokenizer(outputCol='words', inputCol='message')

# Pack the feature columns into the single vector column the classifier reads.
assembler = VectorAssembler(outputCol='features', inputCols=features)

# Classifier (seeded).
rf = RandomForestClassifier(seed=42, featuresCol='features', labelCol='label')

# Stage order matters: `words` must exist before sqlTrans runs, and
# `message_size` must exist before the assembler runs.
pipeline = Pipeline(stages=[tokenizer, sqlTrans, assembler, rf])

rf_model = pipeline.fit(train_data)

In [None]:
# Score the fitted two-feature pipeline on the held-out split; the cell's
# output is the area under the ROC curve.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol='label')

predictions = rf_model.transform(test_data)

evaluator.evaluate(predictions)