## Setup

Here, we start up a fresh Spark session that is capable of working with ML.

In [None]:
# Setup - Run only once per Kernel App
%conda install openjdk -y

# install PySpark
%pip install pyspark==3.4.0

# install spark-nlp
%pip install spark-nlp==5.1.3

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [3]:
import sagemaker
# Open a SageMaker session; `sess` is only needed by the (currently disabled)
# default-bucket lookup below.
sess = sagemaker.Session()
# bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
# import json
import sparknlp
# import numpy as np
# import pandas as pd
# from sparknlp.base import *
# from pyspark.ml import Pipeline
# from sparknlp.annotator import *
# import pyspark.sql.functions as F
# from pyspark.sql import SparkSession
# from sparknlp.pretrained import PretrainedPipeline

# from pyspark.sql.functions import col, lower, regexp_extract, regexp_replace, array, lit
# from pyspark.ml.feature import CountVectorizer
# from pyspark.sql.types import IntegerType, ArrayType, DoubleType
# # from pyspark.sql.functions import udf
# from pyspark.ml.linalg import SparseVector, Vectors

from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier#, MultilayerPerceptronClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

In [None]:
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Local-mode session: driver memory / result-size / serializer settings plus the
# hadoop-aws package and credentials provider used for the s3a:// reads below.
spark_builder = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "32G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    .config("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.ContainerCredentialsProvider")
)
spark = spark_builder.getOrCreate()

In [6]:
# Report the Spark and Spark NLP versions the session is running with
for library_name, library_version in (("Spark", spark.version), ("sparknlp", sparknlp.version())):
    print(f"{library_name} version: {library_version}")

Spark version: 3.4.0
sparknlp version: 5.1.3


## Read in the Saved Data

Here, we read the previously saved data back in as a fresh starting point.

In [None]:
%%time
# Read in data from project bucket
bucket = "project17-bucket-alex"
directory = "matt-submissions-cv"

s3_path = f"s3a://{bucket}/{directory}"
submissions_cv = spark.read.parquet(s3_path, header = True)

In [8]:
# Preview the label column next to the count-vector array for the first ten rows
submissions_cv.select('subreddit', 'cv_array').show(n = 10)

[Stage 1:>                                                          (0 + 1) / 1]

+-------------------+--------------------+
|          subreddit|            cv_array|
+-------------------+--------------------+
|  NoStupidQuestions|[0.0, 0.0, 0.0, 0...|
|     TrueOffMyChest|[1.0, 2.0, 0.0, 0...|
|     TrueOffMyChest|[2.0, 0.0, 3.0, 0...|
|  NoStupidQuestions|[0.0, 0.0, 0.0, 0...|
|           antiwork|[0.0, 0.0, 0.0, 0...|
|  NoStupidQuestions|[0.0, 0.0, 0.0, 1...|
|relationship_advice|[0.0, 0.0, 0.0, 0...|
|           antiwork|[0.0, 0.0, 0.0, 0...|
|           antiwork|[0.0, 0.0, 0.0, 1...|
|   unpopularopinion|[1.0, 1.0, 0.0, 0...|
+-------------------+--------------------+
only showing top 10 rows



                                                                                

In [9]:
# extract vocabulary from dataframe
# Only columns that START with the prefix count as vocabulary columns, and only
# that leading prefix is stripped, so a vocabulary word that itself contains
# "word_" is left intact.
WORD_PREFIX = 'word_'
word_cols = [name for name in submissions_cv.columns if name.startswith(WORD_PREFIX)]
vocabulary = [name[len(WORD_PREFIX):] for name in word_cols]

# print the first ten vocabulary words
print(f"First ten vocabulary words: {', '.join(vocabulary[:10])}")

First ten vocabulary words: like, feel, want, know, time, tell, get, im, think, friend


## Machine Learning Model Training

Here, we run a Machine Learning model on our data, after splitting it into training and testing sets.

In [41]:
# keep just the label column and the per-word count columns
df = submissions_cv.select('subreddit', *word_cols)

In [42]:
# split data into training (90%) and testing (10%) sets with a fixed seed
split_weights = [0.90, 0.10]
train_data, test_data = df.randomSplit(weights = split_weights, seed = 6000)

In [45]:
# preview the training data: label plus the first ten word-count columns
train_data.select('subreddit', *word_cols[:10]).show(n = 10)

[Stage 39:>                                                         (0 + 1) / 1]

+-------------+---------+---------+---------+---------+---------+---------+--------+-------+----------+-----------+
|    subreddit|word_like|word_feel|word_want|word_know|word_time|word_tell|word_get|word_im|word_think|word_friend|
+-------------+---------+---------+---------+---------+---------+---------+--------+-------+----------+-----------+
|AmItheAsshole|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|     0.0|    0.0|       0.0|        0.0|
|AmItheAsshole|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|     0.0|    0.0|       0.0|        0.0|
|AmItheAsshole|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|     0.0|    0.0|       0.0|        0.0|
|AmItheAsshole|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|     0.0|    0.0|       0.0|        0.0|
|AmItheAsshole|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|     0.0|    0.0|       0.0|        0.0|
|AmItheAsshole|      0.0|      0.0|      0.0|      0.0|      0.0|      0

                                                                                

In [47]:
# build the label indexer, fit it on the training data, and inspect the labels it learned
stringIndexer_subreddit = StringIndexer(
    inputCol = "subreddit",
    outputCol = "subreddit_idx")
fitted_indexer = stringIndexer_subreddit.fit(train_data)
subreddit_labels = fitted_indexer.labels
print(subreddit_labels)

['relationship_advice', 'NoStupidQuestions', 'TrueOffMyChest', 'AmItheAsshole', 'antiwork', 'unpopularopinion', 'socialskills', 'AskMen', 'explainlikeimfive', 'tifu', 'OutOfTheLoop', 'AskWomen']


In [48]:
# assemble every word-count column into the single feature vector the classifier reads
vectorAssembler_features = VectorAssembler(inputCols = word_cols, outputCol = 'input_features')

In [49]:
# create the random forest classification model
# An explicit seed makes the forest reproducible across kernel restarts; the same
# value is used as for the train/test split.
model = RandomForestClassifier(
    labelCol = 'subreddit_idx',
    featuresCol = 'input_features',
    numTrees = 50,
    seed = 6000)

In [50]:
# map the numeric predictions back to subreddit names using the indexer's label order
labelConverter = IndexToString(inputCol = 'prediction', outputCol = 'predicted_subreddit', labels = subreddit_labels)

In [51]:
# create the pipeline: index labels -> assemble features -> classify -> decode predictions
pipeline_stages = [
    stringIndexer_subreddit,
    vectorAssembler_features,
    model,
    labelConverter,
]
pipeline_model = Pipeline(stages = pipeline_stages)

In [52]:
# fit the whole pipeline on the training data
# (this rebinds `model` from the unfitted classifier to the fitted pipeline)
model = pipeline_model.fit(dataset = train_data)

                                                                                

In [53]:
# apply the fitted pipeline to both splits (training first, then testing)
train_predictions, test_predictions = (
    model.transform(split) for split in (train_data, test_data))

## Machine Learning Model Evaluation

Here, we evaluate our model, which was trained on the training data, on both the training and testing data.

In [59]:
# create a collection of evaluators for evaluating model predictions
# every evaluator compares the same label / prediction columns; only the metric differs
shared_eval_cols = {'labelCol': 'subreddit_idx', 'predictionCol': 'prediction'}

# accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(metricName = 'accuracy', **shared_eval_cols)
# f1-score
evaluator_f1 = MulticlassClassificationEvaluator(metricName = 'f1', **shared_eval_cols)
# weighted precision
evaluator_precision = MulticlassClassificationEvaluator(metricName = 'weightedPrecision', **shared_eval_cols)
# weighted recall
evaluator_recall = MulticlassClassificationEvaluator(metricName = 'weightedRecall', **shared_eval_cols)
# weighted true positive rate
evaluator_tprate = MulticlassClassificationEvaluator(metricName = 'weightedTruePositiveRate', **shared_eval_cols)

In [61]:
# evaluate the predictions

def _score_both_splits(evaluator):
    """Return (train_score, test_score) for one evaluator, scoring the training predictions first."""
    train_score = evaluator.evaluate(train_predictions)
    test_score = evaluator.evaluate(test_predictions)
    return train_score, test_score

# accuracy
train_accuracy, test_accuracy = _score_both_splits(evaluator_accuracy)
print(f'Training Accuracy: {round(train_accuracy, 4)}')
print(f'Testing Accuracy:  {round(test_accuracy, 4)}')

# f1-score
train_f1, test_f1 = _score_both_splits(evaluator_f1)
print(f'Training F1-Score: {round(train_f1, 4)}')
print(f'Testing F1-Score:  {round(test_f1, 4)}')

# weighted precision
train_precision, test_precision = _score_both_splits(evaluator_precision)
print(f'Training Weighted Precision: {round(train_precision, 4)}')
print(f'Testing Weighted Precision:  {round(test_precision, 4)}')

# weighted recall
train_recall, test_recall = _score_both_splits(evaluator_recall)
print(f'Training Weighted Recall: {round(train_recall, 4)}')
print(f'Testing Weighted Recall:  {round(test_recall, 4)}')

# weighted true positive rate
train_tprate, test_tprate = _score_both_splits(evaluator_tprate)
print(f'Training Weighted True Positive Rate: {round(train_tprate, 4)}')
print(f'Testing Weighted True Positive Rate:  {round(test_tprate, 4)}')

Training Accuracy: 0.5376
Testing Accuracy:  0.5361
Training F1-Score: 0.4237
Testing F1-Score:  0.4216
Training Weighted Precision: 0.4833
Testing Weighted Precision:  0.4647
Training Weighted Recall: 0.5376
Testing Weighted Recall:  0.5361
Training Weighted True Positive Rate: 0.5376
Testing Weighted True Positive Rate:  0.5361


In [69]:
# create a data frame for the evaluation metrics: one row per (metric, dataset) pair,
# train row before test row within each metric
metric_values = {
    'accuracy': (train_accuracy, test_accuracy),
    'f1': (train_f1, test_f1),
    'precision': (train_precision, test_precision),
    'recall': (train_recall, test_recall),
    'tprate': (train_tprate, test_tprate),
}
eval_records = [
    (dataset, metric, value)
    for metric, (train_value, test_value) in metric_values.items()
    for dataset, value in (('train', train_value), ('test', test_value))
]
eval_df = pd.DataFrame(eval_records, columns = ['dataset', 'metric', 'value'])

In [70]:
# write the metrics table to CSV without the positional row index
eval_df.to_csv(path_or_buf = '../../data/matt-submissions-cv-eval-metrics.csv', index = False)

## Model Evaluation Visualization

Here, we prepare data for visualization through confusion matrices and other metrics.

In [63]:
# extract the predictions and true values for the training data
# Both columns are collected in ONE pass so each prediction is paired with the
# label from its own row (separate collects would rely on two Spark jobs
# returning rows in the same order, and would run the job twice per split).
train_label_rows = train_predictions.select('subreddit_idx', 'prediction').collect()
y_pred_train = [row['prediction'] for row in train_label_rows]
y_orig_train = [row['subreddit_idx'] for row in train_label_rows]

# extract the predictions and true values for the testing data
test_label_rows = test_predictions.select('subreddit_idx', 'prediction').collect()
y_pred_test = [row['prediction'] for row in test_label_rows]
y_orig_test = [row['subreddit_idx'] for row in test_label_rows]

                                                                                

In [64]:
# create confusion matrices from the predictions and true values
# Pin the label set to every indexed class so both matrices always have one
# row/column per entry of subreddit_labels, even if a class never appears in a
# split; otherwise the matrix would shrink and no longer line up with the labels.
# assumes label indices are the floats 0.0 .. len(subreddit_labels) - 1, in
# subreddit_labels order (as the IndexToString stage also assumes) - TODO confirm
class_indices = [float(idx) for idx in range(len(subreddit_labels))]
cm_train = confusion_matrix(y_orig_train, y_pred_train, labels = class_indices)
cm_test = confusion_matrix(y_orig_test, y_pred_test, labels = class_indices)

In [65]:
# wrap each confusion matrix in a data frame labelled by subreddit on both axes
cm_train_df, cm_test_df = (
    pd.DataFrame(matrix, index = subreddit_labels, columns = subreddit_labels)
    for matrix in (cm_train, cm_test))

In [67]:
# save the confusion matrices to CSV (training first, then testing)
# index = False drops the row labels; rows follow the same subreddit order as the columns
cm_exports = {
    '../../data/matt-submissions-cv-train-cm.csv': cm_train_df,
    '../../data/matt-submissions-cv-test-cm.csv': cm_test_df,
}
for csv_path, cm_frame in cm_exports.items():
    cm_frame.to_csv(csv_path, index = False)