## Setup

Here, we start a fresh Spark session that is configured to work with Spark NLP.

In [None]:
# Setup - Run only once per Kernel App
# Installs a JDK into the kernel's conda environment (presumably the JVM that Spark runs on).
%conda install openjdk -y

# install PySpark (pinned; the sentiment fit cell below notes that 3.5.0 does not work)
%pip install pyspark==3.4.0

# install spark-nlp (same 5.1.3 version as the spark-nlp jar requested when the Spark session is built)
%pip install spark-nlp==5.1.3

# restart kernel so the newly installed packages are picked up
# NOTE(review): this injects JavaScript that calls the `Jupyter` notebook object; it presumably only
# works in front ends that expose that object - confirm, or restart the kernel manually.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [2]:
import sagemaker
# Open a SageMaker session and look up its default bucket.
# Note: `bucket` is reassigned to the project bucket name in the data-loading cells below.
sess = sagemaker.Session()
bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline

from pyspark.sql.functions import col, lower, regexp_extract, regexp_replace, array, lit
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.functions import vector_to_array
from pyspark.sql.types import IntegerType, ArrayType, DoubleType
# from pyspark.sql.functions import udf
from pyspark.ml.linalg import SparseVector, Vectors

In [None]:
# Build (or reuse) a local Spark session that uses every available core and
# requests the Spark NLP and hadoop-aws packages.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "32G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config(
        "spark.jars.packages",
        "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2",
    )
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

In [5]:
# Report the Spark and Spark NLP versions used by this session.
print("Spark version: {}".format(spark.version))
print("sparknlp version: {}".format(sparknlp.version()))

Spark version: 3.4.0
sparknlp version: 5.1.3


## Read in the Saved Data

Here, we read the previously saved age/gender data back in from S3 as a fresh starting point.

In [72]:
%%time
# Read in data from project bucket
bucket = "project17-bucket-alex"
directory = "matt-submissions-age-gender"

s3_path = f"s3a://{bucket}/{directory}"
submissions_age_gender = spark.read.parquet(s3_path, header = True)

CPU times: user 3.61 ms, sys: 0 ns, total: 3.61 ms
Wall time: 483 ms


In [73]:
# Preview the text columns next to the regex-derived age and gender.
submissions_age_gender.select('title', 'selftext', 'regex_age', 'regex_gender').show(n=10)

+--------------------+--------------------+---------+------------+
|               title|            selftext|regex_age|regex_gender|
+--------------------+--------------------+---------+------------+
|my boyfriend(27) ...|So my boyfriend(m...|       27|           f|
|Confused in an in...|\nIn a new relati...|       21|           f|
|Asking for phone ...|So, I (21M) was a...|       21|           m|
|LDR bf of 3 month...|I(25F) met my bf(...|       25|           f|
|I break up with m...|I (23m) shared a ...|       23|           m|
|How can I get mor...|My boyfriend(32M)...|       23|           f|
|I (35F) can't get...|So I live with an...|       35|           f|
|I think I'm a les...|I (25f) have been...|       25|           f|
|I (24F) snore too...|So, I (24 F) am i...|       24|           f|
|One of my best fr...|I’m on mobile so ...|       21|           f|
+--------------------+--------------------+---------+------------+
only showing top 10 rows



In [74]:
# Keep only the post body and the two regex-derived labels, then drop the
# reference to the wider frame.
df = submissions_age_gender.select('selftext', 'regex_age', 'regex_gender')
del submissions_age_gender

## Sentiment Model

In [None]:
# Name of the pretrained sentiment model loaded below.
MODEL_NAME = 'sentimentdl_use_twitter'

# Stage 1: wrap the raw `selftext` column as a Spark NLP document.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("selftext")
    .setOutputCol("document")
)

# Stage 2: Universal Sentence Encoder embeddings computed from the document.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: pretrained sentiment classifier that reads the sentence embeddings.
sentimentdl = (
    SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)

nlpPipeline = Pipeline(stages=[documentAssembler, use, sentimentdl])

In [16]:
# pyspark == 3.4.0 works, pyspark == 3.5.0 does not
# Fit the sentiment pipeline on `df`, then apply it to the same frame to add the
# pipeline's output columns (including `sentiment`).
pipelineModel = nlpPipeline.fit(df)
results = pipelineModel.transform(df)

In [17]:
# Replace the `sentiment` annotation column with its exploded `result` values
# (one row per predicted label), then keep the text, the labels and the sentiment.
results = results.withColumn('sentiment', F.explode(results['sentiment']['result']))
final_data = results.select(['selftext', 'regex_age', 'regex_gender', 'sentiment'])
final_data.show()

+--------------------+---------+------------+---------+
|            selftext|regex_age|regex_gender|sentiment|
+--------------------+---------+------------+---------+
|So my boyfriend(m...|       27|           f| negative|
|\nIn a new relati...|       21|           f|  neutral|
|So, I (21M) was a...|       21|           m| negative|
|I(25F) met my bf(...|       25|           f| negative|
|I (23m) shared a ...|       23|           m| negative|
|My boyfriend(32M)...|       23|           f| negative|
|So I live with an...|       35|           f| negative|
|I (25f) have been...|       25|           f| negative|
|So, I (24 F) am i...|       24|           f| negative|
|I’m on mobile so ...|       21|           f| negative|
|TDLR: can ex’s be...|       22|           f| negative|
|I (22m) have been...|       22|           m| negative|
|I (21f) am having...|       21|           f| negative|
|I (16f) am thinki...|       16|           f| negative|
|Hi everyone. I (2...|       21|           f| ne

### Save Data for ML

In [18]:
# Persist the sentiment-labelled posts for the ML stage.
# Overwrite mode keeps the cell re-runnable: without it the write fails once the
# target path already exists. This matches the CountVectorizer output write later on.
final_data.write.mode("overwrite").parquet(
    "s3a://project17-bucket-alex/matt-age-gender-sentiment"
)

                                                                                

### Save Data for Visualization

In [13]:
# Narrow to the three columns the visualization needs and mark the result for caching.
final_data = final_data.select(['regex_age', 'regex_gender', 'sentiment']).cache()

In [14]:
# Collect the results to pandas on the driver and write them to CSV for visualization.
final_data.toPandas().to_csv(
    '../../data/nlp-data/submission_age_gender_sentiment_new.csv',
    index=False,
)

                                                                                

## CountVectorizer on `relationship_advice`

In [75]:
# Preview the starting data (text plus age/gender labels) before any cleaning.
df.show(n=10)

+--------------------+---------+------------+
|            selftext|regex_age|regex_gender|
+--------------------+---------+------------+
|So my boyfriend(m...|       27|           f|
|\nIn a new relati...|       21|           f|
|So, I (21M) was a...|       21|           m|
|I(25F) met my bf(...|       25|           f|
|I (23m) shared a ...|       23|           m|
|My boyfriend(32M)...|       23|           f|
|So I live with an...|       35|           f|
|I (25f) have been...|       25|           f|
|So, I (24 F) am i...|       24|           f|
|I’m on mobile so ...|       21|           f|
+--------------------+---------+------------+
only showing top 10 rows



In [76]:
# Tag every row with an ID, then lowercase the text after stripping parentheses,
# braces, common punctuation, quotes, asterisks, newlines and digits.
# Digits ARE removed, so the result is letters/spaces plus any character not listed
# in the pattern (e.g. the curly apostrophe in "i’m" survives, as the output shows).
df = df.withColumn('mono_id', F.monotonically_increasing_id())
# Raw string: the backslash in `\n` reaches the regex engine unchanged and Python no
# longer sees invalid escape sequences such as `\(` or `\?`. Inside a character class
# the other symbols need no escaping, so the set of removed characters is unchanged.
df = df.withColumn(
    'selftext',
    F.lower(F.regexp_replace('selftext', r'''[(){},.:;'"?\n*0-9]''', '')),
)
df.show(10)

+--------------------+---------+------------+-------+
|            selftext|regex_age|regex_gender|mono_id|
+--------------------+---------+------------+-------+
|so my boyfriendm ...|       27|           f|      0|
|in a new relation...|       21|           f|      1|
|so i m was at a p...|       21|           m|      2|
|if met my bfm on ...|       25|           f|      3|
|i m shared a tikt...|       23|           m|      4|
|my boyfriendm and...|       23|           f|      5|
|so i live with an...|       35|           f|      6|
|i f have been ref...|       25|           f|      7|
|so i  f am in a r...|       24|           f|      8|
|i’m on mobile so ...|       21|           f|      9|
+--------------------+---------+------------+-------+
only showing top 10 rows



In [None]:

# Text-normalization pipeline: document -> tokens -> stop-word removal -> stems -> lemmas.
# CountVectorizer is not a stage here; it is fitted on the extracted `result` column afterwards.
documentAssembler = DocumentAssembler().setInputCol("selftext").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("tokenized")

# Load the pretrained stop-word list via the class itself, the same way the other
# pretrained stages are loaded, instead of constructing a throwaway instance first.
stopwords_cleaner = (
    StopWordsCleaner.pretrained("stopwords_iso", "en")
    .setInputCols(["tokenized"])
    .setOutputCol("cleaned")
)

stemmer = Stemmer().setInputCols(["cleaned"]).setOutputCol("stemmed")

# The lemmatizer consumes the stemmed tokens (not the cleaned ones).
lemmatizer = LemmatizerModel.pretrained().setInputCols(["stemmed"]).setOutputCol("lemmatized")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          stopwords_cleaner,
          stemmer,
          lemmatizer
      ])

In [30]:
# fit and transform the data using the pipeline
# `results` is `df` plus one annotation column per pipeline stage
# (document, tokenized, cleaned, stemmed, lemmatized).
pipelineModel = nlpPipeline.fit(df)
results = pipelineModel.transform(df)

In [54]:
# Pull just the `result` field out of the lemmatizer annotations into a new `result`
# column; the remaining annotation fields are not needed.
results = results.withColumn('result', F.col('lemmatized')['result'])
results.select(['selftext', 'result']).show(n=10)

+--------------------+--------------------+
|            selftext|              result|
+--------------------+--------------------+
|so my boyfriendm ...|[boyfriendm, f, d...|
|in a new relation...|[new, relationshi...|
|so i m was at a p...|[m, parti, night,...|
|if met my bfm on ...|[meet, bfm, cruis...|
|i m shared a tikt...|[m, share, tiktok...|
|my boyfriendm and...|[boyfriendm, date...|
|so i live with an...|[live, hous, yr, ...|
|i f have been ref...|[f, reflect, rece...|
|so i  f am in a r...|[f, relationship,...|
|i’m on mobile so ...|[i’m, mobil, sorr...|
+--------------------+--------------------+
only showing top 10 rows



In [32]:
# Vocabulary size cap passed to the CountVectorizer.
n_words = 500
countvectorizer = CountVectorizer(vocabSize=n_words, inputCol="result", outputCol="cv")

# Learn the vocabulary from the lemmatized tokens, then add the count vectors as `cv`.
fitted = countvectorizer.fit(results)
transformed = fitted.transform(results)

                                                                                

In [56]:
# Inspect the row ID, cleaned text, token list and count vector side by side.
transformed.select(['mono_id', 'selftext', 'result', 'cv']).show(n=10)

+-------+--------------------+--------------------+--------------------+
|mono_id|            selftext|              result|                  cv|
+-------+--------------------+--------------------+--------------------+
|      0|so my boyfriendm ...|[boyfriendm, f, d...|(500,[0,1,2,3,6,1...|
|      1|in a new relation...|[new, relationshi...|(500,[0,1,2,3,4,5...|
|      2|so i m was at a p...|[m, parti, night,...|(500,[2,4,6,7,15,...|
|      3|if met my bfm on ...|[meet, bfm, cruis...|(500,[1,2,3,4,7,9...|
|      4|i m shared a tikt...|[m, share, tiktok...|(500,[5,17,21,30,...|
|      5|my boyfriendm and...|[boyfriendm, date...|(500,[0,1,3,4,5,6...|
|      6|so i live with an...|[live, hous, yr, ...|(500,[0,1,2,3,4,6...|
|      7|i f have been ref...|[f, reflect, rece...|(500,[1,4,7,8,10,...|
|      8|so i  f am in a r...|[f, relationship,...|(500,[5,8,10,11,1...|
|      9|i’m on mobile so ...|[i’m, mobil, sorr...|(500,[0,2,3,4,6,7...|
+-------+--------------------+--------------------+

In [57]:
# Convert the `cv` vector column into a plain array<double> column (`cv_array`) so that
# individual word counts can be selected by position.
# The built-in `vector_to_array` replaces the former Python UDF
# (`v.toArray().tolist()`): same array-of-doubles output, but no hand-written
# per-row Python function is needed.
transformed_array = transformed.withColumn('cv_array', vector_to_array(F.col('cv')))
transformed_array.select('result', 'cv', 'cv_array').show(10)

[Stage 47:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|              result|                  cv|            cv_array|
+--------------------+--------------------+--------------------+
|[boyfriendm, f, d...|(500,[0,1,2,3,6,1...|[2.0, 3.0, 1.0, 1...|
|[new, relationshi...|(500,[0,1,2,3,4,5...|[1.0, 10.0, 1.0, ...|
|[m, parti, night,...|(500,[2,4,6,7,15,...|[0.0, 0.0, 1.0, 0...|
|[meet, bfm, cruis...|(500,[1,2,3,4,7,9...|[0.0, 1.0, 7.0, 2...|
|[m, share, tiktok...|(500,[5,17,21,30,...|[0.0, 0.0, 0.0, 0...|
|[boyfriendm, date...|(500,[0,1,3,4,5,6...|[4.0, 4.0, 0.0, 2...|
|[live, hous, yr, ...|(500,[0,1,2,3,4,6...|[9.0, 2.0, 3.0, 1...|
|[f, reflect, rece...|(500,[1,4,7,8,10,...|[0.0, 1.0, 0.0, 0...|
|[f, relationship,...|(500,[5,8,10,11,1...|[0.0, 0.0, 0.0, 0...|
|[i’m, mobil, sorr...|(500,[0,2,3,4,6,7...|[2.0, 0.0, 1.0, 2...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [58]:
# grab the sorted vocabulary from the CV model
# Position i in this list corresponds to index i of the `cv` / `cv_array` columns.
# NOTE(review): presumably ordered by descending corpus frequency - confirm against the Spark docs.
top_n_words = fitted.vocabulary
top_n_words[:10]

['feel',
 'like',
 'want',
 'time',
 'know',
 'tell',
 'friend',
 'think',
 'relationship',
 'thing']

In [59]:
# Build a wide frame: the ID and label columns followed by one column per vocabulary
# word, each holding that word's count within the post (taken from cv_array by position).
word_counts_df = transformed_array.select(
    ['mono_id', 'regex_age', 'regex_gender']
    + [F.col("cv_array")[idx].alias(word) for idx, word in enumerate(top_n_words)]
)

In [62]:
# Take a 10-row sample restricted to the labels and the first ten vocabulary words,
# mark it for caching, and display it.
word_counts_df_sample = (
    word_counts_df
    .select('regex_age', 'regex_gender', *top_n_words[:10])
    .limit(10)
    .cache()
)
word_counts_df_sample.show()



+---------+------------+----+----+----+----+----+----+------+-----+------------+-----+
|regex_age|regex_gender|feel|like|want|time|know|tell|friend|think|relationship|thing|
+---------+------------+----+----+----+----+----+----+------+-----+------------+-----+
|       27|           f| 2.0| 3.0| 1.0| 1.0| 0.0| 0.0|   1.0|  0.0|         0.0|  0.0|
|       21|           f| 1.0|10.0| 1.0| 2.0| 2.0| 2.0|   0.0|  3.0|         1.0|  1.0|
|       21|           m| 0.0| 0.0| 1.0| 0.0| 1.0| 0.0|   2.0|  1.0|         0.0|  0.0|
|       25|           f| 0.0| 1.0| 7.0| 2.0| 2.0| 0.0|   0.0|  1.0|         0.0|  1.0|
|       23|           m| 0.0| 0.0| 0.0| 0.0| 0.0| 1.0|   0.0|  0.0|         0.0|  0.0|
|       23|           f| 4.0| 4.0| 0.0| 2.0| 2.0| 3.0|   4.0|  1.0|         0.0|  3.0|
|       35|           f| 9.0| 2.0| 3.0| 1.0| 2.0| 0.0|   2.0|  1.0|         2.0|  1.0|
|       25|           f| 0.0| 1.0| 0.0| 0.0| 1.0| 0.0|   0.0|  2.0|         2.0|  0.0|
|       24|           f| 0.0| 0.0| 0.0| 0.0

                                                                                

In [63]:
# Write the small sample to CSV so it can be used for visualization later.
word_counts_df_sample.toPandas().to_csv(
    '../../data/nlp-data/age-gender-cv-sample.csv',
    index=False,
)

## CountVectorizer on Full DF

In [6]:
%%time
# Read in data from project bucket
bucket = "project17-bucket-alex"
#output_prefix_data = "project_2022"

# List of 12 directories each containing 1 month of data
directories = ["project_2022_" + str(i) + "/submissions" for i in range(1, 13)]

# Iterate through 12 directories and merge each monthly data set to create one big data set
submissions = None
for directory in directories:
    s3_path = f"s3a://{bucket}/{directory}"
    month_df = spark.read.parquet(s3_path, header = True)
    
    if submissions is None:
        submissions = month_df
    else:
        submissions = submissions.union(month_df)

23/11/26 15:32:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
23/11/26 15:32:24 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CPU times: user 27.9 ms, sys: 3.15 ms, total: 31.1 ms
Wall time: 13.7 s


In [7]:
# Preview the main columns of the combined submissions data.
submissions.select(
    'subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments'
).show(n=10)

[Stage 12:>                                                         (0 + 1) / 1]

+-----------------+-------------------+--------------------+--------------------+-------------------+------------+
|        subreddit|             author|               title|            selftext|        created_utc|num_comments|
+-----------------+-------------------+--------------------+--------------------+-------------------+------------+
|NoStupidQuestions|          [deleted]|Who do you call w...|           [deleted]|2022-01-22 18:14:03|           4|
|    AmItheAsshole|          [deleted]|AITA for blowing ...|           [removed]|2022-01-22 18:14:04|           7|
|    AmItheAsshole|       go_awaythrow|AITA if I cut my ...|           [removed]|2022-01-22 18:14:12|           1|
|NoStupidQuestions|          [deleted]|   [deleted by user]|           [removed]|2022-01-22 18:14:16|           1|
|           AskMen|          [deleted]|Do men actually l...|           [removed]|2022-01-22 18:14:21|           1|
|         antiwork|        Vivid_Steel|For Those of You ...|In most states in...

                                                                                

In [8]:
# Drop posts whose body is a deleted/removed placeholder or an empty string.
invalid_submissions = ['[deleted]', '[removed]', '']
df = submissions.where(~col('selftext').isin(invalid_submissions))
df.select(
    'subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments'
).show(n=10)

+-------------------+--------------------+--------------------+--------------------+-------------------+------------+
|          subreddit|              author|               title|            selftext|        created_utc|num_comments|
+-------------------+--------------------+--------------------+--------------------+-------------------+------------+
|           antiwork|         Vivid_Steel|For Those of You ...|In most states in...|2022-01-22 18:14:28|           1|
|   unpopularopinion| ballonfightaddicted|Waking up 15-30 m...|I like waking up ...|2022-01-22 18:15:24|           5|
|      AmItheAsshole|       geosunsetmoth|AITA for refusing...|I (NB 19) am auti...|2022-01-22 18:15:30|         425|
|  NoStupidQuestions|           Killdreth|Can I do anything...|I don’t know why,...|2022-01-22 18:15:45|           2|
|     TrueOffMyChest|          sadness_18|I hate people who...|I've been called ...|2022-01-22 18:15:46|           5|
|relationship_advice|  Natural_Rabbit8936|I went thru my

                                                                                

In [9]:
# Tag every row with an ID, then lowercase the text after stripping parentheses,
# braces, common punctuation, quotes, asterisks, newlines and digits.
# Digits ARE removed, so the result is letters/spaces plus any character not listed
# in the pattern (e.g. curly apostrophes and hyphens survive, as later outputs show).
df = df.withColumn('mono_id', F.monotonically_increasing_id())
# Raw string: the backslash in `\n` reaches the regex engine unchanged and Python no
# longer sees invalid escape sequences such as `\(` or `\?`. Inside a character class
# the other symbols need no escaping, so the set of removed characters is unchanged.
df = df.withColumn(
    'selftext',
    F.lower(F.regexp_replace('selftext', r'''[(){},.:;'"?\n*0-9]''', '')),
)
df.select(['subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments']).show(10)

+-------------------+--------------------+--------------------+--------------------+-------------------+------------+
|          subreddit|              author|               title|            selftext|        created_utc|num_comments|
+-------------------+--------------------+--------------------+--------------------+-------------------+------------+
|           antiwork|         Vivid_Steel|For Those of You ...|in most states in...|2022-01-22 18:14:28|           1|
|   unpopularopinion| ballonfightaddicted|Waking up 15-30 m...|i like waking up ...|2022-01-22 18:15:24|           5|
|      AmItheAsshole|       geosunsetmoth|AITA for refusing...|i nb  am autistic...|2022-01-22 18:15:30|         425|
|  NoStupidQuestions|           Killdreth|Can I do anything...|i don’t know why ...|2022-01-22 18:15:45|           2|
|     TrueOffMyChest|          sadness_18|I hate people who...|ive been called c...|2022-01-22 18:15:46|           5|
|relationship_advice|  Natural_Rabbit8936|I went thru my

In [None]:

# Text-normalization pipeline for the full data set:
# document -> tokens -> stop-word removal -> stems -> lemmas.
# CountVectorizer is not a stage here; it is fitted on the extracted `result` column afterwards.
documentAssembler = DocumentAssembler().setInputCol("selftext").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("tokenized")

# Load the pretrained stop-word list via the class itself rather than through a
# throwaway instance.
stopwords_cleaner = (
    StopWordsCleaner.pretrained("stopwords_iso", "en")
    .setInputCols(["tokenized"])
    .setOutputCol("cleaned")
)

stemmer = Stemmer().setInputCols(["cleaned"]).setOutputCol("stemmed")

# The lemmatizer consumes the stemmed tokens (not the cleaned ones).
lemmatizer = LemmatizerModel.pretrained().setInputCols(["stemmed"]).setOutputCol("lemmatized")

nlpPipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          stopwords_cleaner,
          stemmer,
          lemmatizer
      ])

In [11]:
# fit and transform the data using the pipeline
# `results` is `df` plus one annotation column per pipeline stage
# (document, tokenized, cleaned, stemmed, lemmatized).
pipelineModel = nlpPipeline.fit(df)
results = pipelineModel.transform(df)



In [12]:
# Pull just the `result` field out of the lemmatizer annotations into a new `result`
# column; the remaining annotation fields are not needed.
results = results.withColumn('result', F.col('lemmatized')['result'])
results.select(['selftext', 'result']).show(n=10)

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|            selftext|              result|
+--------------------+--------------------+
|in most states in...|[state, countri, ...|
|i like waking up ...|[like, wake, -, m...|
|i nb  am autistic...|[nb, autist, thin...|
|i don’t know why ...|[don’t, know, rea...|
|ive been called c...|[iv, call, close,...|
|but i’m still in ...|         [i’m, love]|
|apologies if this...|[apologi, mess, i...|
|i never really un...|[stand, shoot, go...|
|is it if  she smi...|[smile, lot, laug...|
|throwaway because...|[throwawai, boyfr...|
+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [13]:
# Vocabulary size cap passed to the CountVectorizer.
n_words = 500
countvectorizer = CountVectorizer(vocabSize=n_words, inputCol="result", outputCol="cv")

# Learn the vocabulary from the lemmatized tokens, then add the count vectors as `cv`.
fitted = countvectorizer.fit(results)
transformed = fitted.transform(results)

                                                                                

In [14]:
# Inspect the row ID, subreddit, cleaned text, token list and count vector side by side.
transformed.select(['mono_id', 'subreddit', 'selftext', 'result', 'cv']).show(n=10)

[Stage 23:>                                                         (0 + 1) / 1]

+-------+-------------------+--------------------+--------------------+--------------------+
|mono_id|          subreddit|            selftext|              result|                  cv|
+-------+-------------------+--------------------+--------------------+--------------------+
|      0|           antiwork|in most states in...|[state, countri, ...|(500,[0,2,4,11,12...|
|      1|   unpopularopinion|i like waking up ...|[like, wake, -, m...|(500,[0,1,8,35,43...|
|      2|      AmItheAsshole|i nb  am autistic...|[nb, autist, thin...|(500,[0,1,2,4,6,7...|
|      3|  NoStupidQuestions|i don’t know why ...|[don’t, know, rea...|(500,[0,1,3,5,8,1...|
|      4|     TrueOffMyChest|ive been called c...|[iv, call, close,...|(500,[2,3,5,16,22...|
|      5|relationship_advice|but i’m still in ...|         [i’m, love]|(500,[15,26],[1.0...|
|      6|relationship_advice|apologies if this...|[apologi, mess, i...|(500,[1,3,5,7,9,1...|
|      7|  NoStupidQuestions|i never really un...|[stand, shoot, go...

                                                                                

In [15]:
# Convert the `cv` vector column into a plain array<double> column (`cv_array`) so that
# individual word counts can be selected by position.
# The built-in `vector_to_array` replaces the former Python UDF
# (`v.toArray().tolist()`): same array-of-doubles output, but no hand-written
# per-row Python function is needed.
transformed_array = transformed.withColumn('cv_array', vector_to_array(F.col('cv')))
transformed_array.select('result', 'cv', 'cv_array').show(10)

[Stage 24:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+
|              result|                  cv|            cv_array|
+--------------------+--------------------+--------------------+
|[state, countri, ...|(500,[0,2,4,11,12...|[1.0, 0.0, 1.0, 0...|
|[like, wake, -, m...|(500,[0,1,8,35,43...|[2.0, 1.0, 0.0, 0...|
|[nb, autist, thin...|(500,[0,1,2,4,6,7...|[2.0, 2.0, 2.0, 0...|
|[don’t, know, rea...|(500,[0,1,3,5,8,1...|[1.0, 1.0, 0.0, 1...|
|[iv, call, close,...|(500,[2,3,5,16,22...|[0.0, 0.0, 3.0, 1...|
|         [i’m, love]|(500,[15,26],[1.0...|[0.0, 0.0, 0.0, 0...|
|[apologi, mess, i...|(500,[1,3,5,7,9,1...|[0.0, 1.0, 0.0, 1...|
|[stand, shoot, go...|(500,[3,16,33,82,...|[0.0, 0.0, 0.0, 1...|
|[smile, lot, laug...|(500,[39,439],[1....|[0.0, 0.0, 0.0, 0...|
|[throwawai, boyfr...|(500,[2,3,4,5,11,...|[0.0, 0.0, 1.0, 5...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [16]:
# grab the sorted vocabulary from the CV model
# Position i in this list corresponds to index i of the `cv` / `cv_array` columns.
# NOTE(review): presumably ordered by descending corpus frequency - confirm against the Spark docs.
top_n_words = fitted.vocabulary
top_n_words[:10]

['like',
 'feel',
 'want',
 'know',
 'time',
 'tell',
 'get',
 'im',
 'think',
 'friend']

In [18]:
# Build a wide frame: every column except the intermediate NLP/vector columns listed
# in `remove_cols`, followed by one `word_<term>` column per vocabulary term holding
# that term's count within the post. `cv_array` is not in `remove_cols`, so it is kept.
remove_cols = ['document', 'tokenized', 'cleaned', 'stemmed', 'lemmatized', 'result', 'cv']
keep_cols = [name for name in transformed_array.columns if name not in remove_cols]

word_counts_df = transformed_array.select(
    keep_cols
    + [F.col("cv_array")[idx].alias('word_' + term) for idx, term in enumerate(top_n_words)]
)

In [113]:
# Take a 10-row sample restricted to the subreddit and the first ten word columns,
# mark it for caching, and display it.
show_cols = ['subreddit', *('word_' + word for word in top_n_words[:10])]
word_counts_df_sample = (
    word_counts_df
    .select(show_cols)
    .limit(10)
    .cache()
)
word_counts_df_sample.show()

+-------------------+---------+---------+---------+---------+---------+---------+--------+-------+----------+-----------+
|          subreddit|word_like|word_feel|word_want|word_know|word_time|word_tell|word_get|word_im|word_think|word_friend|
+-------------------+---------+---------+---------+---------+---------+---------+--------+-------+----------+-----------+
|           antiwork|      1.0|      0.0|      1.0|      0.0|      2.0|      0.0|     0.0|    0.0|       0.0|        0.0|
|   unpopularopinion|      2.0|      1.0|      0.0|      0.0|      0.0|      0.0|     0.0|    0.0|       1.0|        0.0|
|      AmItheAsshole|      2.0|      2.0|      2.0|      0.0|      1.0|      0.0|     1.0|    4.0|       4.0|        0.0|
|  NoStupidQuestions|      1.0|      1.0|      0.0|      1.0|      0.0|      1.0|     0.0|    0.0|       3.0|        0.0|
|     TrueOffMyChest|      0.0|      0.0|      3.0|      1.0|      0.0|      1.0|     0.0|    0.0|       0.0|        0.0|
|relationship_advice|   

23/11/26 00:07:01 WARN CacheManager: Asked to cache already cached data.


In [114]:
# Write the small sample to CSV so it can be used for visualization later.
word_counts_df_sample.toPandas().to_csv(
    '../../data/nlp-data/full-df-cv-sample.csv',
    index=False,
)

### Save DF for later ML Use

In [20]:
# Persist the full word-count frame for the ML stage, replacing any previous output.
word_counts_df.write.parquet(
    "s3a://project17-bucket-alex/matt-submissions-cv",
    mode="overwrite",
)

                                                                                

In [21]:
# Shape check: (number of rows, number of columns) of the saved frame.
(word_counts_df.count(), len(word_counts_df.columns))

                                                                                

(977181, 570)