In [1]:
# !pip3 install nb_black==1.0.7 nltk==3.6.6 jupyterlab_pygments==0.1.2

In [2]:
%%time
!pip freeze | grep -E 'boto3|black|sagemaker|sagemaker_pyspark|jupyter|jupyterlab|pandas|pyspark|nltk'

black @ file:///home/conda/feedstock_root/build_artifacts/black-recipe_1599478779128/work
boto3==1.20.25
hdijupyterutils==0.19.1
jupyter @ file:///home/conda/feedstock_root/build_artifacts/jupyter_1611871900595/work
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1610375432619/work
jupyter-console==6.4.0
jupyter-core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1612125253553/work
jupyter-packaging @ file:///home/conda/feedstock_root/build_artifacts/jupyter-packaging_1613054948399/work
jupyter-server==1.13.0
jupyterlab==3.2.4
jupyterlab-launcher==0.13.1
jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1601375948261/work
jupyterlab-server @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_server_1613760084674/work
jupyterlab-widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1609173350931/work
nb-black==1.0.7
nltk==3.6.6
pandas==1.1.5
pyls-black @ file:

In [3]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [4]:
import os
from typing import List

import boto3
import nltk
import sagemaker_pyspark
from pyspark import SparkConf, keyword_only
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import (
    CountVectorizer,
    IDF,
    RegexTokenizer,
    StopWordsRemover,
    Tokenizer,
)
from pyspark.ml.param.shared import (
    HasInputCol,
    HasOutputCol,
    Param,
    Params,
    TypeConverters,
)
from pyspark.ml.linalg import SparseVector, Vectors
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import Column, SparkSession, functions as F, types as T
from pyspark.sql.dataframe import DataFrame as pdf

In [5]:
# # not used
# import dask.dataframe as dd
# import pandas as pd

# used for display purposes only
from pandas import DataFrame as pd_DataFrame, option_context as pd_option_context

## User Inputs

In [6]:
# S3
s3_bucket_name = "sagemakertestwillz3s"
path_to_folder = "/datasets/twitter/kinesis-demo/"

# Data processing
all_cols_to_process = ["created_at", "user_joined", "reviewText"]

In [7]:
def show_pyspark_df(df: pdf, nrows: int = 5) -> pd_DataFrame:
    """Display the first n rows of a PySpark DataFrame as a Pandas DataFrame."""
    return df.limit(nrows).toPandas()


def get_existing_csv_files_list(s3_bucket_name: str, prefix: str) -> List[str]:
    """Get list of files in subfolder in S3 bucket."""
    s3_resource = boto3.resource("s3")
    bucket = s3_resource.Bucket(s3_bucket_name)
    files_found_objects_list = list(bucket.objects.filter(Prefix=prefix))
    files_found_names_list = [w.key for w in files_found_objects_list]
    return files_found_names_list


def remove_punctuation(column_obj: Column) -> Column:
    """Removes punctuation from a DataFrame text column."""
    return F.regexp_replace(column_obj, "\\p{Punct}", "")


def remove_lead_trail_spaces(column_obj: Column) -> Column:
    """Removes leading and trailing spaces from a DataFrame text column."""
    return F.trim(column_obj)


def replace_multiple_spaces(column_obj: Column) -> Column:
    """Replace multiple spaces with a single space."""
    return F.regexp_replace(column_obj, r"\s+", " ")

Download NLTK stopwords, if not previously done

In [8]:
%%time
if not os.path.isdir(
    os.path.join(os.path.expanduser("~"), "nltk_data", "corpora", "stopwords")
):
    nltk.download("stopwords")

from nltk.corpus import stopwords

all_stopwords = stopwords.words("english")

CPU times: user 1.46 ms, sys: 0 ns, total: 1.46 ms
Wall time: 1.23 ms


## PySpark Setup

In [9]:
%%time
conf = (SparkConf()
        .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars())))

CPU times: user 674 µs, sys: 139 µs, total: 813 µs
Wall time: 809 µs


Start a Spark session

In [10]:
%%time
spark = (
    SparkSession
    .builder
    .config(conf=conf) \
    .appName("schema_test")
    .getOrCreate()
)

CPU times: user 21.1 ms, sys: 5.39 ms, total: 26.5 ms
Wall time: 3.62 s


## Load Data

### Get List of S3 CSV Data Files

Get a list of all the CSV files containing the tweets data (files with a prefix `tweets_*.csv`), and not the metadata (prefix `tweets_metadata_*.csv`), from `csvs/` folder in the S3 bucket path at `<bucket-name>/datasets/twitter/kinesis-demo/`

In [11]:
%%time
existing_csv_files_list = get_existing_csv_files_list(
    s3_bucket_name, path_to_folder[1:] + "csvs/tweets_"
)
files_csvs_list = [f for f in existing_csv_files_list if "metadata" not in f]
files_csvs_list

CPU times: user 82 ms, sys: 15.6 ms, total: 97.5 ms
Wall time: 186 ms


['datasets/twitter/kinesis-demo/csvs/tweets_15_hc2021123017_s20211230133054.csv']

### Load all CSV Files into Single PySpark `DataFrame`

Read all CSV files from the `csvs/` in the S3 bucket path at `<bucket-name>/datasets/twitter/kinesis-demo/` into a PySpark `DataFrame`

In [12]:
%%time
df = spark.read.csv(
    [f's3a://{s3_bucket_name}' + f"/{f}" for f in files_csvs_list],
    header=True,
    inferSchema=True
).withColumnRenamed("text", "reviewText")

CPU times: user 2.95 ms, sys: 598 µs, total: 3.55 ms
Wall time: 5.41 s


Get the number of rows (retrieved tweets) in the data

In [13]:
print(f"Raw data contains {df.count():,} rows and {len(df.columns):,} columns")

Raw data contains 2,055 rows and 54 columns


Show the first 4 rows from the PySpark `DataFrame`

In [22]:
with pd_option_context("display.max_columns", 100):
    display(show_pyspark_df(df, 4))

Unnamed: 0,id,geo,coordinates,place,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,created_at,source,in_reply_to_user_id,in_reply_to_screen_name,source_text,place_id,place_url,place_place_type,place_name,place_full_name,place_country_code,place_country,place_bounding_box_type,place_bounding_box_coordinates,place_attributes,coords_type,coords_lon,coords_lat,geo_type,geo_lon,geo_lat,user_name,user_screen_name,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,user_contributors_enabled,user_joined,user_location,retweeted_tweet,tweet_text_urls,tweet_text_hashtags,tweet_text_usernames,num_urls_in_tweet_text,num_users_in_tweet_text,num_hashtags_in_tweet_text,reviewText,file_name
0,1476607990027407364,,,,,False,0,0,0,0,False,False,2021-12-30 17:35:53+00:00,"""<a href=""""http://twitter.com/download/iphone""...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Happy New Years 2022,GuessWho122021,216,255,0,1165,3214,False,False,False,2021-11-27 20:38:14+00:00,World Wide,no,https://t.co/yH8cBVuMj3,,,1,0,0,"Space colonists may turn to cannibalism, scien...",twitter_delivery_stream-1-2021-12-30-17-35-58-...
1,1476607992586018824,,,,,True,0,0,0,0,False,False,2021-12-30 17:35:54+00:00,"""<a href=""""http://twitter.com/download/iphone""...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,TheToysRusKid,JC1of1,202,313,0,18454,53653,False,False,False,2018-09-04 17:18:46+00:00,Southern California,no,,,,0,0,0,1. Beat Street2. Harlem Knights 3. National La...,twitter_delivery_stream-1-2021-12-30-17-35-58-...
2,1476607997405184000,,,,,False,0,0,0,0,False,False,2021-12-30 17:35:55+00:00,"""<a href=""""http://publicize.wp.com/"""" rel=""""no...",,,WordPress.com,,,,,,,,,[[]],{},,,,,,,BCABA Network,BcabaNetwork,2581,4710,42,3027,238152,False,False,False,2016-05-25 10:54:46+00:00,"West Midlands, England",no,https://t.co/kPAMK3TxSL,,,1,0,0,NASA Plans Coverage of Webb Space Telescope De...,twitter_delivery_stream-1-2021-12-30-17-35-58-...
3,1476607997925330945,,,,,False,0,0,0,0,False,False,2021-12-30 17:35:55+00:00,"""<a href=""""http://twitter.com/download/iphone""...",,,Twitter for iPhone,,,,,,,,,[[]],{},,,,,,,Jude Jackson 💙 Solidarity with #NHS workers,JudeJack,2138,4999,157,79973,109065,False,False,False,2009-02-21 14:48:52+00:00,International,no,https://t.co/cRcHiFCHS0,,,1,0,0,"Exciting times!Back in 1915, Einstein publishe...",twitter_delivery_stream-1-2021-12-30-17-35-58-...


Get the schema for the PySpark `DataFrame`

In [15]:
# print(df.printSchema())

root
 |-- id: long (nullable = true)
 |-- geo: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- place: string (nullable = true)
 |-- contributors: string (nullable = true)
 |-- is_quote_status: boolean (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- favorited: boolean (nullable = true)
 |-- retweeted: boolean (nullable = true)
 |-- created_at: string (nullable = true)
 |-- source: string (nullable = true)
 |-- in_reply_to_user_id: string (nullable = true)
 |-- in_reply_to_screen_name: string (nullable = true)
 |-- source_text: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- place_url: string (nullable = true)
 |-- place_place_type: string (nullable = true)
 |-- place_name: string (nullable = true)
 |-- place_full_name: string (nullable = true)
 |-- place_country_code: string (nullable = 

## Data Processing

For processing the data, text and non-text columns will be treated separately.

### Processing Non-Text Columns

We'll define a PySparkML pipeline ([v2.4.0](https://spark.apache.org/docs/2.4.0/ml-pipeline.html#pipeline), [latest version](https://spark.apache.org/docs/latest/ml-pipeline.html), [API for latest version](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.Pipeline.html)) to process all useful non-text columns from the data. This pipeline will accept a list of all useful columns and return a DataFrame with the same input columns and the processed versions (a suffix will be added to the column name to indicate that it is has been processed and keep it separate from the raw data). As an example, the column `created_at` will be converted from a string into a datetime datatype, and the converted version of this column will be returned as `created_at_dt`.

The following processing steps will be applied
- drop rows with duplicated tweets (retweets) since the text of the tweet (which will be used in NLP) is repeated
  - for analysing text data, we don't need multiple observations (rows) with the same text (re-tweets)
    - such rows are only useful when exploring the data after the text-based analysis has been completed
  - so, for NLP analysis, the duplicated (re-tweets) rows in the text column can be dropped
  - based on how data was collected using `twitter_s3.py`, an example of a retweet is
    - (row 10) `retweeted_tweet = 'no'` and text column = `'text here'`
    - (row 91) `retweeted_tweet = 'yes'` and text column = `'text here'`
    - (row 201) `retweeted_tweet = 'yes'` and text column = `'text here'`

    where we only need the first tweet (row 10) and so we can drop all rows corresponding to retweets
- drop rows where the tweet is missing or contains an empty string (if any)
- convert the following columns from the `string` datatype to `datetime`s
  - `created_at` (date and time when the tweet was posted)
  - `user_joined` (date and time when user joined Twitter)

Apply the non-text processing pipeline to process all the useful non-text columns from the data

In [16]:
%%time
# Select the columns to be processed, including the text column
df_processed = df.select(all_cols_to_process)

# Drop duplicates in the text column and remove and rows where the text is a blank string
df_processed = df_processed.dropDuplicates(subset=["reviewText"]).filter(df["reviewText"] != '')

# Drop rows with a missing value in the text column
df_processed = df_processed.na.drop(subset=["reviewText"])

# Apply datetime formatting for the two datetime columns
for c in ["created_at", "user_joined"]:
    df_processed = df_processed.withColumn(
        f"{c}_dt",
        F.to_timestamp(F.col(c), "yyyy-MM-dd HH:mm:ss"),
    )

print(df_processed.count())
show_pyspark_df(df_processed, 7)

1081
CPU times: user 12.8 ms, sys: 4.24 ms, total: 17.1 ms
Wall time: 2.39 s


Unnamed: 0,created_at,user_joined,reviewText,created_at_dt,user_joined_dt
0,2021-12-30 17:41:12+00:00,2021-08-21 13:39:34+00:00,I love you NASA,2021-12-30 17:41:12,2021-08-21 13:39:34
1,2021-12-30 17:48:32+00:00,2019-03-20 14:31:32+00:00,The X on my back is so large NASA can see it...,2021-12-30 17:48:32,2019-03-20 14:31:32
2,2021-12-30 17:47:42+00:00,2015-04-29 21:00:18+00:00,Moon and Uranus !,2021-12-30 17:47:42,2015-04-29 21:00:18
3,2021-12-30 17:43:37+00:00,2009-07-10 15:05:09+00:00,""" Church people be like """"we all know who cre...",2021-12-30 17:43:37,2009-07-10 15:05:09
4,2021-12-30 17:40:18+00:00,2021-02-23 00:52:38+00:00,Blue Origin is looking to hire a Work Center S...,2021-12-30 17:40:18,2021-02-23 00:52:38
5,2021-12-30 17:38:10+00:00,2021-06-05 07:13:05+00:00,I support this project good luck,2021-12-30 17:38:10,2021-06-05 07:13:05
6,2021-12-30 17:41:02+00:00,2009-03-22 15:14:18+00:00,This account is worth a follow,2021-12-30 17:41:02,2009-03-22 15:14:18


The non-text data processing is now ready and we can proceed to preparing the text data column for quantitative analysis.

### Processing Text Data

We'll now process the text data column. All the processed data columns from the previous section, including the text column, will be retained. However, here, we will only be processing the text column from this data.

#### Cleaning Text Data

Since we are looking to build up a useful text vocabulary to perform NLP tasks on, we'll first perform the following text cleaning steps
- change text to lowercase
- remove numbers
- remove punctuation
- replace multiple whitespaces with a single whitespace
- remove leading and trailing whitespace

The second and third of these steps will eliminate non-text components of the data in the text (tweet text) column. The last two of these steps will help with [tokenization](https://neptune.ai/blog/tokenization-in-nlp) during the NLP data preparation step (done in the next sub-section).

In [18]:
# Change text to lowercase
df_processed = df_processed.select(
    all_cols_to_process + ["created_at_dt", "user_joined_dt"] + [F.lower(F.col("reviewText")).alias("reviewText_processed")]
)

# Remove numbers
df_processed.select(
    all_cols_to_process
    + [
        F.regexp_replace(F.col("reviewText_processed"), "\d+", "").alias(
            "reviewText_processed"
        )
    ]
)

# Remove punctuation
df_processed = df_processed.select(
    all_cols_to_process
    + [remove_punctuation(F.col("reviewText_processed")).alias("reviewText_processed")]
)

# Replace multiple whitespaces with a single whitespace
df_processed = df_processed.select(
    all_cols_to_process
    + [
        replace_multiple_spaces(F.col("reviewText_processed")).alias(
            "reviewText_processed"
        )
    ]
)

# Remove leading and trailing spaces
df_processed = df_processed.select(
    all_cols_to_process
    + [
        remove_lead_trail_spaces(F.col("reviewText_processed")).alias(
            "reviewText_processed"
        )
    ]
)
print(df_processed.count())
show_pyspark_df(df_processed, 7)

1081


Unnamed: 0,created_at,user_joined,reviewText,reviewText_processed
0,2021-12-30 17:41:12+00:00,2021-08-21 13:39:34+00:00,I love you NASA,i love you nasa
1,2021-12-30 17:48:32+00:00,2019-03-20 14:31:32+00:00,The X on my back is so large NASA can see it...,the x on my back is so large nasa can see it l...
2,2021-12-30 17:47:42+00:00,2015-04-29 21:00:18+00:00,Moon and Uranus !,moon and uranus
3,2021-12-30 17:43:37+00:00,2009-07-10 15:05:09+00:00,""" Church people be like """"we all know who cre...",church people be like we all know who created ...
4,2021-12-30 17:40:18+00:00,2021-02-23 00:52:38+00:00,Blue Origin is looking to hire a Work Center S...,blue origin is looking to hire a work center s...
5,2021-12-30 17:38:10+00:00,2021-06-05 07:13:05+00:00,I support this project good luck,i support this project good luck
6,2021-12-30 17:41:02+00:00,2009-03-22 15:14:18+00:00,This account is worth a follow,this account is worth a follow


#### NLP on Cleaned Text Data

Now, we'll apply an NLP pipeline to extract features from the cleaned text data. This pipeline will consist of the following three steps
- tokenization
  - here we will restrict the minimum token length that we will accept using the `minTokenLength` key word
    - this is a hyperparameter of the NLP pipeline that can be tuned during future versions of this analysis
- removal of stop words
  - these are frequently occurring words that won't offer any useful information
- vectorization
  - this is the process of associating words or phrases from a text vocabulary to a real-valued vector
  - there are several approaches to vectorization, but we will restrict ourselves to TFIDF vectorization ([1](https://openclassrooms.com/en/courses/6532301-introduction-to-natural-language-processing/7067116-apply-the-tf-idf-vectorization-approach), [2](https://monkeylearn.com/blog/what-is-tf-idf/))
    - briefly, the disadvantage of the TFIDF technique is that the same words in two different vocabularies will produce different vector representations depending on the corpus being analysed
    - in PySpark this can be done using a combination of a `CountVectorizer` ([link](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.CountVectorizer.html)) and `IDF` ([link](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.IDF.html)) classes from the `pyspark.ml` module
      - `CountVectorizer` has three particularly useful hyperparameters `minDF`, `maxDF` and `vocabSize` that could be extensively tuned in future versions of this analysis

In [19]:
# Tokenization
tokenizer = RegexTokenizer(
    minTokenLength=3,
    inputCol="reviewText_processed",
    outputCol="tokens",
    toLowercase=True,
    pattern="\\s+",  # default (https://stackoverflow.com/a/13750765/4057186)
    # pattern="\\W",  # keep words
)

# Removal of Stop Words
remover = StopWordsRemover(
    inputCol="tokens", outputCol="tokens_no_stopwords", stopWords=all_stopwords
)

# TFIDF Vectorization
count_vec_params = dict(
    inputCol="tokens_no_stopwords",
    outputCol="rawFeatures",
    vocabSize=262144,  # default = 262144
    minDF=5,  # if this is a float, ignores terms with document freq less than this fraction; default = 1.0
    maxDF=0.75,  # if this is a float, ignores tokens with a document freq greater than this fraction; default = 9223372036854775807
)
count_vectorizer = CountVectorizer(**count_vec_params)
idf = IDF(minDocFreq=0, inputCol="rawFeatures", outputCol="1gram_idfv")
tfidf_vectorizer = Pipeline(stages=[count_vectorizer, idf])

assembler = VectorAssembler(inputCols=["1gram_idf"], outputCol="features")

# Combined text processing pipeline
pipe = Pipeline(stages=[tokenizer, remover, tfidf_vectorizer])

Apply the text processing pipeline to process the text column from the data

In [20]:
%%time
pipe_trained = pipe.fit(df_processed)
df_text_processed = pipe_trained.transform(df_processed)
print(df_text_processed.count())
show_pyspark_df(df_text_processed, 10)

1081
CPU times: user 208 ms, sys: 20.9 ms, total: 229 ms
Wall time: 5.91 s


Unnamed: 0,created_at,user_joined,reviewText,reviewText_processed,tokens,tokens_no_stopwords,rawFeatures,features
0,2021-12-30 17:41:12+00:00,2021-08-21 13:39:34+00:00,I love you NASA,i love you nasa,"[love, you, nasa]","[love, nasa]","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.443271677136423, 0.0, 0.0, 0.0, 0..."
1,2021-12-30 17:48:32+00:00,2019-03-20 14:31:32+00:00,The X on my back is so large NASA can see it...,the x on my back is so large nasa can see it l...,"[the, back, large, nasa, can, see, love, you]","[back, large, nasa, see, love]","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.443271677136423, 0.0, 0.0, 0.0, 0..."
2,2021-12-30 17:47:42+00:00,2015-04-29 21:00:18+00:00,Moon and Uranus !,moon and uranus,"[moon, and, uranus]","[moon, uranus]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2021-12-30 17:43:37+00:00,2009-07-10 15:05:09+00:00,""" Church people be like """"we all know who cre...",church people be like we all know who created ...,"[church, people, like, all, know, who, created...","[church, people, like, know, created, universe]","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 2.8756925952331156, ..."
4,2021-12-30 17:40:18+00:00,2021-02-23 00:52:38+00:00,Blue Origin is looking to hire a Work Center S...,blue origin is looking to hire a work center s...,"[blue, origin, looking, hire, work, center, sc...","[blue, origin, looking, hire, work, center, sc...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,2021-12-30 17:38:10+00:00,2021-06-05 07:13:05+00:00,I support this project good luck,i support this project good luck,"[support, this, project, good, luck]","[support, project, good, luck]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,2021-12-30 17:41:02+00:00,2009-03-22 15:14:18+00:00,This account is worth a follow,this account is worth a follow,"[this, account, worth, follow]","[account, worth, follow]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,2021-12-30 17:41:31+00:00,2019-09-21 17:15:40+00:00,My better alternative is hanging out with peo...,my better alternative is hanging out with peop...,"[better, alternative, hanging, out, with, peop...","[better, alternative, hanging, people, make, f...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 2.2860860936140104, 0.0, 0.0, 0.0, 0.0, ..."
8,2021-12-30 17:43:45+00:00,2013-01-18 14:40:16+00:00,FACTs.,facts,[facts],[facts],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,2021-12-30 17:41:08+00:00,2020-07-01 06:16:37+00:00,Very cool - this film is about you and your fa...,very cool this film is about you and your fail...,"[very, cool, this, film, about, you, and, your...","[cool, film, failings]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Topic Model Training

### LDA

In [None]:
%%time
num_topics = 10
lda_params_dict = dict(
    featuresCol="features",  # features or tokens_no_stopwords
    optimizer="online",
    maxIter=3,
    k=num_topics,
    topicConcentration=None,
)
lda = LDA(**lda_params_dict)

In [None]:
# %%time
# lda_model = lda.fit(df_processed)

In [21]:
# %%time
# df_full = df_text_processed.join(
#     df.select(["id", "created_at", "user_joined", "reviewText"]),
#     on=["created_at", "user_joined", "reviewText"],
#     how="left",
# )
# print(df_full.count())
# show_pyspark_df(df_full, 10)