# NLP Data Generation

### Bucket checks

In [2]:
# List the S3 buckets visible to the credentials this notebook runs with.
!aws s3 ls


2023-08-29 23:43:16 sagemaker-studio-692960231031-wo7kgoszj2g
2023-08-29 23:50:01 sagemaker-us-east-1-692960231031
2023-08-30 00:34:21 vad49
2023-09-16 16:02:10 vad49-labdata


In [3]:
# List what is stored under the stories-and-books-nlp/ prefix of the project bucket.
!aws s3 ls s3://project17-bucket-alex/stories-and-books-nlp/


                           PRE mapping/
                           PRE processed-data/


In [4]:
# save books into bucket if needed
if True is False:
    !aws s3 cp ../../data/external-data/books/ s3://project17-bucket-alex/books --recursive --exclude "*" --exclude ".ipynb_checkpoints/*" --include "*.txt"


### Setup

In [5]:
if True is True: # set to true only for the first un
    # Setup - Run only once per Kernel App
    %conda install openjdk -y

    # install PySpark
    #%pip install pyspark==3.2.0 s3fs pyarrow spark-nlp

    %pip install s3fs pyarrow

        # install PySpark
    %pip install pyspark==3.4.0

    # install spark-nlp
    %pip install spark-nlp==5.1.3

    # restart kernel
    from IPython.core.display import HTML
    HTML("<script>Jupyter.notebook.kernel.restart()</script>")


Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - openjdk


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2023.08.22 |       h06a4308_0         123 KB
    certifi-2023.7.22          |  py310h06a4308_0         153 KB
    openjdk-11.0.13            |       h87a67e3_0       341.0 MB
    ------------------------------------------------------------
                                           Total:       341.3 MB

The following NEW packages will be INSTALLED:

  openjdk            pkgs/main/linux-64::openjdk-11.0.13-h87a6

In [6]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, length, isnan, when, count, regexp_extract, weekofyear, hour, avg, to_date, unix_timestamp, lit, corr, concat_ws, udf, lower

from pyspark.sql.types import ArrayType, IntegerType, StringType

from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer

import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import pickle
import boto3
from io import BytesIO

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline


# Pandas display: show up to 150 characters per cell and never hide columns.
pd.options.display.max_colwidth = 150
# (display.width is left at the pandas default)
pd.options.display.max_columns = None


In [7]:
# Build (or reuse) the local Spark session used by the rest of the notebook.
spark_settings = {
    "spark.driver.memory": "16G",
    # "0" removes the cap on the size of results returned to the driver
    "spark.driver.maxResultSize": "0",
    "spark.kryoserializer.buffer.max": "2000M",
    # Spark NLP jar plus the hadoop-aws connector needed for s3a:// paths
    "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2",
    "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.ContainerCredentialsProvider",
}

session_builder = SparkSession.builder.appName("Spark NLP").master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

print(spark.version)




:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5c6673a0-5737-4808-960d-81c5ed8ffc67;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in c

3.4.0


### Bring in submissions data

In [8]:
%%time

# Columns kept from every monthly submissions file.
required_columns = ['subreddit', 'title', 'selftext', 'score', 'created_utc', 'url']

# Project bucket holding the data.
bucket = "project17-bucket-alex"

# One directory per month of 2022 (12 in total).
directories = [f"project_2022_{month}/submissions" for month in range(1, 13)]

# Read each month from S3, keeping only the required columns.
monthly_frames = [
    spark.read.parquet(f"s3a://{bucket}/{directory}").select(*required_columns)
    for directory in directories
]

# Stack the monthly frames into a single full-year DataFrame.
submissions = None
for month_df in monthly_frames:
    submissions = month_df if submissions is None else submissions.union(month_df)

        


23/11/18 21:35:21 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
23/11/18 21:35:26 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CPU times: user 33.7 ms, sys: 18.7 ms, total: 52.3 ms
Wall time: 10.8 s


In [9]:
# Small, reproducible sample of the submissions for quick iteration.
sample_fraction = 0.001
sample_seed = 42

submissions_small = submissions.sample(
    withReplacement=False,
    fraction=sample_fraction,
    seed=sample_seed,
)



In [10]:
# Pick the DataFrame the rest of the notebook works on.
use_small = False  # True -> sampled DataFrame, False -> full DataFrame

if use_small:
    submissions_active = submissions_small
else:
    submissions_active = submissions



In [11]:
#cache - only for when working with the small version

#submissions_active.cache()


### Submissions Data


In [12]:
# Print the column names and types of the working submissions DataFrame.
submissions_active.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- score: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- url: string (nullable = true)



Submissions without a body should obviously be removed; this includes submissions whose self text is deleted, removed, or empty. Submissions where the author is empty can be kept.

In [13]:

def clean_submissions(df: DataFrame) -> DataFrame:
    """Keep only submissions that still have a usable body.

    Rows are dropped when ``selftext`` is null, the empty string,
    ``"[removed]"`` or ``"[deleted]"``.

    Args:
        df: DataFrame with a string column named ``selftext``.

    Returns:
        The filtered DataFrame (same columns as ``df``).
    """
    body = col('selftext')
    unusable_bodies = ["[removed]", "[deleted]", ""]

    # The explicit null check is kept so the intent is visible in the plan.
    has_usable_body = body.isNotNull() & ~body.isin(unusable_bodies)

    return df.where(has_usable_body)




In [14]:
# Drop submissions whose selftext is "[removed]", "[deleted]", null or empty.
submissions_active = clean_submissions(submissions_active)


In [15]:
# uncomment to check if needed

#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")

# display(submissions_active.limit(5).toPandas())


In [16]:
# Use a regex to remove everything from the first "Edit:" marker onwards
# (case-insensitive, so "edit:" and "EDIT:" are handled too).
#
#   (?i)  case-insensitive match
#   (?s)  let "." match line terminators; without it any multi-line post
#         fails to match and regexp_extract returns "" for it
#   \b    only match "Edit:" at a word boundary, so "Credit:" is not a marker
#   \z    absolute end of input, used when the post has no "Edit:" marker
pattern = r"(?is)^(.*?)(?=\bEdit:|\z)"

# Store the trimmed body in a new column; the original selftext is left untouched.
# NOTE(review): later cells still filter on and keep `selftext`, so
# `selftext_modified` is currently not used downstream - confirm which column
# is meant to feed the model.
submissions_active = submissions_active.withColumn("selftext_modified", regexp_extract(col("selftext"), pattern, 1))



In [17]:
# A submission counts as a "story" when its body exceeds this many characters.
story_length = 4500

is_story = length(col("selftext")) > story_length
submissions_active = submissions_active.where(is_story)


In [18]:
# keep only the most engaging posts

# Approximate 85th percentile of the 'score' column (0.05 is the relative error).
# This triggers a Spark job over the filtered submissions.
quantile_value = submissions_active.approxQuantile("score", [0.85], 0.05)

if quantile_value:
    # Keep the rows whose score is at or above the threshold.
    submissions_active = submissions_active.filter(col("score") >= quantile_value[0])
else:
    # approxQuantile returns an empty list when the DataFrame is empty or the
    # column holds only nulls; indexing it would raise IndexError.
    print("No scores available to compute the 85th percentile; skipping the engagement filter.")



                                                                                

In [19]:
# uncomment to check if needed

#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")

#display(submissions_active.limit(5).toPandas())


### Books

In [20]:
%%time


import os
from pyspark.sql import Row

def _iter_gutenberg_paragraphs(lines):
    """Yield the paragraphs found between the Project Gutenberg START and END markers.

    A paragraph is a run of consecutive non-blank lines; it is yielded with
    surrounding whitespace stripped and internal line breaks preserved.
    """
    inside_body = False
    paragraph_lines = []

    for line in lines:
        if "*** END OF THE PROJECT GUTENBERG" in line:
            break
        if not inside_body:
            if "*** START OF THE PROJECT GUTENBERG" in line:
                inside_body = True
            continue
        if line.strip():
            paragraph_lines.append(line)
        elif paragraph_lines:
            yield "".join(paragraph_lines).strip()
            paragraph_lines = []

    # Flush the paragraph still being built when the END marker (or the end of
    # the file) is reached without a separating blank line.
    if paragraph_lines:
        yield "".join(paragraph_lines).strip()


def process_gutenberg_books_to_df(directory_path):
    """Load every ``.txt`` book in a directory into a one-column Spark DataFrame.

    Args:
        directory_path: Directory containing Project Gutenberg ``.txt`` files.
            Files with any other extension are ignored.

    Returns:
        DataFrame with a single string column ``text`` holding one paragraph
        per row. Files are processed in sorted filename order so the row order
        is reproducible. The DataFrame is empty when no paragraphs are found.
    """
    all_rows = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            all_rows.extend(Row(text=paragraph) for paragraph in _iter_gutenberg_paragraphs(file))

    # An explicit schema keeps createDataFrame working when all_rows is empty
    # (schema inference cannot handle an empty list).
    return spark.createDataFrame(all_rows, schema="text string")

# Parse the local book files into a DataFrame, then preview the first rows.
books_directory = '../../data/external-data/books'
books = process_gutenberg_books_to_df(books_directory)

books.show()

#display(f"shape: ({books.count()}, {len(books.columns)})")



                                                                                

+--------------------+
|                text|
+--------------------+
|       Metamorphosis|
|      by Franz Kafka|
|Translated by Dav...|
|                   I|
|One morning, when...|
|“What’s happened ...|
|Gregor then turne...|
|“Oh, God”, he tho...|
|He slid back into...|
|And he looked ove...|
|He was still hurr...|
|The first thing h...|
|It was a simple m...|
|The first thing h...|
|So then he tried ...|
|It took just as m...|
|But then he said ...|
|When Gregor was a...|
|After a while he ...|
|“Something’s fall...|
+--------------------+
only showing top 20 rows

CPU times: user 221 ms, sys: 20.3 ms, total: 241 ms
Wall time: 1.25 s


In [21]:
%%time

# Only the post body is needed from here on; every other column is dropped.
submissions_active = submissions_active.select(col("selftext"))





CPU times: user 1.59 ms, sys: 0 ns, total: 1.59 ms
Wall time: 11.8 ms


In [22]:
%%time


# Rename the post body to "text" so it matches the column name of the books DataFrame.
submissions_active = submissions_active.withColumnRenamed(existing="selftext", new="text")

#display(f"shape: ({submissions_active.count()}, {len(submissions_active.columns)})")


CPU times: user 322 µs, sys: 87 µs, total: 409 µs
Wall time: 6.09 ms


In [23]:
%%time

# Stack books and submissions into one corpus, matching columns by name.
all_model_text = books.unionByName(submissions_active, allowMissingColumns=False)



CPU times: user 327 µs, sys: 88 µs, total: 415 µs
Wall time: 12.5 ms


In [24]:
# Preview the first rows of the combined corpus.
all_model_text.show()

+--------------------+
|                text|
+--------------------+
|       Metamorphosis|
|      by Franz Kafka|
|Translated by Dav...|
|                   I|
|One morning, when...|
|“What’s happened ...|
|Gregor then turne...|
|“Oh, God”, he tho...|
|He slid back into...|
|And he looked ove...|
|He was still hurr...|
|The first thing h...|
|It was a simple m...|
|The first thing h...|
|So then he tried ...|
|It took just as m...|
|But then he said ...|
|When Gregor was a...|
|After a while he ...|
|“Something’s fall...|
+--------------------+
only showing top 20 rows



### NLP Processing

In [25]:
%%time


# to lowercase
def lowercase_array(arr):
    """Return a new list holding the lower-cased form of every string in ``arr``."""
    return list(map(str.lower, arr))

# for custom tokenization
def custom_tokenize(text):
    """Split text into word, whitespace and punctuation tokens.

    * Runs of alphanumeric characters become word tokens.
    * Each of ``. , ! ? ;`` becomes its own token.
    * Every whitespace character (space, newline, tab, ...) ends the current
      word and is emitted as a single ``" "`` token, so words on either side
      of a line break are not glued together.
    * Any other character (quotes, apostrophes, dashes, ...) is dropped
      without splitting the surrounding word.

    Args:
        text: Input string, or None.

    Returns:
        list[str]: Tokens in input order; an empty list for None or "".
    """
    if text is None:
        return []

    punctuation = set(".,!?;")
    tokens = []
    word_chars = []

    def flush_word():
        # Emit the word collected so far, if any.
        if word_chars:
            tokens.append("".join(word_chars))
            word_chars.clear()

    for char in text:
        if char.isalnum():
            word_chars.append(char)
        elif char.isspace():
            flush_word()
            tokens.append(" ")
        elif char in punctuation:
            flush_word()
            tokens.append(char)
        # anything else is intentionally dropped

    flush_word()
    return tokens

# Wrap the Python helpers as Spark UDFs that return array<string>.
lowercase_array_udf = udf(lowercase_array, ArrayType(StringType()))
custom_tokenize_udf = udf(custom_tokenize, ArrayType(StringType()))

# Tokenize the raw text, then join the tokens with single spaces so the
# Spark NLP pipeline below has a plain string column to read.
all_model_text = (
    all_model_text
    .withColumn("custom_tokens", custom_tokenize_udf("text"))
    .withColumn("concatenated_tokens", concat_ws(" ", "custom_tokens"))
)

# Spark NLP stages: string -> document -> token annotations -> plain array.
document_assembler = (
    DocumentAssembler()
    .setInputCol("concatenated_tokens")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

finisher = (
    Finisher()
    .setInputCols(["token"])
    .setOutputCols(["tokens"])
    .setOutputAsArray(True)
    .setCleanAnnotations(True)
)

pipeline = Pipeline(stages=[document_assembler, tokenizer, finisher])

# Fit and apply the pipeline on the same DataFrame.
all_model_text = pipeline.fit(all_model_text).transform(all_model_text)

# Lower-case the custom tokens and drop the helper columns.
# NOTE(review): the pipeline's "tokens" output is dropped here, so only the
# custom tokenizer's result reaches later cells - confirm this is intended.
all_model_text = (
    all_model_text
    .withColumn("custom_tokens", lowercase_array_udf(col("custom_tokens")))
    .drop("concatenated_tokens", "tokens")
)






CPU times: user 42.4 ms, sys: 8.9 ms, total: 51.3 ms
Wall time: 900 ms


In [26]:
# Preview the text alongside its lower-cased custom tokens.
all_model_text.show()

+--------------------+--------------------+
|                text|       custom_tokens|
+--------------------+--------------------+
|       Metamorphosis|     [metamorphosis]|
|      by Franz Kafka|[by,  , franz,  ,...|
|Translated by Dav...|[translated,  , b...|
|                   I|                 [i]|
|One morning, when...|[one,  , morning,...|
|“What’s happened ...|[whats,  , happen...|
|Gregor then turne...|[gregor,  , then,...|
|“Oh, God”, he tho...|[oh, ,,  , god, ,...|
|He slid back into...|[he,  , slid,  , ...|
|And he looked ove...|[and,  , he,  , l...|
|He was still hurr...|[he,  , was,  , s...|
|The first thing h...|[the,  , first,  ...|
|It was a simple m...|[it,  , was,  , a...|
|The first thing h...|[the,  , first,  ...|
|So then he tried ...|[so,  , then,  , ...|
|It took just as m...|[it,  , took,  , ...|
|But then he said ...|[but,  , then,  ,...|
|When Gregor was a...|[when,  , gregor,...|
|After a while he ...|[after,  , a,  , ...|
|“Something’s fall...|[something

                                                                                

In [27]:
%%time


# build vocab
# Compute the distinct characters on the executors and bring only that small
# set back to the driver. Collecting every row and flattening the token lists
# with sum(lists, []) copies the accumulated list on each step (quadratic)
# and needs the whole corpus in driver memory.
distinct_chars = (
    all_model_text
    .select("custom_tokens")
    .rdd
    .flatMap(lambda row: set("".join(row.custom_tokens)))
    .distinct()
    .collect()
)

# Sorted so that the character -> index mapping is reproducible.
global_vocab = sorted(distinct_chars)
char2idx = {char: idx for idx, char in enumerate(global_vocab, start=1)}
char2idx["UNK"] = 0  # reserve 0 for unknown characters



def chars_to_ints(text):
    """Encode a list of tokens as one flat list of character indices.

    Every character of every token is looked up in the notebook-level
    ``char2idx`` mapping; characters missing from it map to the ``"UNK"`` index.
    """
    encoded = []
    for token in text:
        for char in token:
            encoded.append(char2idx.get(char, char2idx["UNK"]))
    return encoded

# Spark UDF wrapper returning array<int>.
chars_to_ints_udf = udf(f=chars_to_ints, returnType=ArrayType(IntegerType()))

# Add the integer-encoded characters of each row as a new column.
all_model_text = all_model_text.withColumn(
    "text_as_int",
    chars_to_ints_udf(col("custom_tokens")),
)


                                                                                

CPU times: user 3min 43s, sys: 1min 30s, total: 5min 13s
Wall time: 7min 46s


In [28]:
%%time


# Check: preview 25 rows including the new text_as_int column.
all_model_text.show(25)



+--------------------+--------------------+--------------------+
|                text|       custom_tokens|         text_as_int|
+--------------------+--------------------+--------------------+
|       Metamorphosis|     [metamorphosis]|[29, 21, 36, 17, ...|
|      by Franz Kafka|[by,  , franz,  ,...|[18, 41, 1, 22, 3...|
|Translated by Dav...|[translated,  , b...|[36, 34, 17, 30, ...|
|                   I|                 [i]|                [25]|
|One morning, when...|[one,  , morning,...|[31, 30, 21, 1, 2...|
|“What’s happened ...|[whats,  , happen...|[39, 24, 17, 36, ...|
|Gregor then turne...|[gregor,  , then,...|[23, 34, 21, 23, ...|
|“Oh, God”, he tho...|[oh, ,,  , god, ,...|[31, 24, 3, 1, 23...|
|He slid back into...|[he,  , slid,  , ...|[24, 21, 1, 35, 2...|
|And he looked ove...|[and,  , he,  , l...|[17, 30, 20, 1, 2...|
|He was still hurr...|[he,  , was,  , s...|[24, 21, 1, 39, 1...|
|The first thing h...|[the,  , first,  ...|[36, 24, 21, 1, 2...|
|It was a simple m...|[it

In [29]:
%%time

# Write the processed corpus to the project bucket as Parquet, replacing any previous output.
processed_data_path = "s3a://project17-bucket-alex/stories-and-books-nlp/processed-data/"
all_model_text.write.parquet(processed_data_path, mode="overwrite")



                                                                                

CPU times: user 61.1 ms, sys: 35.5 ms, total: 96.5 ms
Wall time: 3min 25s


In [30]:
%%time


# Destination of the character -> index mapping used by the ML model.
bucket_name = 'project17-bucket-alex'
object_key = 'stories-and-books-nlp/mapping/char2idx.pkl'

# Serialise the mapping in memory; nothing is written to local disk.
char2idx_bytes = pickle.dumps(char2idx)
char2idx_buffer = BytesIO(char2idx_bytes)

# Upload the pickled mapping to S3.
s3 = boto3.client('s3')
s3.upload_fileobj(Fileobj=char2idx_buffer, Bucket=bucket_name, Key=object_key)



CPU times: user 305 ms, sys: 40.1 ms, total: 345 ms
Wall time: 442 ms
