# NLP Data Generation

### Bucket checks

In [1]:
!aws s3 ls


2023-08-29 23:43:16 sagemaker-studio-692960231031-wo7kgoszj2g
2023-08-29 23:50:01 sagemaker-us-east-1-692960231031
2023-08-30 00:34:21 vad49
2023-09-16 16:02:10 vad49-labdata


In [2]:
!aws s3 ls s3://project17-bucket-alex/stories-and-books-nlp/


                           PRE mapping/
                           PRE processed-data/


In [3]:
# save books into bucket if needed
if True is False:
    !aws s3 cp ../../data/external-data/books/ s3://project17-bucket-alex/books --recursive --exclude "*" --exclude ".ipynb_checkpoints/*" --include "*.txt"


### Setup

In [4]:
if True is False: # set to true only for the first un
    # Setup - Run only once per Kernel App
    %conda install openjdk -y

    # install PySpark
    #%pip install pyspark==3.2.0 s3fs pyarrow spark-nlp

    %pip install s3fs pyarrow

        # install PySpark
    %pip install pyspark==3.4.0

    # install spark-nlp
    %pip install spark-nlp==5.1.3

    # restart kernel
    from IPython.core.display import HTML
    HTML("<script>Jupyter.notebook.kernel.restart()</script>")


In [5]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, length, isnan, when, count, regexp_extract, weekofyear, hour, avg, to_date, unix_timestamp, lit, corr, concat_ws, udf, lower

from pyspark.sql.types import ArrayType, IntegerType, StringType

from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer

import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import pickle
import boto3
from io import BytesIO

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline


# Pandas display options for the notebook: wider text cells and no limit on
# the number of columns shown.
pd.set_option('display.max_colwidth', 150) 
#pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)


In [6]:
# Build (or reuse) the local Spark session used by the rest of the notebook.
# The Spark NLP and hadoop-aws jars are requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config(
        "spark.jars.packages",
        "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2",
    )
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

# Report the Spark version of the running session
print(spark.version)




:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-aecaf56f-cded-4680-b7a6-d8516f5c29e9;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in c

3.2.0


### Bring in submissions and comments data

In [7]:
%%time

# Columns kept from every monthly submissions dataset
required_columns = ['subreddit', 'title', 'selftext', 'score', 'created_utc', 'url']

# Read the full year from the project bucket
bucket = "project17-bucket-alex"

# 12 directories (months 1..12), each containing one month of data
directories = [f"project_2022_{month}/submissions" for month in range(1, 13)]

# Load every month, keeping only the required columns
monthly_frames = [
    spark.read.parquet(f"s3a://{bucket}/{directory}").select(*required_columns)
    for directory in directories
]

# Stack the monthly frames, in month order, into one big data set
submissions = monthly_frames[0]
for month_df in monthly_frames[1:]:
    submissions = submissions.union(month_df)

        


23/11/18 21:02:49 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
23/11/18 21:02:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CPU times: user 56.6 ms, sys: 3.99 ms, total: 60.6 ms
Wall time: 13.5 s


In [8]:
# create small dfs
# Sample without replacement using fraction=0.001 and a fixed seed, so the
# small frame can be used for quick experiments.

submissions_small = submissions.sample(withReplacement=False, fraction=0.001, seed=42)



In [9]:
# choose which to use

use_small = False  # to easily swap between the small and full dfs
# submissions_active is the working frame: the sample when use_small is True,
# otherwise the full data set.
submissions_active = submissions_small if use_small else submissions



In [10]:
#cache - only for when working with the small version

#submissions_active.cache()


### Submissions Data


In [11]:
submissions_active.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- score: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- url: string (nullable = true)



Submissions without a body should obviously go, so we drop those whose self text is missing, empty, `[deleted]`, or `[removed]`. Submissions where the author is empty can be kept.

In [12]:

def clean_submissions(df: DataFrame) -> DataFrame:
    """Keep only the submissions that have a usable ``selftext`` body.

    A row is dropped when its ``selftext`` is null, empty, ``"[removed]"``
    or ``"[deleted]"``.

    Args:
        df: DataFrame with a ``selftext`` column.

    Returns:
        The filtered DataFrame.
    """
    body = col('selftext')

    # individual conditions, combined below
    not_removed = body != "[removed]"
    not_deleted = body != "[deleted]"
    has_body = body.isNotNull() & (body != "")

    return df.filter(not_removed & not_deleted & has_body)




In [13]:
submissions_active = clean_submissions(submissions_active)


In [14]:
# uncomment to check if needed

#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")

# display(submissions_active.limit(5).toPandas())


In [15]:
# Use a regex to keep only the text that precedes the first 'Edit:' / 'edit:' marker.

# (?i) makes the marker case-insensitive; (?s) lets '.' match line breaks.
# Without (?s) the lazy group cannot cross a newline, so on multi-line posts the
# anchored pattern never matches and regexp_extract returns an empty string.
pattern = r"(?is)^(.*?)(?=Edit:|$)"

# Apply the regular expression to create a new column with the modified text
# NOTE(review): the later cells shown in this notebook keep reading `selftext`;
# nothing visible consumes `selftext_modified` — confirm which column is intended.
submissions_active = submissions_active.withColumn("selftext_modified", regexp_extract(col("selftext"), pattern, 1))



In [16]:
# define stories as posts longer than a certain length

# Threshold on the character length of `selftext`; only strictly longer posts are kept.
story_length = 4500

submissions_active = submissions_active.filter(length(col("selftext")) > story_length)


In [17]:
# keep only the most engaging posts

# Calculate the approximate percentile of the 'score' column
# One probability (0.85) is requested, and the result is indexed with [0] below.
quantile_value = submissions_active.approxQuantile("score", [0.85], 0.05)  # 0.05 is the relative error

# Filter the DataFrame to keep scores above or equal to this value
# NOTE(review): indexing [0] presumes a value was returned — confirm the behaviour
# on an empty DataFrame (e.g. when working with a small sample).
submissions_active = submissions_active.filter(col("score") >= quantile_value[0])



                                                                                

In [18]:
# uncomment to check if needed

#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")

#display(submissions_active.limit(5).toPandas())


### Books

In [19]:
%%time


import os
from pyspark.sql import Row

def process_gutenberg_books_to_df(directory_path):
    """Load Project Gutenberg ``.txt`` books as a one-column DataFrame of paragraphs.

    For every ``.txt`` file in ``directory_path`` only the lines between the
    ``*** START OF THE PROJECT GUTENBERG`` and ``*** END OF THE PROJECT GUTENBERG``
    marker lines are used. Consecutive non-blank lines form one paragraph, and
    each paragraph (stripped of surrounding whitespace) becomes one row.

    Args:
        directory_path: Directory containing the book text files (UTF-8).

    Returns:
        A Spark DataFrame with a single string column ``text``.
    """
    all_rows = []

    # Sort so the row order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            start_reading = False
            paragraph = ""
            for line in file:
                if "*** END OF THE PROJECT GUTENBERG" in line:
                    break
                if start_reading:
                    if line.strip() == "":
                        # a blank line closes the current paragraph
                        if paragraph.strip():
                            all_rows.append(Row(text=paragraph.strip()))
                            paragraph = ""
                    else:
                        paragraph += line
                elif "*** START OF THE PROJECT GUTENBERG" in line:
                    start_reading = True

            # Flush the last paragraph: it would otherwise be lost when the END
            # marker (or the end of the file) follows it without a blank line.
            if paragraph.strip():
                all_rows.append(Row(text=paragraph.strip()))

    # Explicit schema so that an empty result still yields a valid DataFrame
    # instead of failing on schema inference.
    return spark.createDataFrame(all_rows, schema="text string")

# run
books = process_gutenberg_books_to_df('../../data/external-data/books')

# check
books.show()

#display(f"shape: ({books.count()}, {len(books.columns)})")



23/11/18 21:07:55 WARN TaskSetManager: Stage 14 contains a task of very large size (1414 KiB). The maximum recommended task size is 1000 KiB.
[Stage 14:>                                                         (0 + 1) / 1]

+--------------------+
|                text|
+--------------------+
|       Metamorphosis|
|      by Franz Kafka|
|Translated by Dav...|
|                   I|
|One morning, when...|
|“What’s happened ...|
|Gregor then turne...|
|“Oh, God”, he tho...|
|He slid back into...|
|And he looked ove...|
|He was still hurr...|
|The first thing h...|
|It was a simple m...|
|The first thing h...|
|So then he tried ...|
|It took just as m...|
|But then he said ...|
|When Gregor was a...|
|After a while he ...|
|“Something’s fall...|
+--------------------+
only showing top 20 rows

CPU times: user 137 ms, sys: 16.1 ms, total: 153 ms
Wall time: 1.35 s


                                                                                

In [20]:
%%time

# keep only the selftext column
submissions_active = submissions_active.select("selftext")





CPU times: user 1.52 ms, sys: 0 ns, total: 1.52 ms
Wall time: 17.7 ms


In [21]:
%%time


# For the Project Gutenberg DataFrame
#books = books.withColumnRenamed("value", "text")

# For the submissions DataFrame
submissions_active = submissions_active.withColumnRenamed("selftext", "text")

#display(f"shape: ({submissions_active.count()}, {len(submissions_active.columns)})")


CPU times: user 497 µs, sys: 0 ns, total: 497 µs
Wall time: 12.2 ms


In [22]:
%%time


# Now you can union them
all_model_text = books.unionByName(submissions_active)

CPU times: user 536 µs, sys: 88 µs, total: 624 µs
Wall time: 16.5 ms


In [23]:
all_model_text.show()

+--------------------+
|                text|
+--------------------+
|       Metamorphosis|
|      by Franz Kafka|
|Translated by Dav...|
|                   I|
|One morning, when...|
|“What’s happened ...|
|Gregor then turne...|
|“Oh, God”, he tho...|
|He slid back into...|
|And he looked ove...|
|He was still hurr...|
|The first thing h...|
|It was a simple m...|
|The first thing h...|
|So then he tried ...|
|It took just as m...|
|But then he said ...|
|When Gregor was a...|
|After a while he ...|
|“Something’s fall...|
+--------------------+
only showing top 20 rows



23/11/18 21:07:57 WARN TaskSetManager: Stage 15 contains a task of very large size (1414 KiB). The maximum recommended task size is 1000 KiB.


### NLP Processing

In [24]:
%%time


# Define a UDF to lowercase all elements in the array
def lowercase_array(arr):
    """Return a new list holding the lowercase form of every element of ``arr``."""
    lowered = []
    for item in arr:
        lowered.append(item.lower())
    return lowered

# Define a UDF for custom tokenization
def custom_tokenize(text):
    """Split ``text`` into word tokens and single-character separator tokens.

    Spaces and the punctuation marks ``. , ! ? ;`` each become their own token
    and end the word being built. Alphanumeric characters extend the current
    word. Every other character is discarded without ending the word
    (so ``"What's"`` yields ``"Whats"``).
    """
    separators = frozenset(" .,!?;")
    result = []
    pending = []  # characters of the word currently being built

    for ch in text:
        if ch in separators:
            if pending:
                result.append("".join(pending))
                pending = []
            result.append(ch)
            continue
        if ch.isalnum():
            pending.append(ch)

    # emit the trailing word, if any
    if pending:
        result.append("".join(pending))

    return result

# Wrap the two Python helpers as Spark UDFs returning arrays of strings
lowercase_array_udf = udf(lowercase_array, ArrayType(StringType()))

custom_tokenize_udf = udf(custom_tokenize, ArrayType(StringType()))

# Apply the UDF to your DataFrame
all_model_text = all_model_text.withColumn("custom_tokens", custom_tokenize_udf("text"))

# Concatenate the tokens into a single string
all_model_text = all_model_text.withColumn("concatenated_tokens", concat_ws(" ", "custom_tokens"))

# NOTE(review): the Spark NLP stages below produce a `tokens` column that is dropped
# at the end of this cell, and nothing in between reads it; the final columns come
# from `text` and `custom_tokens` only. Confirm whether the pipeline is still needed.

# Document assembler configuration
document_assembler = DocumentAssembler() \
    .setInputCol("concatenated_tokens") \
    .setOutputCol("document")

# Tokenizer configuration (if still needed)
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Finisher configuration
finisher = Finisher() \
    .setInputCols(["token"]) \
    .setOutputCols(["tokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

# Define and build the pipeline
pipeline = Pipeline().setStages([
    document_assembler,
    tokenizer,
    finisher
])

# Apply the pipeline
all_model_text = pipeline.fit(all_model_text).transform(all_model_text)

# Apply the lowercase UDF to the custom_tokens column
all_model_text = all_model_text.withColumn("custom_tokens", lowercase_array_udf(col("custom_tokens")))

# Drop the concatenated_tokens and tokens columns
all_model_text = all_model_text.drop("concatenated_tokens", "tokens")




CPU times: user 51.5 ms, sys: 6.53 ms, total: 58 ms
Wall time: 1 s


In [25]:
all_model_text.show()

23/11/18 21:08:00 WARN TaskSetManager: Stage 16 contains a task of very large size (1414 KiB). The maximum recommended task size is 1000 KiB.
[Stage 16:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|                text|       custom_tokens|
+--------------------+--------------------+
|       Metamorphosis|     [metamorphosis]|
|      by Franz Kafka|[by,  , franz,  ,...|
|Translated by Dav...|[translated,  , b...|
|                   I|                 [i]|
|One morning, when...|[one,  , morning,...|
|“What’s happened ...|[whats,  , happen...|
|Gregor then turne...|[gregor,  , then,...|
|“Oh, God”, he tho...|[oh, ,,  , god, ,...|
|He slid back into...|[he,  , slid,  , ...|
|And he looked ove...|[and,  , he,  , l...|
|He was still hurr...|[he,  , was,  , s...|
|The first thing h...|[the,  , first,  ...|
|It was a simple m...|[it,  , was,  , a...|
|The first thing h...|[the,  , first,  ...|
|So then he tried ...|[so,  , then,  , ...|
|It took just as m...|[it,  , took,  , ...|
|But then he said ...|[but,  , then,  ,...|
|When Gregor was a...|[when,  , gregor,...|
|After a while he ...|[after,  , a,  , ...|
|“Something’s fall...|[something

                                                                                

In [None]:
%%time


# Build a global character vocabulary from the entire dataset.
# Only the token column is brought to the driver; the raw text is not needed here.
all_texts = [row.custom_tokens for row in all_model_text.select("custom_tokens").collect()]

# Collect the distinct characters one row at a time. This avoids
# sum(list_of_lists, []), which copies the growing list at every step and is
# therefore quadratic in the total number of tokens.
vocab_chars = set()
for tokens in all_texts:
    vocab_chars.update("".join(tokens))
global_vocab = sorted(vocab_chars)

char2idx = {char: idx + 1 for idx, char in enumerate(global_vocab)}  # Start indexing from 1
char2idx["UNK"] = 0  # Reserve 0 for unknown characters



def chars_to_ints(text):
    """Encode a list of tokens as integer ids, one id per character.

    The tokens are concatenated and every character is looked up in the
    notebook-level ``char2idx`` mapping; characters missing from the mapping
    receive the id stored under ``"UNK"``.
    """
    joined = "".join(text)
    ids = []
    for ch in joined:
        ids.append(char2idx.get(ch, char2idx["UNK"]))
    return ids

# Register the encoder as a UDF that returns an array of integers
chars_to_ints_udf = udf(chars_to_ints, ArrayType(IntegerType()))

# Add the integer encoding of the tokens as a new column
all_model_text = all_model_text.withColumn("text_as_int", chars_to_ints_udf(col("custom_tokens")))


23/11/18 21:08:03 WARN TaskSetManager: Stage 17 contains a task of very large size (1414 KiB). The maximum recommended task size is 1000 KiB.

In [None]:
%%time

all_model_text.show(25)



In [None]:
%%time

all_model_text.write.mode("overwrite").format("parquet").save("s3a://project17-bucket-alex/stories-and-books-nlp/processed-data/")



In [None]:
%%time


# Serialize char2idx to a bytes object
# NOTE(review): the mapping is stored as a pickle, so whoever loads it has to
# unpickle the object; only load it from a location you trust.
char2idx_bytes = pickle.dumps(char2idx)
# Wrap the bytes in a file-like object, as passed to upload_fileobj below
char2idx_buffer = BytesIO(char2idx_bytes)

# Initialize a boto3 client
s3 = boto3.client('s3')

# Upload the bytes object to S3
bucket_name = 'project17-bucket-alex'
object_key = 'stories-and-books-nlp/mapping/char2idx.pkl'

s3.upload_fileobj(char2idx_buffer, bucket_name, object_key)

