# NLP Data Generation

### Bucket checks

In [2]:
!aws s3 ls


2023-08-29 23:43:16 sagemaker-studio-692960231031-wo7kgoszj2g
2023-08-29 23:50:01 sagemaker-us-east-1-692960231031
2023-08-30 00:34:21 vad49
2023-09-16 16:02:10 vad49-labdata


In [109]:
!aws s3 ls s3://project17-bucket-alex/stories-and-books-nlp/


2023-11-18 18:55:58          0 _SUCCESS
2023-11-18 18:55:54    2866925 part-00000-e5360052-4545-4fe2-9b12-7c9ce44bf3ec-c000.snappy.parquet
2023-11-18 18:55:55    2564497 part-00001-e5360052-4545-4fe2-9b12-7c9ce44bf3ec-c000.snappy.parquet
2023-11-18 18:55:56      26078 part-00035-e5360052-4545-4fe2-9b12-7c9ce44bf3ec-c000.snappy.parquet
2023-11-18 18:55:56      12157 part-00051-e5360052-4545-4fe2-9b12-7c9ce44bf3ec-c000.snappy.parquet
2023-11-18 18:55:57      28228 part-00053-e5360052-4545-4fe2-9b12-7c9ce44bf3ec-c000.snappy.parquet
2023-11-18 18:55:57      30806 part-00059-e5360052-4545-4fe2-9b12-7c9ce44bf3ec-c000.snappy.parquet


In [4]:
# save books into bucket if needed
if True is False:
    !aws s3 cp ../../data/external-data/books/ s3://project17-bucket-alex/books --recursive --exclude "*" --exclude ".ipynb_checkpoints/*" --include "*.txt"


### Setup

In [None]:
if True is True: # set to true only for the first un
    # Setup - Run only once per Kernel App
    %conda install openjdk -y

    # install PySpark
    #%pip install pyspark==3.2.0 s3fs pyarrow spark-nlp

    %pip install s3fs pyarrow

        # install PySpark
    %pip install pyspark==3.4.0

    # install spark-nlp
    %pip install spark-nlp==5.1.3

    # restart kernel
    from IPython.core.display import HTML
    HTML("<script>Jupyter.notebook.kernel.restart()</script>")


In [6]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, length, isnan, when, count, regexp_extract, weekofyear, hour, avg, to_date, unix_timestamp, lit, corr

import json
import sparknlp
import numpy as np
import pandas as pd
from sparknlp.base import *
from pyspark.ml import Pipeline
from sparknlp.annotator import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sparknlp.pretrained import PretrainedPipeline


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 150) 
#pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)


In [None]:
# Import pyspark and build Spark session
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")\
    .config("fs.s3a.aws.credentials.provider", "com.amazonaws.auth.ContainerCredentialsProvider")\
    .getOrCreate()

print(spark.version)


### Bring in submissions and comments data

In [8]:
%%time

required_columns = ['subreddit', 'title', 'selftext', 'score', 'created_utc', 'url']


# read the full year

# Read in data from project bucket
bucket = "project17-bucket-alex"

# List of 12 directories each containing 1 month of data
directories = ["project_2022_"+str(i)+"/submissions" for i in range(1,13)]

# Iterate through 12 directories and merge each monthly data set to create one big data set
submissions = None
for directory in directories:
    s3_path = f"s3a://{bucket}/{directory}"
    month_df = spark.read.parquet(s3_path).select(*required_columns)
    
    if submissions is None:
        submissions = month_df
    else:
        submissions = submissions.union(month_df)

        


23/11/18 17:13:09 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
23/11/18 17:13:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CPU times: user 57.3 ms, sys: 7.08 ms, total: 64.4 ms
Wall time: 15.3 s


In [9]:
# create small dfs

submissions_small = submissions.sample(withReplacement=False, fraction=0.001, seed=42)



In [10]:
# choose which to use

use_small = True  # to easily swap between the small and small dfs
submissions_active = submissions_small if use_small else submissions



In [11]:
#cache - only for when working with the small version

#submissions_active.cache()


### Process Submissions Data


In [12]:
submissions_active.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- score: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- url: string (nullable = true)



### 2.2 Conduct basic data quality checks! Make sure there are no missing values, check the length of the comments, and remove rows of data that might be corrupted. Even if you think all your data is perfect, you still need to demonstrate that with your analysis.



Let's remove submissions without a body should obviously go, but what about the submissions without a self text (deleted, removed or empty). We can keep where the author is empty.

In [13]:

def clean_submissions(df: DataFrame) -> DataFrame:
    
    # define conditions
    conditions = (col('selftext') != "[removed]") & (col('selftext') != "[deleted]") & (col('selftext').isNotNull() & (col('selftext') != ""))

    
    # apply filter
    cleaned_df = df.filter(conditions)
  

    return cleaned_df




In [14]:
submissions_active = clean_submissions(submissions_active)


In [15]:
# uncomment to check if needed

#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")


# display(submissions_active.limit(5).toPandas())


                                                                                

Unnamed: 0,subreddit,title,selftext,score,created_utc,url
0,relationship_advice,I need advice,So my gf and I are both 19 and live separately but both in college and she works. We will live together within 2ish years after she graduates but ...,1,2022-01-25 17:01:32,https://www.reddit.com/r/relationship_advice/comments/scid7f/i_need_advice/
1,relationship_advice,Too smart to be this stupid; logics vs heart.,"My estranged bf of the last almost 2yrs has vowed to make his life mission, along others aid, to make me have misery and regret. \n Call me cr...",1,2022-01-18 17:12:58,https://www.reddit.com/r/relationship_advice/comments/s72fh8/too_smart_to_be_this_stupid_logics_vs_heart/
2,antiwork,High hopes for the future,"Just kidding, this sub is doomed\n\nHey mods, stop doing interviews \n\nYou don’t speak for the people of this sub. You are a janitor who’s only p...",0,2022-01-27 19:05:44,https://www.reddit.com/r/antiwork/comments/se5xf0/high_hopes_for_the_future/
3,NoStupidQuestions,I think I just ejaculated without trying,Okay so has anyone else have this happen. Right altering peeing. I get extreme pain I mean like eye shutting crouching over pain and there this cl...,1,2022-01-05 19:26:45,https://www.reddit.com/r/NoStupidQuestions/comments/rwuuej/i_think_i_just_ejaculated_without_trying/
4,socialskills,"Is it weird to tell my depressed friend I'm ""proud of him"" after he's overcome a bad mental phase?","One of my best friends suffers from depression and experiences ""down phases"" in irregular intervals. He always has a particularly bad one in winte...",131,2022-01-13 18:13:09,https://www.reddit.com/r/socialskills/comments/s35q0j/is_it_weird_to_tell_my_depressed_friend_im_proud/


In [16]:
# use regex the remove text after 'Edit: ' or 'edit: '

# The regular expression pattern
pattern = r"(?i)^(.*?)(?=Edit:|$)"

# Apply the regular expression to create a new column with the modified text
submissions_active = submissions_active.withColumn("selftext_modified", regexp_extract(col("selftext"), pattern, 1))



In [17]:
# define stories as posts longer than a certain length 

story_length = 4500

submissions_active = submissions_active.filter(length(col("selftext")) > story_length)


In [18]:
# keep only the 25% most engaging posts


# Calculate the approximate percentile of the 'score' column
quantile_value = submissions_active.approxQuantile("score", [0.85], 0.05)  # 0.05 is the relative error

# Filter the DataFrame to keep scores above or equal to this value
submissions_active = submissions_active.filter(col("score") >= quantile_value[0])



                                                                                

In [19]:
# uncomment to check if needed

#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")


#display(submissions_active.limit(5).toPandas())

                                                                                

Unnamed: 0,subreddit,title,selftext,score,created_utc,url,selftext_modified
0,relationship_advice,Is dating online cheating.. the whole story,Due to the overwhelming messages and requests for the whole story on about my last post. I decided to do a tell-all. So I hope you have a minute. ...,29,2022-07-13 15:48:56,https://www.reddit.com/r/relationship_advice/comments/vy6fh0/is_dating_online_cheating_the_whole_story/,
1,AmItheAsshole,AITA for expecting my son to share his room?,"Background: My (40sf) husband (40sm) and I bought a 3-bedroom house a few years ago, shortly before the panini. We of course took the master bedro...",16459,2022-10-26 08:51:01,https://www.reddit.com/r/AmItheAsshole/comments/ydt2w5/aita_for_expecting_my_son_to_share_his_room/,
2,AmItheAsshole,"WIBTA for taking my (15f) laundry basket full of clothes, which my stepdad (49m) threw out for not putting them away, out of the trash?",Sorry for bad English!\n\nThe morning before my clothes were thrown away my mom put them in a laundry basket beside my door. She did tell me ofcou...,95,2022-11-07 19:59:01,https://www.reddit.com/r/AmItheAsshole/comments/yoyf1r/wibta_for_taking_my_15f_laundry_basket_full_of/,
3,relationship_advice,My housemate (25f) did something I (20f) consider morally unacceptable and its causing a lot of conflict in the house (6 students).,Tl;dr version: I live in a student houseshare. One of my housemates is a student nurse and was extremely inappropriate with a patient which I foun...,421,2022-11-04 20:32:06,https://www.reddit.com/r/relationship_advice/comments/ym9kw7/my_housemate_25f_did_something_i_20f_consider/,
4,socialskills,How to Talk About Yourself (and How to Have Good Conversation),"**TL:DR talk about yourself by saying just a little bit in a way that's relevant to the topic at hand, and give the other person the implicit choi...",27,2022-12-01 14:46:48,https://www.reddit.com/r/socialskills/comments/z9polj/how_to_talk_about_yourself_and_how_to_have_good/,


Books

In [32]:
import os
from pyspark.sql import Row

def process_gutenberg_books_to_df(directory_path):
    all_rows = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):  # Assuming the files are in .txt format
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                start_reading = False
                paragraph = ""
                for line in file:
                    if "*** END OF THE PROJECT GUTENBERG" in line:
                        break
                    if start_reading:
                        if line.strip() == "":
                            if paragraph.strip():
                                all_rows.append(Row(text=paragraph.strip()))
                                paragraph = ""
                        else:
                            paragraph += line
                    elif "*** START OF THE PROJECT GUTENBERG" in line:
                        start_reading = True

    # Create a DataFrame from the list of Rows
    return spark.createDataFrame(all_rows)

# Process the Gutenberg books and create a DataFrame
books = process_gutenberg_books_to_df('../../data/external-data/books')
books.show()

display(f"shape: ({books.count()}, {len(books.columns)})")



23/11/18 17:33:32 WARN TaskSetManager: Stage 65 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+
|                text|
+--------------------+
|       Metamorphosis|
|      by Franz Kafka|
|Translated by Dav...|
|                   I|
|One morning, when...|
|“What’s happened ...|
|Gregor then turne...|
|“Oh, God”, he tho...|
|He slid back into...|
|And he looked ove...|
|He was still hurr...|
|The first thing h...|
|It was a simple m...|
|The first thing h...|
|So then he tried ...|
|It took just as m...|
|But then he said ...|
|When Gregor was a...|
|After a while he ...|
|“Something’s fall...|
+--------------------+
only showing top 20 rows



23/11/18 17:33:32 WARN TaskSetManager: Stage 66 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.


'shape: (7825, 1)'

In [None]:
from pyspark.sql.functions import concat_ws, col

# Process `submissions_active` to extract and transform the 'selftext' column
submissions_active = submissions_active.select("selftext")





In [34]:
# For the Project Gutenberg DataFrame
#books = books.withColumnRenamed("value", "text")

# For the submissions DataFrame
submissions_active = submissions_active.withColumnRenamed("selftext", "text")

#display(f"shape: ({submissions_active.count()}, {len(submissions_active.columns)})")


In [89]:


# Now you can union them
all_model_text = books.unionByName(submissions_active)

In [36]:
all_model_text.show()

23/11/18 17:33:54 WARN TaskSetManager: Stage 69 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+
|                text|
+--------------------+
|       Metamorphosis|
|      by Franz Kafka|
|Translated by Dav...|
|                   I|
|One morning, when...|
|“What’s happened ...|
|Gregor then turne...|
|“Oh, God”, he tho...|
|He slid back into...|
|And he looked ove...|
|He was still hurr...|
|The first thing h...|
|It was a simple m...|
|The first thing h...|
|So then he tried ...|
|It took just as m...|
|But then he said ...|
|When Gregor was a...|
|After a while he ...|
|“Something’s fall...|
+--------------------+
only showing top 20 rows



Clean

In [102]:

#########
#clean
#########


from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType
import string
from pyspark.ml.feature import SQLTransformer
from pyspark.sql.functions import col, lower

# Define a UDF to lowercase all elements in the array
def lowercase_array(arr):
    return [elem.lower() for elem in arr]

# Define a UDF for custom tokenization
def custom_tokenize(text):
    # List of characters to keep
    keep_chars = set(" .,!?;")
    tokens = []
    buffer = ""

    for char in text:
        if char in keep_chars:
            if buffer:
                tokens.append(buffer)
                buffer = ""
            tokens.append(char)
        elif char.isalnum():  # Keep alphanumeric characters
            buffer += char

    if buffer:
        tokens.append(buffer)

    return tokens

lowercase_array_udf = udf(lowercase_array, ArrayType(StringType()))

custom_tokenize_udf = udf(custom_tokenize, ArrayType(StringType()))

# Apply the UDF to your DataFrame
all_model_text = all_model_text.withColumn("custom_tokens", custom_tokenize_udf("text"))

# Concatenate the tokens into a single string
all_model_text = all_model_text.withColumn("concatenated_tokens", concat_ws(" ", "custom_tokens"))

# Document assembler configuration
document_assembler = DocumentAssembler() \
    .setInputCol("concatenated_tokens") \
    .setOutputCol("document")

# Tokenizer configuration (if still needed)
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Finisher configuration
finisher = Finisher() \
    .setInputCols(["token"]) \
    .setOutputCols(["tokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

# Define and build the pipeline
pipeline = Pipeline().setStages([
    document_assembler,
    tokenizer,
    finisher
])

# Apply the pipeline
all_model_text = pipeline.fit(all_model_text).transform(all_model_text)

# Apply the lowercase UDF to the custom_tokens column
all_model_text = all_model_text.withColumn("custom_tokens", lowercase_array_udf(col("custom_tokens")))

# Drop the concatenated_tokens and tokens columns
all_model_text = all_model_text.drop("concatenated_tokens", "tokens")




In [103]:
all_model_text.show()

23/11/18 18:42:17 WARN TaskSetManager: Stage 89 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+--------------------+
|                text|       custom_tokens|
+--------------------+--------------------+
|       Metamorphosis|     [metamorphosis]|
|      by Franz Kafka|[by,  , franz,  ,...|
|Translated by Dav...|[translated,  , b...|
|                   I|                 [i]|
|One morning, when...|[one,  , morning,...|
|“What’s happened ...|[whats,  , happen...|
|Gregor then turne...|[gregor,  , then,...|
|“Oh, God”, he tho...|[oh, ,,  , god, ,...|
|He slid back into...|[he,  , slid,  , ...|
|And he looked ove...|[and,  , he,  , l...|
|He was still hurr...|[he,  , was,  , s...|
|The first thing h...|[the,  , first,  ...|
|It was a simple m...|[it,  , was,  , a...|
|The first thing h...|[the,  , first,  ...|
|So then he tried ...|[so,  , then,  , ...|
|It took just as m...|[it,  , took,  , ...|
|But then he said ...|[but,  , then,  ,...|
|When Gregor was a...|[when,  , gregor,...|
|After a while he ...|[after,  , a,  , ...|
|“Something’s fall...|[something

In [105]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml import Pipeline
import string


# Build a global vocabulary from the entire dataset
all_texts = [row.custom_tokens for row in all_model_text.collect()]  # Adjust as per your DataFrame structure
global_vocab = sorted(set("".join(sum(all_texts, []))))  # Sum flattens the list of lists
char2idx = {char: idx + 1 for idx, char in enumerate(global_vocab)}  # Start indexing from 1
char2idx["UNK"] = 0  # Reserve 0 for unknown characters

def chars_to_ints(text):
    return [char2idx.get(char, char2idx["UNK"]) for char in "".join(text)]

chars_to_ints_udf = udf(chars_to_ints, ArrayType(IntegerType()))

all_model_text = all_model_text.withColumn("text_as_int", chars_to_ints_udf(col("custom_tokens")))

23/11/18 18:47:18 WARN TaskSetManager: Stage 91 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [106]:
all_model_text.show(25)

23/11/18 18:51:08 WARN TaskSetManager: Stage 92 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.


+--------------------+--------------------+--------------------+
|                text|       custom_tokens|         text_as_int|
+--------------------+--------------------+--------------------+
|       Metamorphosis|     [metamorphosis]|[29, 21, 36, 17, ...|
|      by Franz Kafka|[by,  , franz,  ,...|[18, 41, 1, 22, 3...|
|Translated by Dav...|[translated,  , b...|[36, 34, 17, 30, ...|
|                   I|                 [i]|                [25]|
|One morning, when...|[one,  , morning,...|[31, 30, 21, 1, 2...|
|“What’s happened ...|[whats,  , happen...|[39, 24, 17, 36, ...|
|Gregor then turne...|[gregor,  , then,...|[23, 34, 21, 23, ...|
|“Oh, God”, he tho...|[oh, ,,  , god, ,...|[31, 24, 3, 1, 23...|
|He slid back into...|[he,  , slid,  , ...|[24, 21, 1, 35, 2...|
|And he looked ove...|[and,  , he,  , l...|[17, 30, 20, 1, 2...|
|He was still hurr...|[he,  , was,  , s...|[24, 21, 1, 39, 1...|
|The first thing h...|[the,  , first,  ...|[36, 24, 21, 1, 2...|
|It was a simple m...|[it

In [107]:
all_model_text.write.mode("overwrite").format("parquet").save("s3a://project17-bucket-alex/stories-and-books-nlp/")





23/11/18 18:52:50 WARN TaskSetManager: Stage 93 contains a task of very large size (1417 KiB). The maximum recommended task size is 1000 KiB.
                                                                                