# NLP

### Bucket checks

In [49]:
!aws s3 ls


2023-08-29 23:43:16 sagemaker-studio-692960231031-wo7kgoszj2g
2023-08-29 23:50:01 sagemaker-us-east-1-692960231031
2023-08-30 00:34:21 vad49
2023-09-16 16:02:10 vad49-labdata


In [66]:
!aws s3 ls s3://project17-bucket-alex/books/


2023-11-17 22:45:03     717850 pg1727.txt
2023-11-17 22:45:03    1201729 pg2554.txt
2023-11-17 22:45:03     513776 pg33.txt
2023-11-17 22:45:03     142082 pg5200.txt
2023-11-17 22:45:03     306317 pg64317.txt


In [43]:
# save books into bucket if needed
if True is False:
    !aws s3 cp ../../data/external-data/books/ s3://project17-bucket-alex/books --recursive --exclude "*" --exclude ".ipynb_checkpoints/*" --include "*.txt"


upload: ../../data/external-data/books/pg1727.txt to s3://project17-bucket-alex/books/pg1727.txt
upload: ../../data/external-data/books/pg33.txt to s3://project17-bucket-alex/books/pg33.txt
upload: ../../data/external-data/books/.ipynb_checkpoints/pg64317-checkpoint.txt to s3://project17-bucket-alex/books/.ipynb_checkpoints/pg64317-checkpoint.txt
upload: ../../data/external-data/books/pg2554.txt to s3://project17-bucket-alex/books/pg2554.txt
upload: ../../data/external-data/books/pg5200.txt to s3://project17-bucket-alex/books/pg5200.txt
upload: ../../data/external-data/books/.ipynb_checkpoints/pg2554-checkpoint.txt to s3://project17-bucket-alex/books/.ipynb_checkpoints/pg2554-checkpoint.txt
upload: ../../data/external-data/books/.ipynb_checkpoints/pg1727-checkpoint.txt to s3://project17-bucket-alex/books/.ipynb_checkpoints/pg1727-checkpoint.txt
upload: ../../data/external-data/books/.ipynb_checkpoints/pg33-checkpoint.txt to s3://project17-bucket-alex/books/.ipynb_checkpoints/pg33-check

### Setup

In [10]:
if True is True: # set to true only for the first un
    # Setup - Run only once per Kernel App
    %conda install openjdk -y

    # install PySpark
    %pip install pyspark==3.2.0 s3fs pyarrow spark-nlp

    # restart kernel
    from IPython.core.display import HTML
    HTML("<script>Jupyter.notebook.kernel.restart()</script>")


Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, length, isnan, when, count, regexp_extract, weekofyear, hour, avg, to_date, unix_timestamp, lit, corr


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 150) 
#pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)


In [12]:
# Import pyspark and build Spark session

spark = (
    SparkSession.builder.appName("PySparkApp")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    #.config("spark-jars-packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
    .getOrCreate()
)

print(spark.version)

3.2.0


In [13]:
#from sparknlp.base import DocumentAssembler
#from sparknlp.annotator import LowerCase, Tokenizer
#import sparknlp


from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp

### Bring in submissions and comments data

In [14]:
%%time


required_columns = ['subreddit', 'title', 'selftext', 'score', 'created_utc', 'url']


# read the full year

# Read in data from project bucket
bucket = "project17-bucket-alex"

# List of 12 directories each containing 1 month of data
directories = ["project_2022_"+str(i)+"/submissions" for i in range(1,13)]

# Iterate through 12 directories and merge each monthly data set to create one big data set
submissions = None
for directory in directories:
    s3_path = f"s3a://{bucket}/{directory}"
    month_df = spark.read.parquet(s3_path).select(*required_columns)
    
    if submissions is None:
        submissions = month_df
    else:
        submissions = submissions.union(month_df)

        


23/11/17 22:11:18 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
23/11/17 22:11:25 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CPU times: user 59.1 ms, sys: 8.47 ms, total: 67.5 ms
Wall time: 16.4 s


In [15]:
submissions_small = submissions.sample(withReplacement=False, fraction=0.001, seed=42)



In [16]:
# create small dfs

use_small = True  # to easily swap between the small and small dfs
submissions_active = submissions_small if use_small else submissions



In [17]:
#cache - only for when working with the small version
#submissions_active.cache()


DataFrame[subreddit: string, title: string, selftext: string, score: bigint, created_utc: timestamp, url: string]

### Process Submissions Data


In [18]:
submissions_active.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- score: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- url: string (nullable = true)



### 2.2 Conduct basic data quality checks! Make sure there are no missing values, check the length of the comments, and remove rows of data that might be corrupted. Even if you think all your data is perfect, you still need to demonstrate that with your analysis.



Let's remove submissions without a body should obviously go, but what about the submissions without a self text (deleted, removed or empty). We can keep where the author is empty.

In [19]:

def clean_submissions(df: DataFrame) -> DataFrame:
    
    # define conditions
    conditions = (col('selftext') != "[removed]") & (col('selftext') != "[deleted]") & (col('selftext').isNotNull() & (col('selftext') != ""))

    
    # apply filter
    cleaned_df = df.filter(conditions)
  

    return cleaned_df




In [20]:
submissions_active = clean_submissions(submissions_active)


In [21]:
#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")

display(submissions_active.limit(5).toPandas())

                                                                                

Unnamed: 0,subreddit,title,selftext,score,created_utc,url
0,relationship_advice,I need advice,So my gf and I are both 19 and live separately but both in college and she works. We will live together within 2ish years after she graduates but ...,1,2022-01-25 17:01:32,https://www.reddit.com/r/relationship_advice/comments/scid7f/i_need_advice/
1,relationship_advice,Too smart to be this stupid; logics vs heart.,"My estranged bf of the last almost 2yrs has vowed to make his life mission, along others aid, to make me have misery and regret. \n Call me cr...",1,2022-01-18 17:12:58,https://www.reddit.com/r/relationship_advice/comments/s72fh8/too_smart_to_be_this_stupid_logics_vs_heart/
2,antiwork,High hopes for the future,"Just kidding, this sub is doomed\n\nHey mods, stop doing interviews \n\nYou don’t speak for the people of this sub. You are a janitor who’s only p...",0,2022-01-27 19:05:44,https://www.reddit.com/r/antiwork/comments/se5xf0/high_hopes_for_the_future/
3,NoStupidQuestions,I think I just ejaculated without trying,Okay so has anyone else have this happen. Right altering peeing. I get extreme pain I mean like eye shutting crouching over pain and there this cl...,1,2022-01-05 19:26:45,https://www.reddit.com/r/NoStupidQuestions/comments/rwuuej/i_think_i_just_ejaculated_without_trying/
4,socialskills,"Is it weird to tell my depressed friend I'm ""proud of him"" after he's overcome a bad mental phase?","One of my best friends suffers from depression and experiences ""down phases"" in irregular intervals. He always has a particularly bad one in winte...",131,2022-01-13 18:13:09,https://www.reddit.com/r/socialskills/comments/s35q0j/is_it_weird_to_tell_my_depressed_friend_im_proud/


In [22]:
# use regex the remove text after 'Edit: ' or 'edit: '

# The regular expression pattern
pattern = r"(?i)^(.*?)(?=Edit:|$)"

# Apply the regular expression to create a new column with the modified text
submissions_active = submissions_active.withColumn("selftext_modified", regexp_extract(col("selftext"), pattern, 1))



In [23]:
# define stories as posts longer than a certain length 

story_length = 4500

submissions_active = submissions_active.filter(length(col("selftext")) > story_length)


In [24]:
# keep only the 25% most engaging posts


# Calculate the approximate percentile of the 'score' column
quantile_value = submissions_active.approxQuantile("score", [0.85], 0.05)  # 0.05 is the relative error

# Filter the DataFrame to keep scores above or equal to this value
submissions_active = submissions_active.filter(col("score") >= quantile_value[0])



                                                                                

In [25]:
#display(f"submissions shape: ({submissions_active.count()}, {len(submissions_active.columns)})")


display(submissions_active.limit(5).toPandas())

Unnamed: 0,subreddit,title,selftext,score,created_utc,url,selftext_modified
0,relationship_advice,Is dating online cheating.. the whole story,Due to the overwhelming messages and requests for the whole story on about my last post. I decided to do a tell-all. So I hope you have a minute. ...,29,2022-07-13 15:48:56,https://www.reddit.com/r/relationship_advice/comments/vy6fh0/is_dating_online_cheating_the_whole_story/,
1,AmItheAsshole,AITA for expecting my son to share his room?,"Background: My (40sf) husband (40sm) and I bought a 3-bedroom house a few years ago, shortly before the panini. We of course took the master bedro...",16459,2022-10-26 08:51:01,https://www.reddit.com/r/AmItheAsshole/comments/ydt2w5/aita_for_expecting_my_son_to_share_his_room/,
2,AmItheAsshole,"WIBTA for taking my (15f) laundry basket full of clothes, which my stepdad (49m) threw out for not putting them away, out of the trash?",Sorry for bad English!\n\nThe morning before my clothes were thrown away my mom put them in a laundry basket beside my door. She did tell me ofcou...,95,2022-11-07 19:59:01,https://www.reddit.com/r/AmItheAsshole/comments/yoyf1r/wibta_for_taking_my_15f_laundry_basket_full_of/,
3,relationship_advice,My housemate (25f) did something I (20f) consider morally unacceptable and its causing a lot of conflict in the house (6 students).,Tl;dr version: I live in a student houseshare. One of my housemates is a student nurse and was extremely inappropriate with a patient which I foun...,421,2022-11-04 20:32:06,https://www.reddit.com/r/relationship_advice/comments/ym9kw7/my_housemate_25f_did_something_i_20f_consider/,
4,socialskills,How to Talk About Yourself (and How to Have Good Conversation),"**TL:DR talk about yourself by saying just a little bit in a way that's relevant to the topic at hand, and give the other person the implicit choi...",27,2022-12-01 14:46:48,https://www.reddit.com/r/socialskills/comments/z9polj/how_to_talk_about_yourself_and_how_to_have_good/,


Books

In [26]:
#import os

#def process_gutenberg_books(directory_path):
#    all_text = []
#    for filename in os.listdir(directory_path):
#        if filename.endswith('.txt'):  # Assuming the files are in .txt format
#            file_path = os.path.join(directory_path, filename)
#            with open(file_path, 'r', encoding='utf-8') as file:
#                content = file.read()
#                start_idx = content.find("*** START OF THE PROJECT GUTENBERG")
#                if start_idx != -1:
#                    # Find the end of the line and start from the next line
#                    start_idx = content.find('\n', start_idx) + 1
#                    content = content[start_idx:]
#                all_text.append(content)
#    return ' '.join(all_text)


#gutenberg_text = process_gutenberg_books('../../data/external-data/books')

#display(len(gutenberg_text))
#gutenberg_text[:500]

In [27]:
#submissions_text = submissions_active.select("selftext").rdd.flatMap(lambda x: x).collect()
#submissions_text = ' '.join(submissions_text)

#display(len(submissions_text))
#submissions_text[:500]

In [28]:
#combined_text = gutenberg_text + " " + submissions_text

#display(len(combined_text))
#display(combined_text[:500])

In [71]:
%%writefile ./project-nlp-posts-and-books-processing.py

import os
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col



logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    #parser.add_argument("--s3_dataset_path_commments", type=str, help="Path of dataset in S3 for reddit comments")
    parser.add_argument("--s3_dataset_path_submissions", type=str, help="Path of dataset in S3 for reddit submissions")
    parser.add_argument("--s3_dataset_path_books", type=str, help="Path of dataset in S3 for gubenberg books")
    parser.add_argument("--s3_output_bucket", type=str, help="s3 output bucket")
    parser.add_argument("--s3_output_prefix", type=str, help="s3 output prefix")
    #parser.add_argument("--subreddits", type=str, help="comma separate list of subreddits of interest")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()
    logger.info(f"spark version = {spark.version}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    ###### Submissions ###
    
    
    required_columns = ['subreddit', 'title', 'selftext', 'score', 'created_utc', 'url']

    # read the full year

    # Read in data from project bucket
    #bucket = "project17-bucket-alex"

    # List of 12 directories each containing 1 month of data
    directories = ["project_2022_"+str(i)+"/submissions" for i in range(1,13)]

    # Iterate through 12 directories and merge each monthly data set to create one big data set
    submissions = None
    for directory in directories:
        s3_path = f"s3a://{args.s3_output_bucket}/{directory}"
        month_df = spark.read.parquet(s3_path).select(*required_columns)

        if submissions is None:
            submissions = month_df
        else:
            submissions = submissions.union(month_df)

    
    #submissions = spark.read.parquet(args.s3_dataset_path_submissions, header=True)
    #submissions = spark.read.text(args.s3_dataset_path_submissions)
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_prefix}/submissions"
    logger.info(f"going to write submissions for submissions in {s3_path}")
    submissions.write.mode("overwrite").parquet(s3_path)
    
    
    
    
    
    
    books = spark.read.text(args.s3_dataset_path_books)
    s3_path = f"s3://{args.s3_output_bucket}/{args.s3_output_prefix}/books"
    logger.info(f"going to write submissions for books in {s3_path}")
    books.write.mode("overwrite").parquet(s3_path)

    
if __name__ == "__main__":
    main()

Overwriting ./project-nlp-posts-and-books-processing.py


In [70]:
%%time
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-project",
    framework_version="3.3",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# s3 paths
session = sagemaker.Session()
bucket = "project17-bucket-alex"
s3_dataset_path_submissions = "s3://project17-bucket-alex/books/*.txt"
s3_dataset_path_books = "s3://project17-bucket-alex/books/*.txt"
output_prefix_data = "stories-and-books-nlp"
output_prefix_logs = f"spark_logs"
    
# run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
spark_processor.run(
    submit_app="./project-nlp-posts-and-books-processing.py",
    arguments=[
        "--s3_dataset_path_submissions",
        s3_dataset_path_submissions,
        "--s3_dataset_path_books",
        s3_dataset_path_books,
        "--s3_output_bucket",
        bucket,
        "--s3_output_prefix",
        output_prefix_data,

    ],
    spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
    logs=False,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating processing-job with name sm-spark-project-2023-11-18-00-41-31-232


...................................................................................!CPU times: user 639 ms, sys: 77.6 ms, total: 716 ms
Wall time: 7min 4s


In [29]:
# Read the Gutenberg books into a Spark DataFrame
gutenberg_df = spark.read.text("../../data/external-data/books/*.txt")
gutenberg_df = gutenberg_df.withColumnRenamed("value", "text")



In [30]:
from pyspark.sql.functions import concat_ws, col

# Process `submissions_active` to extract and transform the 'selftext' column
submissions_df = submissions_active.select("selftext")





In [31]:
# For the Project Gutenberg DataFrame
gutenberg_df = gutenberg_df.withColumnRenamed("value", "text")

# For the submissions DataFrame
submissions_df = submissions_df.withColumnRenamed("selftext", "text")

# Now you can union them
combined_df = gutenberg_df.unionByName(submissions_df)

In [None]:
%%time
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-nlp-project",
    framework_version="3.3",
    role=role,
    instance_count=2,  # Adjust as needed
    instance_type="ml.t3.xlarge",
    max_runtime_in_seconds=7200,
)

session = sagemaker.Session()

# Define your S3 paths and other parameters
s3_dataset_path_submissions_bucket = "project17-bucket-alex"
s3_dataset_path_gutenberg = "s3://project17-bucket-alex/books/*.txt"
output_bucket = "project17-bucket-alex/stories-and-books-nlp/"

# Run the PySpark job
spark_processor.run(
    submit_app="project-nlp-posts-and-books-processing.py",
    arguments=[
        "--s3_dataset_path_submissions_bucket", s3_dataset_path_submissions_bucket,
        "--s3_dataset_path_gutenberg", s3_dataset_path_gutenberg,
        "--s3_output_bucket", output_bucket,
    ],
    spark_event_logs_s3_uri=f"s3://{output_bucket}/spark_event_logs",
    logs=True,
)
