In [None]:
# download, unzip data
# The pipeline below is wrapped in a string literal, so running this cell does
# not execute it. Unquoted, it would fetch the bz2 archive with wget, decompress
# it with bunzip2 and pipe the result through `gsutil cp` into reddit.csv in the
# GCS bucket.
"""
!wget -O- https://nlp.cs.princeton.edu/SARC/1.0/main/train-unbalanced.csv.bz2 | bunzip2 -c | gsutil cp - gs://sarc-bucket-2wx3ce6drvftuy/reddit.csv
"""

In [None]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec

import pandas as pd

import re

import random

import socket

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Look up this machine's own hostname, then resolve it to an IPv4 address.
# The address is used further down as the Spark driver bind address.
driver_hostname = socket.gethostname()
driver_ip = socket.gethostbyname(driver_hostname)

In [None]:
"""
Test to see if Spark is up and running:

numberRDD = sc.parallelize(range(1, 10000))
evens = numberRDD.filter(lambda x: x % 2 == 0)
doubled = numberRDD.map(lambda x: x * 2)
fives = numberRDD.filter(lambda x: x % 5 == 0)
tens = evens.intersection(fives)
sortedTens = tens.sortBy(lambda x: x)
sortedTens
"""

In [None]:
# Spark-on-Kubernetes configuration, handed to SparkConf as (key, value) pairs.
# The pairs are listed one per line, in the same order as before.
conf = pyspark.SparkConf().setAll([
    # Kubernetes API authentication, namespace and driver pod identity
    ('spark.kubernetes.authenticate.caCertFile', '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'),
    ('spark.kubernetes.authenticate.oauthTokenFile', '/var/run/secrets/kubernetes.io/serviceaccount/token'),
    ('spark.kubernetes.authenticate.driver.serviceAccountName', 'spark-driver-sa'),
    ('spark.kubernetes.namespace', 'spark'),
    ('spark.driver.pod.name', 'spark-driver'),
    # Executor count and container image
    ('spark.executor.instances', '16'),
    ('spark.kubernetes.container.image', 'gcr.io/sarcasm-2wx3ce6drvftuy/spark-v2.4.4-worker:latest'),
    # Driver networking: executors reach the driver via the service name,
    # while the driver binds to the address resolved in `driver_ip`
    ('spark.driver.host', 'spark-driver.spark.svc.cluster.local'),
    ('spark.driver.port', '29413'),
    ('spark.driver.bindAddress', driver_ip),
    # Per-executor resources
    ('spark.executor.memory', '6500m'),
    ('spark.executor.cores', '1'),
    # GCS project / credentials environment for the driver
    ('spark.kubernetes.driverEnv.GCS_PROJECT_ID', 'sarcasm-2wx3ce6drvftuy'),
    ('spark.kubernetes.driverEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),
    # Mount the service-account secret into driver and executor pods
    ('spark.kubernetes.driver.secrets.sarc-bucket-sa', '/mnt/secrets'),
    ('spark.kubernetes.executor.secrets.sarc-bucket-sa', '/mnt/secrets'),
    # GCS project / credentials environment for the executors
    ('spark.executorEnv.GCS_PROJECT_ID', 'sarcasm-2wx3ce6drvftuy'),
    ('spark.executorEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),
    # Hadoop GCS connector settings
    ('spark.hadoop.google.cloud.auth.service.account.enable', 'true'),
    ('spark.hadoop.google.cloud.auth.service.account.json.keyfile', '/mnt/secrets/sarc-bucket-sa.json'),
    ('spark.hadoop.fs.gs.project.id', 'sarcasm-2wx3ce6drvftuy'),
    ('spark.hadoop.fs.gs.system.bucket', 'sarc-bucket-2wx3ce6drvftuy'),
])

In [None]:
# Build (or reuse) the session named "sarc" against the in-cluster Kubernetes
# API server, applying the settings collected in `conf`.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc.cluster.local:443")
    .appName("sarc")
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [None]:
# Load in the entire dataset:
# The file is read as tab-separated with no header row, so Spark names the
# columns _c0, _c1, ...; inferSchema=True makes Spark derive column types from
# the data, which costs an extra pass over the file. %time reports the duration.

%time sarc = spark.read.csv("gs://sarc-bucket-2wx3ce6drvftuy/reddit.csv", inferSchema=True, header=False, sep = '\t')

In [None]:
# Rename columns:
# Map the ten positional columns (_c0 .. _c9) to descriptive names, one source
# column per name. Previously _c3 was never selected and _c6 was selected twice
# (as both 'ups' and 'downs'), which shifted 'subreddit', 'score' and 'ups' by
# one column and made 'downs' a copy of 'ups'.
# NOTE(review): the order below assumes the SARC 1.0 layout
# (label, comment, author, subreddit, score, ups, downs, date, created_utc,
# parent_comment) -- confirm against the dataset readme.

sarc = sarc.select(
    col('_c0').alias('label'),
    col('_c1').alias('comment'),
    col('_c2').alias('author'),
    col('_c3').alias('subreddit'),
    col('_c4').alias('score'),
    col('_c5').alias('ups'),
    col('_c6').alias('downs'),
    col('_c7').alias('date'),
    col('_c8').alias('created_utc'),
    col('_c9').alias('parent_comment'),
)

In [None]:
# Print the column names and types of the renamed DataFrame.
sarc.printSchema()

In [None]:
# Preview the first 30 rows.
sarc.show(30)

In [None]:
# Get number of samples:
# count() runs over the whole dataset; %time reports how long that takes.

%time sarc.count()

In [None]:
# Get value count of 'score' feature:

%time sarc.groupBy('score').count().orderBy(["count"], ascending=[0]).show()

In [None]:
# Remove the columns that are not expected to be needed, in a single call:

sarc = sarc.drop('created_utc', 'date', 'ups', 'downs')

In [None]:
# Count the number of words in a comment:

def n_comment_words(text):
    """Approximate the number of words in ``text``.

    The count is the number of regex matches for word boundaries: a run of
    whitespace, a period followed by whitespace, a period followed by word
    characters, or the final character of the string.

    Args:
        text: The comment text, or None when the source value is null.

    Returns:
        int: The approximate word count; 0 for None or an empty string.
    """
    # Null comments arrive in a Python UDF as None; re.findall would raise
    # TypeError on them, so treat them as containing no words.
    if text is None:
        return 0
    # Raw string: same pattern as before, without invalid "\s"-style escapes.
    return len(re.findall(r"\s+|\.\s|\.\w+(?!\.)|.$", text))

# Register the word counter with the session under the name "n_comment_words",
# declaring a ShortType return value.
spark.udf.register("n_comment_words", n_comment_words, ShortType()) 

In [None]:
# Wrap the same Python function as a column-level UDF (ShortType result) so it
# can be used in DataFrame expressions such as select() and withColumn().
n_comment_words_table = F.udf(n_comment_words, ShortType())

In [None]:
# Applying udf, displaying results:
# Preview only: the word count is aliased to "n_words" in this projection and
# `sarc` itself is not modified. %time reports how long the preview takes.

%time sarc.select("comment", n_comment_words_table("comment").alias("n_words")).show()

In [None]:
# Adding new column to the original spark dataframe:
# The per-comment word count is stored under the name "n_comment_words"
# (not "n_words", which was only the alias used in the preview).

sarc = sarc.withColumn("n_comment_words", n_comment_words_table("comment"))

In [None]:
# Keep only comments with at least three words (shorter ones are discarded):

sarc = sarc.where(col('n_comment_words') >= 3)  # threshold arbitrarily chosen

In [None]:
# Re-count the rows remaining after the minimum-length filter; %time reports
# how long the count takes.
%time print("New total number of records: {}".format(sarc.count()))

In [None]:
# Discard over-sized comments, keeping those with at most fifty words:

sarc = sarc.where(col('n_comment_words') <= 50)  # threshold arbitrarily chosen

In [None]:
# Re-count the rows remaining after the maximum-length filter; %time reports
# how long the count takes.
%time print("New total number of records: {}".format(sarc.count()))

In [None]:
# Dropping the helper word-count column:
# The column was added to `sarc` as 'n_comment_words'; 'n_words' was only an
# alias used in the preview select. Dropping a non-existent column is a silent
# no-op in Spark, so the old call left the helper column in the DataFrame.

sarc = sarc.drop('n_comment_words')

In [None]:
# Preview the first 30 rows of the filtered DataFrame.
sarc.show(30)

In [None]:
# Export the filtered dataset to the bucket. mode='overwrite' replaces any
# previous export so the notebook can be re-run from a fresh kernel; the default
# save mode raises an error when the target path already exists.
sarc.write.csv('gs://sarc-bucket-2wx3ce6drvftuy/reddit_trunc.csv', mode='overwrite')