In [21]:
# One-time data staging: download, unzip, upload.
# The command is wrapped in a string literal so it does NOT run when this cell
# is executed. It streams the SARC 1.0 train-unbalanced archive with wget,
# decompresses it with bunzip2 and pipes the result to gsutil, which stores it
# as reddit.csv in the bucket; nohup/& keep it running in the background.
# To actually stage the data, paste the `!nohup ...` line into its own cell.

"""
!nohup wget -qO- https://nlp.cs.princeton.edu/SARC/1.0/main/train-unbalanced.csv.bz2 | bunzip2 -c | gsutil cp - gs://sarc-bucket-3wx3ce6drvftuy/reddit.csv &
"""

'\n!nohup wget -qO- https://nlp.cs.princeton.edu/SARC/1.0/main/train-unbalanced.csv.bz2 | bunzip2 -c | gsutil cp - gs://sarc-bucket-3wx3ce6drvftuy/reddit.csv &\n'

In [22]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec

import pandas as pd

import re

import random

import socket

import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
# IP address that this host's own hostname resolves to; passed to Spark below
# as spark.driver.bindAddress.
driver_ip = socket.gethostbyname(socket.gethostname())

In [24]:
"""
Test to see if Spark is up and running:

numberRDD = sc.parallelize(range(1, 10000))
evens = numberRDD.filter(lambda x: x % 2 == 0)
doubled = numberRDD.map(lambda x: x * 2)
fives = numberRDD.filter(lambda x: x % 5 == 0)
tens = evens.intersection(fives)
sortedTens = tens.sortBy(lambda x: x)
sortedTens
"""

'\nTest to see if Spark is up and running:\n\nnumberRDD = sc.parallelize(range(1, 10000))\nevens = numberRDD.filter(lambda x: x % 2 == 0)\ndoubled = numberRDD.map(lambda x: x * 2)\nfives = numberRDD.filter(lambda x: x % 5 == 0)\ntens = evens.intersection(fives)\nsortedTens = tens.sortBy(lambda x: x)\nsortedTens\n'

In [62]:
# Spark-on-Kubernetes settings, grouped by concern. The pairs live inside a
# list literal, so no backslash line continuations are needed.
spark_settings = [
    # Kubernetes API authentication and namespace
    ('spark.kubernetes.authenticate.caCertFile', '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'),
    ('spark.kubernetes.authenticate.oauthTokenFile', '/var/run/secrets/kubernetes.io/serviceaccount/token'),
    ('spark.kubernetes.authenticate.driver.serviceAccountName', 'spark-driver-sa'),
    ('spark.kubernetes.namespace', 'spark'),

    # Driver identity and networking
    ('spark.driver.pod.name', 'spark-driver'),
    ('spark.driver.host', 'spark-driver.spark.svc.cluster.local'),
    ('spark.driver.port', '29413'),
    ('spark.driver.bindAddress', driver_ip),

    # Executor sizing and container image
    ('spark.executor.instances', '16'),
    ('spark.executor.memory', '6700m'),
    ('spark.executor.cores', '1'),
    ('spark.kubernetes.container.image', 'gcr.io/sarcasm-3wx3ce6drvftuy/spark-v2.4.4-worker:latest'),

    # Service-account secret mounted into driver and executors
    ('spark.kubernetes.driver.secrets.sarc-bucket-sa', '/mnt/secrets'),
    ('spark.kubernetes.executor.secrets.sarc-bucket-sa', '/mnt/secrets'),

    # Environment variables for driver and executors
    ('spark.kubernetes.driverEnv.GCS_PROJECT_ID', 'sarcasm-3wx3ce6drvftuy'),
    ('spark.kubernetes.driverEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),
    ('spark.executorEnv.GCS_PROJECT_ID', 'sarcasm-3wx3ce6drvftuy'),
    ('spark.executorEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),

    # Hadoop GCS connector
    ('spark.hadoop.google.cloud.auth.service.account.enable', 'true'),
    ('spark.hadoop.google.cloud.auth.service.account.json.keyfile', '/mnt/secrets/sarc-bucket-sa.json'),
    ('spark.hadoop.fs.gs.project.id', 'sarcasm-3wx3ce6drvftuy'),
    ('spark.hadoop.fs.gs.system.bucket', 'sarc-bucket-3wx3ce6drvftuy'),
]

conf = pyspark.SparkConf().setAll(spark_settings)

In [63]:
# Create (or reuse) the session against the in-cluster Kubernetes API server,
# applying the settings collected in `conf`.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc.cluster.local:443")
    .appName("sarc")
    .config(conf=conf)
    .getOrCreate()
)

In [64]:
# Handle to the SparkContext behind the session; it is stopped in the last
# cell of the notebook via sc.stop().
sc = spark.sparkContext

In [73]:
# Load in the entire dataset:
# The file is tab-separated with no header row, so columns come back with
# positional names and are renamed in the next cell.
# NOTE(review): inferSchema=True makes Spark scan the data an extra time to
# guess column types; once the types are confirmed, passing an explicit
# schema= would avoid that pass.

%time sarc = spark.read.csv("gs://sarc-bucket-3wx3ce6drvftuy/reddit.csv", inferSchema=True, header=False, sep = '\t')

CPU times: user 14.8 ms, sys: 5.35 ms, total: 20.1 ms
Wall time: 1min 30s


In [77]:
# Give the positional CSV columns (_c0 .. _c9) meaningful names:

_column_names = {
    '_c0': 'label',
    '_c1': 'comment',
    '_c2': 'author',
    '_c3': 'subreddit',
    '_c4': 'score',
    '_c5': 'ups',
    '_c6': 'downs',
    '_c7': 'date',
    '_c8': 'created_utc',
    '_c9': 'parent_comment',
}

for _old_name, _new_name in _column_names.items():
    sarc = sarc.withColumnRenamed(_old_name, _new_name)

In [78]:
# Inspect the column names and inferred types of the renamed DataFrame.
# (This previously referenced `sarc1`, a name that is never defined in this
# notebook, so the cell raised NameError on a fresh kernel.)
sarc.printSchema()

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- ups: integer (nullable = true)
 |-- downs: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- parent_comment: string (nullable = true)



In [79]:
# Preview the first 10 rows.
sarc.show(10)

+-----+--------------------+----------+---------+-----+---+-----+-------+-----------+--------------------+
|label|             comment|    author|subreddit|score|ups|downs|   date|created_utc|      parent_comment|
+-----+--------------------+----------+---------+-----+---+-----+-------+-----------+--------------------+
|    0|50 and it forces ...|Mattys1174|   NHLHUT|    1| -1|   -1|2017-02| 1485907243|Does anyone know ...|
|    0|     Marchand who ;)|Mattys1174|   NHLHUT|    2| -1|   -1|2017-02| 1486003855|This post wasn't ...|
|    0|Maybe theyll hire...|Mattys1174|   hockey|    5| -1|   -1|2017-02| 1486525469|Bruins Relieve Cl...|
|    0| Thats a lotta dough|Mattys1174|   NHLHUT|    1| -1|   -1|2017-02| 1486584244|Bergeron for 500$...|
|    0|Nah youre the bes...|Mattys1174|   NHLHUT|    1| -1|   -1|2017-02| 1486602648|Second best strea...|
|    0|                  +1|Mattys1174|   NHLHUT|    2| -1|   -1|2017-02| 1486658528|As someone sittin...|
|    0|Yea the newborns ...|Mattys117

In [80]:
# Get number of samples:
# (%time reports how long the count takes)

%time sarc.count()

CPU times: user 5.24 ms, sys: 3.22 ms, total: 8.46 ms
Wall time: 39.6 s


187401972

In [81]:
# Get value count of 'score' feature:

%time sarc.groupBy('score').count().orderBy(["count"], ascending=[0]).show()

+-----+--------+
|score|   count|
+-----+--------+
|    1|69444930|
|    2|32356724|
|    3|17450987|
|    0| 8140676|
|    4| 7405748|
|    5| 6944588|
|    6| 5232785|
|    7| 3968578|
|    8| 3115913|
|   -1| 2812235|
|    9| 2502566|
|   10| 2051214|
|   11| 1722294|
|   12| 1464941|
|   -2| 1421861|
|   13| 1263615|
|   14| 1101061|
|   15|  966318|
|   16|  855833|
|   -3|  765137|
+-----+--------+
only showing top 20 rows

CPU times: user 15.3 ms, sys: 5.49 ms, total: 20.8 ms
Wall time: 1min 14s


In [105]:
# Dropping columns we probably won't need:
#   - created_utc, date
#   - ups, downs, author, score: these features could be helpful in a reddit
#     context but would be hard to generalize outside of reddit
#
# DataFrame.drop accepts several column names, so one call replaces the six
# separate reassignments; the explanation is a real comment rather than a
# bare string literal sitting between statements.

sarc = sarc.drop('created_utc', 'date', 'ups', 'downs', 'author', 'score')

In [83]:
# Count the number of words in a comment:

def n_comment_words(text):
    """Approximate the number of words in ``text``.

    Counts non-overlapping regex matches of: a run of whitespace, a period
    followed by whitespace, a period followed by word characters, or the last
    character of the string. For ordinary space-separated text this comes out
    as (number of gaps between words) + 1.

    Args:
        text: the comment body, or None for a missing value.

    Returns:
        The match count as an int; 0 for None or an empty string.
    """
    if text is None:
        # A missing comment reaches a Python UDF as None; re.findall would
        # raise TypeError on it and fail the whole Spark job.
        return 0
    # Raw string: identical pattern, but "\s", "\." and "\w" are no longer
    # invalid string escape sequences (a warning on current Python versions).
    return len(re.findall(r"\s+|\.\s|\.\w+(?!\.)|.$", text))

# Register the Python function under the SQL name "n_comment_words", declaring
# ShortType as its return type.
spark.udf.register("n_comment_words", n_comment_words, ShortType()) 

<function __main__.n_comment_words(text)>

In [84]:
# Column-expression wrapper around the same function (ShortType result); this
# is the form used in the select()/withColumn() calls below.
n_comment_words_table = F.udf(n_comment_words, ShortType())

In [85]:
# Applying udf, displaying results:
# `n_words` is only an alias inside this preview; the result is shown, not
# assigned, so `sarc` itself is unchanged by this cell.

%time sarc.select("comment", n_comment_words_table("comment").alias("n_words")).show()

+--------------------+-------+
|             comment|n_words|
+--------------------+-------+
|50 and it forces ...|     11|
|     Marchand who ;)|      3|
|Maybe theyll hire...|     27|
| Thats a lotta dough|      4|
|Nah youre the bes...|     10|
|                  +1|      1|
|Yea the newborns ...|     56|
|Patches isnt gett...|      5|
|                 Hah|      1|
|Thats very true a...|      5|
|Im not a guy if y...|     13|
|         KATTENSHIRK|      1|
|Dude, stop being ...|      5|
|I would not get i...|      9|
|I heard from my b...|     22|
|Torey, and torey'...|     12|
|             I lov u|      3|
|You did this real...|      5|
|        What te fuck|      3|
|Me too honestly, ...|     14|
+--------------------+-------+
only showing top 20 rows

CPU times: user 19.9 ms, sys: 3.96 ms, total: 23.9 ms
Wall time: 1.13 s


In [86]:
# Adding new column to the original spark dataframe:
# (withColumn returns a new DataFrame, which is rebound to `sarc`)

sarc = sarc.withColumn("n_comment_words", n_comment_words_table("comment"))

In [87]:
# Keep only comments with at least three words (shorter ones are removed):

sarc = sarc.filter(sarc['n_comment_words']>=3) # threshold arbitrarily chosen

In [88]:
# Row count after the minimum-length filter.
# NOTE(review): no .cache()/.persist() appears in this notebook, so each
# count() presumably re-runs the Python UDF over the full dataset -- confirm
# and consider caching if these checks are kept.
%time print("New total number of records: {}".format(sarc.count()))

New total number of records: 163896185
CPU times: user 53.6 ms, sys: 13.5 ms, total: 67 ms
Wall time: 3min 35s


In [89]:
# Getting rid of over-sized comments: keep those with at most 50 words.

sarc = sarc.filter(sarc['n_comment_words']<=50) # threshold arbitrarily chosen

In [90]:
# Row count after the maximum-length filter.
%time print("New total number of records: {}".format(sarc.count()))

New total number of records: 163131471
CPU times: user 55.9 ms, sys: 22.7 ms, total: 78.6 ms
Wall time: 5min 2s


In [94]:
# Remove the helper word-count column now that filtering is done.
# `n_words` only ever existed as an alias in the preview select, so naming it
# here is a harmless no-op (drop ignores column names that are not present).

sarc = sarc.drop('n_words', 'n_comment_words')

In [95]:
# Preview the first 10 rows after the length filters.
sarc.show(10)

+-----+--------------------+----------+---------+-----+--------------------+
|label|             comment|    author|subreddit|score|      parent_comment|
+-----+--------------------+----------+---------+-----+--------------------+
|    0|50 and it forces ...|Mattys1174|   NHLHUT|    1|Does anyone know ...|
|    0|     Marchand who ;)|Mattys1174|   NHLHUT|    2|This post wasn't ...|
|    0|Maybe theyll hire...|Mattys1174|   hockey|    5|Bruins Relieve Cl...|
|    0| Thats a lotta dough|Mattys1174|   NHLHUT|    1|Bergeron for 500$...|
|    0|Nah youre the bes...|Mattys1174|   NHLHUT|    1|Second best strea...|
|    0|Patches isnt gett...|Mattys1174|   NHLHUT|    1|Confessions of a ...|
|    0|Thats very true a...|Mattys1174|   NHLHUT|    1|Burning out a lin...|
|    0|Im not a guy if y...|Mattys1174|   NHLHUT|    1|Just a girl looki...|
|    0|Dude, stop being ...|Mattys1174|   NHLHUT|   -5|I have a post wit...|
|    0|I would not get i...|Mattys1174|   NHLHUT|    1|Lol! Seriously! N...|

In [101]:
# Build one "context" string per row: the parent comment, a single space,
# then the reply itself.
# NOTE(review): concat() returns NULL when any input is NULL, so a row with a
# missing parent_comment or comment gets a NULL context -- confirm intended.

_context_expr = F.concat(F.col("parent_comment"), F.lit(" "), F.col("comment"))
sarc = sarc.withColumn('context', _context_expr)

In [106]:
# The two source text columns are now redundant with `context`; drop both in
# a single call, then preview the result.

sarc = sarc.drop('comment', 'parent_comment')

sarc.show(10)

+-----+---------+--------------------+
|label|subreddit|             context|
+-----+---------+--------------------+
|    0|   NHLHUT|Does anyone know ...|
|    0|   NHLHUT|This post wasn't ...|
|    0|   hockey|Bruins Relieve Cl...|
|    0|   NHLHUT|Bergeron for 500$...|
|    0|   NHLHUT|Second best strea...|
|    0|   NHLHUT|Confessions of a ...|
|    0|   NHLHUT|Burning out a lin...|
|    0|   NHLHUT|Just a girl looki...|
|    0|   NHLHUT|I have a post wit...|
|    0|   NHLHUT|Lol! Seriously! N...|
+-----+---------+--------------------+
only showing top 10 rows



In [108]:
# Writing truncated dataframe to gcs:
# Spark writes a directory of part files at this path. mode='overwrite'
# replaces a previous export, so re-running the notebook no longer stops here
# with a "path already exists" error (the default save mode).

sarc.write.csv('gs://sarc-bucket-3wx3ce6drvftuy/reddit_trunc.csv', mode='overwrite')

In [110]:
# Row counts per subreddit, largest first (top 100 shown):

(
    sarc.groupBy('subreddit')
    .count()
    .sort('count', ascending=False)
    .show(100)
)

+--------------------+--------+
|           subreddit|   count|
+--------------------+--------+
|           AskReddit|13912287|
|            politics| 3987395|
|               funny| 3553485|
|                pics| 3090176|
|                 nfl| 2931774|
|                 nba| 2835460|
|     leagueoflegends| 2474871|
|           worldnews| 2461930|
|       todayilearned| 2230701|
|              gaming| 2141765|
|              videos| 2132303|
|       AdviceAnimals| 1982689|
|                news| 1930852|
|                 WTF| 1883286|
|          The_Donald| 1729085|
|        pcmasterrace| 1638541|
|              hockey| 1602605|
|              soccer| 1551522|
|              movies| 1310982|
|     GlobalOffensive| 1188153|
|       SquaredCircle| 1147085|
|                 CFB| 1142279|
|               DotA2| 1103664|
|                gifs| 1019627|
|             atheism|  809594|
|          technology|  770018|
|      TumblrInAction|  738262|
|            baseball|  736908|
|      S

In [117]:
# Filtering for just politics subreddit:
# (exact, case-sensitive match on the subreddit name)

politics = sarc.where(col('subreddit')=='politics')

In [118]:
# Preview the politics subset (show() prints the first 20 rows by default).
politics.show()

+-----+---------+--------------------+
|label|subreddit|             context|
+-----+---------+--------------------+
|    0| politics|I think he's heav...|
|    0| politics|All it takes is t...|
|    0| politics|You know he's nev...|
|    0| politics|Go ahead and wast...|
|    0| politics|"Right when the m...|
|    0| politics|he tweets, you ge...|
|    0| politics|I am Canadian, I ...|
|    0| politics|"Yeah, ""obsessed...|
|    0| politics|I thought better ...|
|    0| politics|same strategy use...|
|    0| politics|CPAC leader: The ...|
|    0| politics|Project Veritas '...|
|    0| politics|It is amusing but...|
|    0| politics|The Trump-tanic h...|
|    0| politics|Vladimir Putin sa...|
|    0| politics|I've heard lots o...|
|    0| politics|That could be an ...|
|    0| politics|Bet he uses Netsc...|
|    0| politics|I'm no lawyer, bu...|
|    0| politics|Why hasn't she be...|
+-----+---------+--------------------+
only showing top 20 rows



In [119]:
# Row count of the politics subset.
%time print("New total number of records: {}".format(politics.count()))

New total number of records: 3987395
CPU times: user 42.1 ms, sys: 12.9 ms, total: 55 ms
Wall time: 1min 50s


In [120]:
# Writing politics dataframe to gcs:

%time politics.write.csv('gs://sarc-bucket-3wx3ce6drvftuy/politics.csv')

CPU times: user 49 ms, sys: 15.8 ms, total: 64.8 ms
Wall time: 3min 19s


In [121]:
# Filtering for just science subreddit:
# (exact, case-sensitive match on the subreddit name), then report how many
# rows the subset contains.

science = sarc.where(col('subreddit')=='science')
%time print("New total number of records: {}".format(science.count()))

New total number of records: 242514
CPU times: user 44.6 ms, sys: 10.2 ms, total: 54.8 ms
Wall time: 1min 46s


In [122]:
# Writing science dataframe to gcs:

%time science.write.csv('gs://sarc-bucket-3wx3ce6drvftuy/science.csv')

CPU times: user 49.7 ms, sys: 15.5 ms, total: 65.2 ms
Wall time: 3min 14s


In [123]:
# Shut down the SparkContext now that all exports have been written.
sc.stop()