In [2]:
# download, unzip data

"""
!nohup wget -qO- https://nlp.cs.princeton.edu/SARC/1.0/main/train-unbalanced.csv.bz2 | bunzip2 -c | gsutil cp - gs://sarc-bucket-5/reddit.csv &
"""

'\n!nohup wget -qO- https://nlp.cs.princeton.edu/SARC/1.0/main/train-unbalanced.csv.bz2 | bunzip2 -c | gsutil cp - gs://sarc-bucket-5/reddit.csv &\n'

In [1]:
import pyspark

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import pandas as pd

import re

import random

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
"""
Test to see if Spark is up and running:

numberRDD = sc.parallelize(range(1, 10000))
evens = numberRDD.filter(lambda x: x % 2 == 0)
doubled = numberRDD.map(lambda x: x * 2)
fives = numberRDD.filter(lambda x: x % 5 == 0)
tens = evens.intersection(fives)
sortedTens = tens.sortBy(lambda x: x)
sortedTens
"""

'\nTest to see if Spark is up and running:\n\nnumberRDD = sc.parallelize(range(1, 10000))\nevens = numberRDD.filter(lambda x: x % 2 == 0)\ndoubled = numberRDD.map(lambda x: x * 2)\nfives = numberRDD.filter(lambda x: x % 5 == 0)\ntens = evens.intersection(fives)\nsortedTens = tens.sortBy(lambda x: x)\nsortedTens\n'

In [3]:
# Spark settings: dynamic executor allocation on, two cores per executor.
config = (
    pyspark.SparkConf()
    .set("spark.dynamicAllocation.enabled", "True")
    .set("spark.executor.cores", "2")
)

In [4]:
# Go through the session builder so this cell can be re-executed safely:
# constructing SparkContext(...) directly raises if a context is already
# running in the kernel, whereas getOrCreate() reuses the live one.
# NOTE: when a session already exists, its configuration wins and `config`
# is not re-applied -- stop the context first to pick up changed settings.
spark = SparkSession.builder.config(conf=config).getOrCreate()
sc = spark.sparkContext

In [5]:
# Load in the entire dataset:

%time sarc = spark.read.csv("gs://sarc-bucket-5/reddit.csv", inferSchema=True, header=False, sep = '\t')

CPU times: user 21.7 ms, sys: 13.2 ms, total: 35 ms
Wall time: 1min 52s


In [6]:
# Rename columns:

# (positional CSV name, meaningful name) pairs, applied one after another.
column_names = [
    ('_c0', 'label'),
    ('_c1', 'comment'),
    ('_c2', 'author'),
    ('_c3', 'subreddit'),
    ('_c4', 'score'),
    ('_c5', 'ups'),
    ('_c6', 'downs'),
    ('_c7', 'date'),
    ('_c8', 'created_utc'),
    ('_c9', 'parent_comment'),
]
for positional_name, readable_name in column_names:
    sarc = sarc.withColumnRenamed(positional_name, readable_name)

In [7]:
sarc.printSchema()

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- ups: integer (nullable = true)
 |-- downs: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- created_utc: integer (nullable = true)
 |-- parent_comment: string (nullable = true)



In [8]:
%time sarc.show(10)

+-----+--------------------+----------+---------+-----+---+-----+-------+-----------+--------------------+
|label|             comment|    author|subreddit|score|ups|downs|   date|created_utc|      parent_comment|
+-----+--------------------+----------+---------+-----+---+-----+-------+-----------+--------------------+
|    0|50 and it forces ...|Mattys1174|   NHLHUT|    1| -1|   -1|2017-02| 1485907243|Does anyone know ...|
|    0|     Marchand who ;)|Mattys1174|   NHLHUT|    2| -1|   -1|2017-02| 1486003855|This post wasn't ...|
|    0|Maybe theyll hire...|Mattys1174|   hockey|    5| -1|   -1|2017-02| 1486525469|Bruins Relieve Cl...|
|    0| Thats a lotta dough|Mattys1174|   NHLHUT|    1| -1|   -1|2017-02| 1486584244|Bergeron for 500$...|
|    0|Nah youre the bes...|Mattys1174|   NHLHUT|    1| -1|   -1|2017-02| 1486602648|Second best strea...|
|    0|                  +1|Mattys1174|   NHLHUT|    2| -1|   -1|2017-02| 1486658528|As someone sittin...|
|    0|Yea the newborns ...|Mattys117

In [9]:
# Get number of samples:

%time sarc.count()

CPU times: user 9.24 ms, sys: 0 ns, total: 9.24 ms
Wall time: 26 s


187401972

In [10]:
# Get value count of 'score' feature:

%time sarc.groupBy('score').count().orderBy(["count"], ascending=[0]).show()

+-----+--------+
|score|   count|
+-----+--------+
|    1|69444930|
|    2|32356724|
|    3|17450987|
|    0| 8140676|
|    4| 7405748|
|    5| 6944588|
|    6| 5232785|
|    7| 3968578|
|    8| 3115913|
|   -1| 2812235|
|    9| 2502566|
|   10| 2051214|
|   11| 1722294|
|   12| 1464941|
|   -2| 1421861|
|   13| 1263615|
|   14| 1101061|
|   15|  966318|
|   16|  855833|
|   -3|  765137|
+-----+--------+
only showing top 20 rows

CPU times: user 10.8 ms, sys: 11.6 ms, total: 22.4 ms
Wall time: 1min 1s


In [11]:
# Dropping columns we probably won't need, in one call:
#   - created_utc, date
#   - ups, downs, author, score: these features could be helpful in a reddit
#     context but would be hard to generalize outside of reddit
sarc = sarc.drop('created_utc', 'date', 'ups', 'downs', 'author', 'score')

In [12]:
# Count the number of words in a comment:

def n_comment_words(text):
    """Approximate the number of words in ``text``.

    The count is the number of regex matches for word boundaries: each run of
    whitespace, a period followed by whitespace, a period directly followed by
    word characters, and the final character of the string. For plain
    space-separated text this equals the number of words.

    Args:
        text: The comment text, or ``None`` for a missing (NULL) value.

    Returns:
        The number of matches as an ``int`` (0 for an empty string), or
        ``None`` when ``text`` is ``None`` so that a NULL comment yields a
        NULL count instead of raising inside the UDF.
    """
    if text is None:
        return None
    # Raw string: the pattern is unchanged, but "\s", "\." and "\w" are no
    # longer parsed as (invalid) string escape sequences.
    return len(re.findall(r"\s+|\.\s|\.\w+(?!\.)|.$", text))

# Register the Python function with the Spark session under the name
# "n_comment_words", declaring ShortType as its return type.
spark.udf.register("n_comment_words", n_comment_words, ShortType()) 

<function __main__.n_comment_words(text)>

In [13]:
# Wrap n_comment_words as a ShortType UDF that can be applied to a column in
# DataFrame expressions (used below in select() and withColumn()).
n_comment_words_table = F.udf(n_comment_words, ShortType())

In [14]:
# Applying udf, displaying results:

%time sarc.select("comment", n_comment_words_table("comment").alias("n_words")).show()

+--------------------+-------+
|             comment|n_words|
+--------------------+-------+
|50 and it forces ...|     11|
|     Marchand who ;)|      3|
|Maybe theyll hire...|     27|
| Thats a lotta dough|      4|
|Nah youre the bes...|     10|
|                  +1|      1|
|Yea the newborns ...|     56|
|Patches isnt gett...|      5|
|                 Hah|      1|
|Thats very true a...|      5|
|Im not a guy if y...|     13|
|         KATTENSHIRK|      1|
|Dude, stop being ...|      5|
|I would not get i...|      9|
|I heard from my b...|     22|
|Torey, and torey'...|     12|
|             I lov u|      3|
|You did this real...|      5|
|        What te fuck|      3|
|Me too honestly, ...|     14|
+--------------------+-------+
only showing top 20 rows

CPU times: user 3.71 ms, sys: 7.94 ms, total: 11.6 ms
Wall time: 2.34 s


In [15]:
# Adding new column to the original spark dataframe:

sarc = sarc.withColumn("n_comment_words", n_comment_words_table("comment"))

In [16]:
# Keeping only comments with at least three words (drops one- and two-word comments):

sarc = sarc.filter(sarc['n_comment_words']>=3) # arbitrarily chosen

In [17]:
%time print("New total number of records: {}".format(sarc.count()))

New total number of records: 163896185
CPU times: user 72.7 ms, sys: 36 ms, total: 109 ms
Wall time: 3min 50s


In [18]:
# Getting rid of over-sized comments:

sarc = sarc.filter(sarc['n_comment_words']<=50) # arbitrarily chosen

In [19]:
%time print("New total number of records: {}".format(sarc.count()))

New total number of records: 163131471
CPU times: user 100 ms, sys: 26.3 ms, total: 127 ms
Wall time: 5min 34s


In [20]:
# Dropping the helper n_comment_words column now that both length filters
# have been applied. ("n_words" was only an alias in the preview select and
# was never a column of sarc, so there is nothing else to drop.)

sarc = sarc.drop('n_comment_words')

In [21]:
%time sarc.show(10)

+-----+--------------------+---------+--------------------+
|label|             comment|subreddit|      parent_comment|
+-----+--------------------+---------+--------------------+
|    0|50 and it forces ...|   NHLHUT|Does anyone know ...|
|    0|     Marchand who ;)|   NHLHUT|This post wasn't ...|
|    0|Maybe theyll hire...|   hockey|Bruins Relieve Cl...|
|    0| Thats a lotta dough|   NHLHUT|Bergeron for 500$...|
|    0|Nah youre the bes...|   NHLHUT|Second best strea...|
|    0|Patches isnt gett...|   NHLHUT|Confessions of a ...|
|    0|Thats very true a...|   NHLHUT|Burning out a lin...|
|    0|Im not a guy if y...|   NHLHUT|Just a girl looki...|
|    0|Dude, stop being ...|   NHLHUT|I have a post wit...|
|    0|I would not get i...|   NHLHUT|Lol! Seriously! N...|
+-----+--------------------+---------+--------------------+
only showing top 10 rows

CPU times: user 0 ns, sys: 3.65 ms, total: 3.65 ms
Wall time: 532 ms


In [22]:
# Building the full context: parent comment, a single space, then the reply.

sarc = sarc.withColumn(
    'context',
    F.concat("parent_comment", F.lit(" "), "comment"),
)

In [23]:
# Dropping separate parent_comment and comment colunns:

sarc = sarc.drop('comment')
sarc = sarc.drop('parent_comment')

%time sarc.show(10)

+-----+---------+--------------------+
|label|subreddit|             context|
+-----+---------+--------------------+
|    0|   NHLHUT|Does anyone know ...|
|    0|   NHLHUT|This post wasn't ...|
|    0|   hockey|Bruins Relieve Cl...|
|    0|   NHLHUT|Bergeron for 500$...|
|    0|   NHLHUT|Second best strea...|
|    0|   NHLHUT|Confessions of a ...|
|    0|   NHLHUT|Burning out a lin...|
|    0|   NHLHUT|Just a girl looki...|
|    0|   NHLHUT|I have a post wit...|
|    0|   NHLHUT|Lol! Seriously! N...|
+-----+---------+--------------------+
only showing top 10 rows

CPU times: user 1.45 ms, sys: 242 µs, total: 1.69 ms
Wall time: 465 ms


In [24]:
# Note the null values:

%time sarc.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in sarc.columns]).show()

+-----+---------+-------+
|label|subreddit|context|
+-----+---------+-------+
|    0|      799|    799|
+-----+---------+-------+

CPU times: user 108 ms, sys: 28.3 ms, total: 137 ms
Wall time: 5min 38s


In [25]:
# Remove the rows whose context is null:

sarc = sarc.dropna(subset=["context"])

In [27]:
# Verifying the result:

%time sarc.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in sarc.columns]).show()

+-----+---------+-------+
|label|subreddit|context|
+-----+---------+-------+
|    0|        0|      0|
+-----+---------+-------+

CPU times: user 110 ms, sys: 22.6 ms, total: 132 ms
Wall time: 5min 45s


In [28]:
# Writing truncated dataframe to gcs:

%time sarc.write.csv('gs://sarc-bucket-5/reddit_processed.csv', mode="overwrite")

CPU times: user 93.5 ms, sys: 42.8 ms, total: 136 ms
Wall time: 6min 41s


In [29]:
# Checking out frequencies amongst subreddit categories:

%time sarc.groupBy('subreddit').count().sort(F.col("count").desc()).show(100)

+--------------------+--------+
|           subreddit|   count|
+--------------------+--------+
|           AskReddit|13912287|
|            politics| 3987395|
|               funny| 3553485|
|                pics| 3090176|
|                 nfl| 2931774|
|                 nba| 2835460|
|     leagueoflegends| 2474871|
|           worldnews| 2461930|
|       todayilearned| 2230701|
|              gaming| 2141765|
|              videos| 2132303|
|       AdviceAnimals| 1982689|
|                news| 1930852|
|                 WTF| 1883286|
|          The_Donald| 1729085|
|        pcmasterrace| 1638541|
|              hockey| 1602605|
|              soccer| 1551522|
|              movies| 1310982|
|     GlobalOffensive| 1188153|
|       SquaredCircle| 1147085|
|                 CFB| 1142279|
|               DotA2| 1103664|
|                gifs| 1019627|
|             atheism|  809594|
|          technology|  770018|
|      TumblrInAction|  738262|
|            baseball|  736908|
|      S

In [30]:
# Keeping only rows from the politics subreddit:

politics = sarc.filter(sarc['subreddit'] == 'politics')

In [31]:
%time politics.show()

+-----+---------+--------------------+
|label|subreddit|             context|
+-----+---------+--------------------+
|    0| politics|I think he's heav...|
|    0| politics|All it takes is t...|
|    0| politics|You know he's nev...|
|    0| politics|Go ahead and wast...|
|    0| politics|"Right when the m...|
|    0| politics|he tweets, you ge...|
|    0| politics|I am Canadian, I ...|
|    0| politics|"Yeah, ""obsessed...|
|    0| politics|I thought better ...|
|    0| politics|same strategy use...|
|    0| politics|CPAC leader: The ...|
|    0| politics|Project Veritas '...|
|    0| politics|It is amusing but...|
|    0| politics|The Trump-tanic h...|
|    0| politics|Vladimir Putin sa...|
|    0| politics|I've heard lots o...|
|    0| politics|That could be an ...|
|    0| politics|Bet he uses Netsc...|
|    0| politics|I'm no lawyer, bu...|
|    0| politics|Why hasn't she be...|
+-----+---------+--------------------+
only showing top 20 rows

CPU times: user 2.79 ms, sys: 548 µs, 

In [32]:
%time print("New total number of records: {}".format(politics.count()))

New total number of records: 3987395
CPU times: user 64.3 ms, sys: 23 ms, total: 87.4 ms
Wall time: 1min 47s


In [33]:
# Writing politics dataframe to gcs:

%time politics.write.csv('gs://sarc-bucket-5/politics.csv', mode="overwrite")

CPU times: user 78.8 ms, sys: 13.2 ms, total: 92 ms
Wall time: 1min 58s


In [34]:
# Filtering for just science subreddit:

teen_sci = sarc.where((F.col('subreddit')=='science') | (F.col('subreddit')=='teenagers'))
%time print("New total number of records: {}".format(teen_sci.count()))

New total number of records: 824297
CPU times: user 62 ms, sys: 22.4 ms, total: 84.4 ms
Wall time: 1min 41s


In [35]:
# Writing science dataframe to gcs:

%time teen_sci.write.csv('gs://sarc-bucket-5/teen_sci.csv', mode="overwrite")

CPU times: user 71.8 ms, sys: 16.1 ms, total: 87.8 ms
Wall time: 1min 49s


In [14]:
sc.stop()