# Reddit

## Dataset, Samples

In [21]:
rc = spark.read.format('json').load('hdfs://orion11:15000/rc/*')

In [25]:
rc

DataFrame[archived: boolean, author: string, author_flair_css_class: string, author_flair_text: string, body: string, controversiality: bigint, created_utc: string, distinguished: string, downs: bigint, edited: string, gilded: bigint, id: string, link_id: string, name: string, parent_id: string, removal_reason: string, retrieved_on: bigint, score: bigint, score_hidden: boolean, stickied: boolean, subreddit: string, subreddit_id: string, ups: bigint]

In [120]:
rc_samp = rc.sample(False, .1)
rc_samp.write.format('json').save('hdfs://orion11:15000/rc_samp')

In [164]:
%%time
rc_samp = spark.read.format('json').load('hdfs://orion11:15000/rc_samp/*')
# rc_samp.cache()
print(rc_samp.count())

26237538
CPU times: user 12 ms, sys: 2.97 ms, total: 15 ms
Wall time: 57.4 s


In [None]:
rc_s = rc_samp.sample(False, .1)
rc_s.write.format('json').save('hdfs://orion11:15000/rc_s')

In [1]:
%%time
rc_s = spark.read.format('json').load('hdfs://orion11:15000/rc_s/*')
rc_s.cache()
print(rc_s.count())

2622716
CPU times: user 5.8 ms, sys: 3.14 ms, total: 8.94 ms
Wall time: 15.8 s


In [126]:
rc_t = rc_s.sample(False, .1)
rc_t.write.format('json').save('hdfs://orion11:15000/rc_t')

In [1]:
%%time
rc_t = spark.read.format('json').load('hdfs://orion11:15000/rc_t/*')
rc_t.cache()
print(rc_t.count())

262372
CPU times: user 7.9 ms, sys: 472 µs, total: 8.37 ms
Wall time: 8.89 s


In [38]:
rc_u = rc_t.sample(False, .1)
rc_u.write.format('json').save('hdfs://orion11:15000/rc_u')

In [39]:
%%time
rc_u = spark.read.format('json').load('hdfs://orion11:15000/rc_u/*')
rc_u.cache()
print(rc_u.count())

26273
CPU times: user 1.71 ms, sys: 1.17 ms, total: 2.88 ms
Wall time: 1.56 s


In [75]:
rc_v = sc.parallelize(rc_t.take(10000))
rc_v.cache()
print(rc_v.count())

10000


### Number of comments

In [135]:
rc_t.rdd \
    .map(lambda comment: ('key', 1)) \
    .reduceByKey(lambda accum, n: accum + n) \
    .collect()

[('key', 262372)]

In [136]:
rc_t.createOrReplaceTempView('rc_t')

In [139]:
spark.sql("\
SELECT COUNT(*) \
FROM rc_t").collect()

[Row(count(1)=262372)]

In [138]:
rc_t.count()

262372

### Number of subreddits, subreddits' comments

In [149]:
sub_count_mr = rc_t.rdd \
    .map(lambda comment: (comment['subreddit'], 1)) \
    .reduceByKey(lambda accum, n: accum + n) \
    .collect()

In [157]:
print(len(sub_count_mr))
print(sorted(sub_count_mr, key=lambda sub_cnt: sub_cnt[1], reverse=True))

4097
[('AskReddit', 38907), ('pics', 18693), ('reddit.com', 16228), ('politics', 10680), ('gaming', 10358), ('funny', 9590), ('IAmA', 7895), ('fffffffuuuuuuuuuuuu', 6876), ('atheism', 6753), ('WTF', 6696), ('trees', 4824), ('worldnews', 4191), ('videos', 3849), ('starcraft', 3246), ('programming', 2715), ('todayilearned', 2622), ('science', 2619), ('Minecraft', 2479), ('technology', 2211), ('gonewild', 1961), ('TwoXChromosomes', 1937), ('leagueoflegends', 1701), ('Music', 1485), ('Fitness', 1370), ('sex', 1273), ('guns', 1188), ('canada', 1144), ('movies', 1142), ('nfl', 1134), ('Android', 1090), ('soccer', 1041), ('tf2', 1027), ('Libertarian', 1023), ('DoesAnybodyElse', 1022), ('mylittlepony', 1022), ('skyrim', 1014), ('Economics', 958), ('askscience', 879), ('offbeat', 816), ('battlefield3', 742), ('AdviceAnimals', 739), ('Christianity', 718), ('apple', 717), ('relationship_advice', 716), ('entertainment', 713), ('self', 682), ('linux', 668), ('wow', 636), ('MensRights', 631), ('news

In [185]:
spark.sql("\
SELECT subreddit, COUNT(*) as count \
FROM rc_t \
GROUP BY subreddit \
ORDER BY count DESC").show()

+-------------------+-----+
|          subreddit|count|
+-------------------+-----+
|          AskReddit|38907|
|               pics|18693|
|         reddit.com|16228|
|           politics|10680|
|             gaming|10358|
|              funny| 9590|
|               IAmA| 7895|
|fffffffuuuuuuuuuuuu| 6876|
|            atheism| 6753|
|                WTF| 6696|
|              trees| 4824|
|          worldnews| 4191|
|             videos| 3849|
|          starcraft| 3246|
|        programming| 2715|
|      todayilearned| 2622|
|            science| 2619|
|          Minecraft| 2479|
|         technology| 2211|
|           gonewild| 1961|
+-------------------+-----+
only showing top 20 rows



In [186]:
spark.sql("\
SELECT COUNT(DISTINCT subreddit) AS count \
FROM rc_t").collect()

[Row(count=4097)]

In [187]:
from pyspark.sql import functions as F
sub_count = rc_t.groupBy('subreddit').count().orderBy('count', ascending=False)
sub_count.show()

+-------------------+-----+
|          subreddit|count|
+-------------------+-----+
|          AskReddit|38907|
|               pics|18693|
|         reddit.com|16228|
|           politics|10680|
|             gaming|10358|
|              funny| 9590|
|               IAmA| 7895|
|fffffffuuuuuuuuuuuu| 6876|
|            atheism| 6753|
|                WTF| 6696|
|              trees| 4824|
|          worldnews| 4191|
|             videos| 3849|
|          starcraft| 3246|
|        programming| 2715|
|      todayilearned| 2622|
|            science| 2619|
|          Minecraft| 2479|
|         technology| 2211|
|           gonewild| 1961|
+-------------------+-----+
only showing top 20 rows



## Screamer Subreddits

In [133]:
%%time

import string

def screamer_sub_mapper(comment):
    n_upper = len(list(filter(lambda c: c in string.ascii_uppercase, comment['body'])))
    n_alpha = len(list(filter(lambda c: c in string.ascii_letters, comment['body'])))
    return (comment['subreddit'], (n_upper, n_alpha))

def screamer_reducer(value_list):
    total_upper = 0
    total_alpha = 0
    for value in value_list:
        (n_upper, n_alpha) = value
        total_upper += n_upper
        total_alpha += n_alpha
    screamer_score = total_upper / total_alpha if total_alpha else 0
    return (screamer_score, len(value_list))
    
screamer_subs = rc_samp.rdd \
    .map(screamer_sub_mapper) \
    .groupByKey() \
    .mapValues(screamer_reducer) \

screamer_subs.cache()

CPU times: user 50.4 ms, sys: 9.97 ms, total: 60.3 ms
Wall time: 59.1 s


In [144]:
screamer_subs \
    .filter(lambda sub_rval: sub_rval[1][1] > 1000) \
    .sortBy(lambda sub_rval: sub_rval[1][0], False) \
    .take(3)

[('spacedicks', (0.5781305686052163, 1385)),
 ('circlejerk', (0.18934631577970992, 45861)),
 ('googleplusinvites', (0.11052298941380945, 1089))]

## Term frequency-Inverse document frequency

In [2]:
%%time
from collections import Counter
import string
import nltk

def term_freq_mapper(comment):
    body = comment['body']
#     tokens = nltk.tokenize.word_tokenize(body.lower())
    tokens = [word.strip(string.punctuation) for word in body.lower().split()]
    counter = Counter(tokens)
    return (comment['subreddit'], counter)

term_freq = rc_t.rdd \
    .map(term_freq_mapper) \
    .reduceByKey(lambda a,b: a+b)
term_freq.cache()

term_freq_res = term_freq.collect()
print(term_freq_res[0:5])

CPU times: user 1.23 s, sys: 211 ms, total: 1.44 s
Wall time: 22.8 s


In [3]:
%%time
num_docs = term_freq.count()
print(num_docs)

4097
CPU times: user 14.4 ms, sys: 5.29 ms, total: 19.7 ms
Wall time: 200 ms


In [4]:
%%time

doc_freq = term_freq \
    .flatMap(lambda sub_counter: list(sub_counter[1])) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda word_docfreq: word_docfreq[1], False)
doc_freq.cache()

print(doc_freq.collect()[0:50])

[('the', 2818), ('i', 2677), ('a', 2612), ('to', 2601), ('and', 2404), ('of', 2290), ('is', 2232), ('it', 2216), ('you', 2186), ('that', 2148), ('in', 2148), ('for', 2087), ('this', 1938), ('but', 1852), ('on', 1840), ('have', 1805), ('be', 1792), ('with', 1700), ('not', 1666), ('are', 1647), ('if', 1646), ('my', 1588), ('just', 1588), ('so', 1560), ('like', 1519), ('as', 1484), ('was', 1477), ('', 1468), ('at', 1468), ('or', 1464), ('can', 1420), ("it's", 1409), ("i'm", 1375), ('they', 1373), ('do', 1359), ('one', 1358), ('what', 1355), ('all', 1349), ('me', 1344), ('out', 1340), ('from', 1325), ('get', 1323), ('your', 1314), ('about', 1311), ('there', 1309), ('deleted', 1303), ('up', 1296), ('would', 1283), ('an', 1279), ("don't", 1264)]
CPU times: user 151 ms, sys: 20.2 ms, total: 171 ms
Wall time: 1.5 s


In [5]:
%%time
import math

inv_doc_freq = doc_freq \
    .map(lambda t_df: (t_df[0], math.log(num_docs / t_df[1]))) \
    .sortBy(lambda t_idf: t_idf[1], True)
inv_doc_freq.cache()

inv_doc_freq_res = inv_doc_freq.collect();
print(inv_doc_freq_res[0:50])

[('the', 0.3742275850882455), ('i', 0.4255582340265186), ('a', 0.45013878715054945), ('to', 0.4543590120982195), ('and', 0.5331209818917729), ('of', 0.5817031809985858), ('is', 0.6073569540456695), ('it', 0.6145512296796967), ('you', 0.6281816088103872), ('that', 0.6457178219181158), ('in', 0.6457178219181158), ('for', 0.6745273704696704), ('this', 0.7485984850961596), ('but', 0.7939888623407463), ('on', 0.8004894269438397), ('have', 0.8196944067798898), ('be', 0.8269226840119954), ('with', 0.8796267475025636), ('not', 0.899829454820083), ('are', 0.9112995473692307), ('if', 0.911906896309856), ('my', 0.94777963573979), ('just', 0.94777963573979), ('so', 0.9655691773032884), ('like', 0.9922027749510982), ('as', 1.0155138538195454), ('was', 1.0202419950154913), ('', 1.0263540683724102), ('at', 1.0263540683724102), ('or', 1.0290825830256143), ('can', 1.0595981269515646), ("it's", 1.067374765648191), ("i'm", 1.0918012674461994), ('they', 1.0932568717789), ('do', 1.1035058633957273), ('one'

In [6]:
%%time
subreddit = 'programming'
sub_term_freq = term_freq.sortByKey().lookup(subreddit)[0]
print(subreddit)

tfidf_list = list(map(lambda t_idf: (t_idf[0], 
                                     t_idf[1] * sub_term_freq[t_idf[0]]), 
                 inv_doc_freq_res))
print(tfidf_list[0:50])

programming
[('the', 1587.4734159443374), ('i', 791.1127570552981), ('a', 1184.3151489930956), ('to', 1349.4462659317119), ('and', 1070.50693163868), ('of', 1186.674489237115), ('is', 1113.8926537197578), ('it', 1021.3841437276559), ('you', 1000.0651212261364), ('that', 1135.8176487539656), ('in', 876.2390843428831), ('for', 762.2159286307275), ('this', 531.5049244182733), ('but', 608.9894574153524), ('on', 575.5518979726207), ('have', 591.8193616950805), ('be', 641.6920027933085), ('with', 691.386623537015), ('not', 699.1674863952045), ('are', 630.6192867795077), ('if', 560.8227412305614), ('my', 378.1640746601762), ('just', 441.6653102547421), ('so', 418.09145377232386), ('like', 438.5536265283854), ('as', 635.7116724910354), ('was', 436.66357386663026), ('', 939.1139725607553), ('at', 387.961837844771), ('or', 532.0356954242426), ('can', 524.5010728410244), ("it's", 520.8788856363171), ("i'm", 316.62236755939784), ('they', 486.4993079416105), ('do', 456.8514274458311), ('one', 319.1

In [7]:
%%time
from nltk.corpus import stopwords
stop_words = stopwords.words()
tfidf_list = list(filter(lambda t_fidf: t_fidf[0] not in stop_words, tfidf_list))

CPU times: user 8.43 s, sys: 1.14 ms, total: 8.43 s
Wall time: 8.46 s


In [75]:
from pprint import pprint
pprint(sorted(tfidf_list, key = lambda t_fidf: t_fidf[1], reverse = True)[0:50])

[('', 939.1139725607553),
 ('code', 818.2860278216028),
 ('gt', 694.967339088719),
 ('use', 579.8298727587509),
 ('language', 522.8502962703028),
 ('languages', 450.835282960784),
 ('java', 442.17261998463636),
 ('like', 438.5536265283854),
 ('programming', 438.0645953916532),
 ('would', 391.27516865782604),
 ('software', 364.9534705988249),
 ('python', 334.20355790808793),
 ('work', 333.7701495370556),
 ('lisp', 330.98764451023504),
 ('one', 319.12592916475836),
 ("i'm", 316.62236755939784),
 ('php', 313.7146518505741),
 ('people', 311.2217260621728),
 ('think', 299.8094768740961),
 ('windows', 295.25457696125284),
 ('get', 290.50075015200224),
 ('using', 287.49192181545527),
 ("that's", 282.1112672119966),
 ('make', 281.87576949528574),
 ('haskell', 281.76003933473635),
 ('something', 281.2034440012708),
 ('write', 279.77596257728527),
 ('web', 275.12358307753107),
 ('even', 273.2319402452463),
 ('linux', 267.6518064333667),
 ('time', 265.88332728246155),
 ('deleted', 257.75678259495

## Sentiment Analysis

In [48]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [70]:
def sentiment_mapper(comment):
    score = sid.polarity_scores(comment['body'])['compound']
    return (comment['subreddit'], score)

sentiment = rc_t.rdd \
    .map(sentiment_mapper)

In [67]:
%%time
avg_sentiment = sentiment \
    .mapValues(lambda v: (v, 1)) \ # score, one
    .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1])) \ # sum(scores), sum(ones)
    .mapValues(lambda v: (v[0]/v[1], v[1])) \ # avg_score=(sum_scores/count), count
    .filter(lambda k_v: k_v[1][1] > 1000) \ # filter by count
    .sortBy(lambda k_v: k_v[1][0], False) # sort by avg_score

CPU times: user 95.8 ms, sys: 7.38 ms, total: 103 ms
Wall time: 10.1 s


In [72]:
avg_sentiment.takeOrdered(10, key=lambda k_v: -k_v[1][0])

[('gonewild', (0.2941499235084141, 1961)),
 ('Fitness', (0.19091240875912405, 1370)),
 ('Music', (0.1868072727272727, 1485)),
 ('mylittlepony', (0.17997485322896284, 1022)),
 ('TwoXChromosomes', (0.1782817759421786, 1937)),
 ('Android', (0.1760810091743119, 1090)),
 ('trees', (0.1685991500829187, 4824)),
 ('soccer', (0.15145398655139292, 1041)),
 ('programming', (0.15078088397790054, 2715)),
 ('leagueoflegends', (0.14420487948265726, 1701))]

In [73]:
avg_sentiment.takeOrdered(10, key=lambda k_v: k_v[1][0])

[('worldnews', (-0.044262133142448104, 4191)),
 ('WTF', (-0.0032822580645161217, 6696)),
 ('politics', (-0.0011439325842696566, 10680)),
 ('Libertarian', (0.02533822091886608, 1023)),
 ('funny', (0.04700525547445254, 9590)),
 ('reddit.com', (0.04863151959576039, 16228)),
 ('fffffffuuuuuuuuuuuu', (0.048675159976730664, 6876)),
 ('todayilearned', (0.049298474446987046, 2622)),
 ('guns', (0.05428072390572392, 1188)),
 ('videos', (0.05748225513120292, 3849))]