In [1]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Encoders
import scala.concurrent.duration._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
// Session handle for the whole notebook; getOrCreate() hands back an already
// running session when there is one instead of building a second one.
val spark = SparkSession.builder().appName("RedditAnalysis").getOrCreate()
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@705171eb


In [2]:
/**
 * Repairs UTF-8 text that was mis-decoded as ISO-8859-1 (e.g. "Ã©" instead of "é").
 *
 * Every run made of one character in U+00C2..U+00F4 followed by one or more
 * characters in U+0080..U+00BF is turned back into its ISO-8859-1 bytes, and
 * those bytes are then decoded as UTF-8. All other characters are kept as-is.
 *
 * @param text the possibly mis-decoded text; may be null
 * @return the repaired text, or null when `text` is null
 */
def fixEncoding(text: String): String = {
    // Charset constants avoid the by-name charset lookup of getBytes(String) / new String(_, String).
    import java.nio.charset.StandardCharsets.{ISO_8859_1, UTF_8}

    val misdecodedRun = "[\\xc2-\\xf4][\\x80-\\xbf]+".r

    // Option(...) lets a null input pass through instead of throwing a NullPointerException.
    Option(text)
        .map(t => misdecodedRun.replaceAllIn(t, m => new String(m.matched.getBytes(ISO_8859_1), UTF_8)))
        .orNull
}

fixEncoding: (text: String)String


In [3]:
/**
 * One Reddit record as read from the JSON input.
 *
 * The field names and types drive the schema derived with
 * `Encoders.product[Reddit]`, so they must stay aligned with the keys
 * present in the JSON files.
 */
case class Reddit(
    author: String,
    body: String,
    author_flair_text: String,
    gilded: BigInt,
    score: BigInt,
    link_id: String,
    retrieved_on: Long,
    author_flair_css_class: String,
    subreddit: String,
    edited: String,
    ups: BigInt,
    controversiality: BigInt,
    created_utc: java.sql.Timestamp,
    parent_id: String,
    subreddit_id: String,
    id: String,
    distinguished: String
)

defined class Reddit


In [4]:
// Derive the Spark schema from the Reddit case class so the streaming JSON
// reader gets an explicit schema instead of having to infer one.
val schema: StructType = Encoders.product[Reddit].schema

schema = StructType(StructField(author,StringType,true), StructField(body,StringType,true), StructField(author_flair_text,StringType,true), StructField(gilded,DecimalType(38,0),true), StructField(score,DecimalType(38,0),true), StructField(link_id,StringType,true), StructField(retrieved_on,LongType,false), StructField(author_flair_css_class,StringType,true), StructField(subreddit,StringType,true), StructField(edited,StringType,true), StructField(ups,DecimalType(38,0),true), StructField(controversiality,DecimalType(38,0),true), StructField(created_utc,TimestampType,true), StructField(parent_id,StringType,true), StructField(subreddit_id,StringType,true), StructField(id,StringType,true), StructField(distinguished,StringType,true))


StructType(StructField(author,StringType,true), StructField(body,StringType,true), StructField(author_flair_text,StringType,true), StructField(gilded,DecimalType(38,0),true), StructField(score,DecimalType(38,0),true), StructField(link_id,StringType,true), StructField(retrieved_on,LongType,false), StructField(author_flair_css_class,StringType,true), StructField(subreddit,StringType,true), StructField(edited,StringType,true), StructField(ups,DecimalType(38,0),true), StructField(controversiality,DecimalType(38,0),true), StructField(created_utc,TimestampType,true), StructField(parent_id,StringType,true), StructField(subreddit_id,StringType,true), StructField(id,StringType,true), StructField(distinguished,StringType,true))

In [5]:
// Use a single shuffle partition for the aggregations in this notebook.
// NOTE(review): presumably chosen because the input is small and runs locally — confirm
// before reusing this setting on a larger dataset.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [22]:
// Streaming source over the JSON files, typed as Reddit rows.
// maxFilesPerTrigger = 1 caps each micro-batch at one new file.
val reddit = spark.readStream
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .json("reddit_posts_2005/*.json")
    .as[Reddit]

reddit = [author: string, body: string ... 15 more fields]


[author: string, body: string ... 15 more fields]

In [18]:
// Number of rows with a non-null author for each (author, subreddit) pair.
val user = reddit
    .groupBy($"author", $"subreddit")
    .agg(count($"author"))

user = [author: string, subreddit: string ... 1 more field]


lastException: Throwable = null


[author: string, subreddit: string ... 1 more field]

In [53]:
// Row count per author within each 30-minute window over created_utc.
val user = reddit
    .groupBy(window($"created_utc", "30 minutes"), $"author")
    .count()

Name: Unknown Error
Message: <console>:48: error: not found: value sort
       val user = reddit.groupBy(window($"created_utc", "30 minutes"), $"author").agg(count($"author"), sort($"created_utc"))
                                                                                                        ^

StackTrace: 

In [51]:
// Start the streaming query: every 5 seconds the complete aggregated result
// is written to the console sink, with cell values left untruncated.
// NOTE(review): the %%SQL cells in this notebook select from `reddit_posts`;
// that name is only queryable as a table when the sink is "memory" — confirm
// which sink is intended here.
val redditStream = user.writeStream
    .format("console")
    .option("truncate", false)
    .trigger(Trigger.ProcessingTime(5.seconds))
    .outputMode(OutputMode.Complete())
    .queryName("reddit_posts")
    .start()

redditStream = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@11507c15


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@11507c15

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |author   |count|
+------------------------------------------+---------+-----+
|[2005-12-12 09:00:00, 2005-12-12 09:30:00]|AaronSw  |3    |
|[2005-12-12 09:00:00, 2005-12-12 09:30:00]|fnord123 |1    |
|[2005-12-12 09:00:00, 2005-12-12 09:30:00]|ssundar78|1    |
+------------------------------------------+---------+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+---------+-----+
|window                                    |author   |count|
+------------------------------------------+---------+-----+
|[2005-12-12 09:00:00, 2005-12-12 09:30:00]|AaronSw  |3    |
|[2005-12-12 09:30:00, 2005-12-12 10:00:00]|AaronSw  |1    |
|[2005-12-12 09:00:00, 2005-12-12 09:30:00]|fnord123 |1    |
|[2005-12-12

In [54]:
// Check whether the streaming query is still running.
redditStream.isActive

false

In [27]:
%%SQL
-- Per-author counts by window end. Ordered on the window.end timestamp, not on the formatted text, so rows stay chronological across months.
select author, date_format(window.end, "MMM-dd HH:mm") as created_utc, count from reddit_posts order by window.end, author

+---------+------------+-----+
|   author| created_utc|count|
+---------+------------+-----+
|     frjo|Dec-12 04:00|    1|
|  zse7zse|Dec-12 04:00|    1|
|     b0se|Dec-12 06:00|    1|
| cavedave|Dec-12 06:00|    1|
|  rjoseph|Dec-12 06:00|    1|
|  AaronSw|Dec-12 10:00|    4|
| fnord123|Dec-12 10:00|    1|
| jarsonic|Dec-12 10:00|    1|
|ssundar78|Dec-12 10:00|    2|
|   zlayde|Dec-12 10:00|    1|
+---------+------------+-----+
only showing top 10 rows



In [35]:
%%SQL
-- Per-author counts by window end, with the calendar year (yyyy, not the week-based YYYY). Ordered on the window.end timestamp so rows stay chronological.
select author, date_format(window.end, "MMM-dd-yyyy HH:mm:ss") as created_utc, count from reddit_posts order by window.end, author

+----------+--------------------+-----+
|    author|         created_utc|count|
+----------+--------------------+-----+
|      frjo|Dec-12-2005 04:00:00|    1|
|   zse7zse|Dec-12-2005 04:00:00|    1|
|      b0se|Dec-12-2005 06:00:00|    1|
|  cavedave|Dec-12-2005 06:00:00|    1|
|   rjoseph|Dec-12-2005 06:00:00|    1|
|     damir|Dec-12-2005 08:00:00|    1|
|  kn0thing|Dec-12-2005 08:00:00|    1|
|richardk74|Dec-12-2005 08:00:00|    1|
|   AaronSw|Dec-12-2005 10:00:00|    4|
|   bugbear|Dec-12-2005 10:00:00|    1|
+----------+--------------------+-----+
only showing top 10 rows



In [52]:
// Stop the streaming query started above.
redditStream.stop()