In [1]:
from __future__ import print_function

import logging # python logging module
import sys

import json
import pandas as pd

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
# Layout of each log record: timestamp, level, emitting function and line number, message.
logFormat = "%(asctime)s - [%(levelname)s] (%(funcName)s:%(lineno)d) %(message)s"

# Records at INFO and above are written to SparkStreamingRevamped.log
# (a relative path, so it resolves against the current working directory).
logging.basicConfig(filename='SparkStreamingRevamped.log', level=logging.INFO, 
                   format=logFormat, datefmt="%Y-%m-%d %H:%M:%S")


In [3]:
# magic function to plot inline
%matplotlib inline

In [4]:
def print_happiest_words(rdd):
    """Print a short report built from the first five entries of ``rdd``.

    ``rdd`` must provide ``take(n)`` and ``count()``. Each entry is indexed
    as ``entry[0]`` (formatted with ``%d`` as the happiness value) and
    ``entry[1]`` (formatted with ``%s`` as the topic).

    The header line reports ``rdd.count()``, i.e. the size of the whole
    ``rdd``, not just the number of entries printed.
    """
    leading_entries = rdd.take(5)
    total = rdd.count()

    header = "Happiest topics in the last 5 seconds (%d total):" % total
    print(header)

    for entry in leading_entries:
        topic = entry[1]
        happiness = entry[0]
        print("%s (%d happiness)" % (topic, happiness))

if __name__ == "__main__":

    # Endpoint of the server that is sending the text stream.
    host = "localhost"
    port = 8889

    # Spark entry points; the streaming context uses a batch duration of 60 seconds.
    sc = SparkContext(appName="TwitterStreamAnalysis")
    ssc = StreamingContext(sparkContext=sc, batchDuration=60)

    # Stream of text lines read from the socket endpoint above.
    dStream = ssc.socketTextStream(hostname=host, port=port)
    
    def parseForPOI(dStream):
        """Turn one JSON record into a ``[(name, followers_count)]`` list.

        Despite its name, ``dStream`` is a single record (one JSON document
        as text): the function is handed to ``flatMap``, which calls it once
        per element of the stream.

        Returns a one-element list ``[(name, followers)]`` on success, where
        ``name`` is text and ``followers`` is an ``int``. A missing "name"
        becomes "undefined"; a missing "followersCount" becomes 0.

        Returns ``[]`` (so ``flatMap`` drops the record) when the input is
        not valid JSON, is not a JSON object, has a non-string name, or has a
        follower count that cannot be converted to ``int``.
        """
        try:
            data = json.loads(dStream)
            if not isinstance(data, dict):
                return []

            name = data.get("name", "undefined")
            if isinstance(name, bytes):
                # Only byte strings need decoding. Calling .decode() on text
                # fails on Python 3 and breaks on non-ASCII names on Python 2.
                name = name.decode("utf-8")
            elif not isinstance(name, type(u"")):
                # Later formatting uses str methods on the name; skip anything else.
                return []

            return [(name, int(data.get("followersCount", 0)))]
        except (ValueError, TypeError, OverflowError):
            # Malformed JSON / undecodable bytes (ValueError), a None or
            # otherwise non-numeric count (TypeError), or an infinite count
            # (OverflowError). Best-effort parsing: skip the record.
            return []
        
    def displayPOI(time, rdd):
        """Print a ranked table of the 20 entries of ``rdd`` with the largest ``x[1]``.

        Intended as a ``foreachRDD`` callback: ``time`` is printed as the
        first line and ``rdd`` holds ``(name, followers_count)`` tuples.
        Duplicates are removed with ``distinct()`` before ranking.

        A ``ValueError`` raised anywhere while rendering is not propagated;
        it is written to the log so that a missing or truncated table can be
        explained afterwards. Other exception types propagate to the caller.
        """
        try:
            print(time)
            print("Top 20 Influential personalities from the twitter across the globe: ")
            print("Rank".center(6, "-") + "|" + "Name".center(40, "-") + "|" + "Followers Count".center(20, "-"))

            # Negated key: takeOrdered returns the smallest keys first, so this
            # yields the largest follower counts.
            top_accounts = rdd.distinct().takeOrdered(20, key=lambda x: -x[1])
            for rank, item in enumerate(top_accounts, 1):
                print(str(rank).center(6, " ") +
                      "|" + item[0].center(40, " ") +
                      "|" + str(item[1]).rjust(15, " ")
                     )
        except ValueError as err:
            # NOTE(review): presumably this guards against PySpark raising
            # ValueError for a batch with no data - confirm. Kept non-fatal,
            # but no longer silent.
            logging.info("Leaderboard for batch %s was not fully rendered: %s", time, err)
    
    # Parse the raw socket lines once; both outputs below branch off this stream.
    _parsed = dStream.flatMap(parseForPOI)

    # Leaderboard output. No sortBy is applied first: displayPOI calls
    # distinct().takeOrdered(...), which reshuffles the batch and selects the
    # top entries itself, so a prior sort would only add a wasted Spark job.
    # foreachRDD returns None; the assignment just keeps the name bound.
    _influencial = _parsed.foreachRDD(displayPOI)

    # Number of parsed records per batch; a count does not depend on ordering.
    _influencial2 = _parsed.count()
    _influencial2.pprint(20)

    ssc.start()
    try:
        ssc.awaitTermination()
    finally:
        # Also reached on KeyboardInterrupt (kernel interrupt). Without this
        # the streaming job keeps running in the background after the cell
        # is interrupted. With default arguments, stop() also stops the
        # underlying SparkContext.
        ssc.stop()

2017-10-05 15:37:00
Top 20 Influential personalities from the twitter across the globe: 
-Rank-|------------------Name------------------|--Followers Count---
  1   |              The Red Tide              |         331194
  2   |              The Red Tide              |         331194
  3   |                 GUESS                  |         130404
  4   |              A . Alshehri              |         108873
  5   |          The London Economic           |          57841
  6   |            itsjohnwetzel39             |          52624
  7   |             Bhadmus Hakeem             |          49779
  8   |             Bhadmus Hakeem             |          49779
  9   |                 Figen                  |          37191
  10  |                Altinget                |          34188
  11  |           Esio G. Moreno R.            |          31227
  12  |           EU Maritime & Fish           |          30356
  13  |             Binnaz Toprak              |          29094
  14  |   

-------------------------------------------
Time: 2017-10-05 15:42:00
-------------------------------------------
1028

2017-10-05 15:43:00
Top 20 Influential personalities from the twitter across the globe: 
-Rank-|------------------Name------------------|--Followers Count---


KeyboardInterrupt: 

In [7]:
# always stop the streamer
# ssc.stop()
# NOTE(review): the stop call above is commented out, so running this cell
# does not stop the streaming context.

from pyspark.rdd import RDD

# Bare expression with no effect on the pipeline; as the last line of the
# cell, the notebook displays the method object (scratch inspection).
RDD.distinct

In [None]:
str.ljust