 ## Q1 Find Influential people in twitter:

    - For simplicity assume the algorithm to find influential person is directly proportional to followers.
    - Find top 20 Influential personalities from the twitter across the globe.



In [1]:
from __future__ import print_function

import logging # python logging module
import sys

import json
import pandas as pd

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
# basic format for logging
logFormat = "%(asctime)s - [%(levelname)s] (%(funcName)s:%(lineno)d) %(message)s"

# logs will be stored in tweepy.log
logging.basicConfig(filename='SparkStreamingRevamped.log', level=logging.INFO, 
                   format=logFormat, datefmt="%Y-%m-%d %H:%M:%S")


In [3]:
# magic function to plot inline
%matplotlib inline

In [4]:
if __name__ == "__main__":

    sc = SparkContext(appName="TwitterStreamAnalysis")
    ssc = StreamingContext(sc, 60 * 60)  # Setting 1hr interval
    
    # Host port of server which is sending text stream
    host = "localhost"
    port = 8889
    socketStream = ssc.socketTextStream(host, port) # Connecting to socket
    dStream = socketStream.window(60 * 60)  # Setting 1hr window
    def parseForPOI(dStream): # Data Manupulation
        try:
            data = json.loads(dStream)  # Load the json data
            return [( # Tuple of name and follower count
                     data.get("name", "undefined").decode("utf-8"), 
                     int(data.get("followersCount", 0))
                    )]
        except:
            return []
        
    def displayPOI(time, rdd): # Print the data in readable format
        try:
            print(time)
            print("Top 20 Influential personalities from the twitter across the globe: ")
            print("Rank".center(6, "-") + "|" + "Name".center(40, "-") + "|" + "Followers Count".center(20, "-"))
            for rank, item in enumerate(rdd.distinct().takeOrdered(20, key=lambda x: -x[1])):
                print(str(rank + 1).center(6, " ") + 
                      "|" + item[0].center(40, " ") + 
                      "|" + str(item[1]).rjust(15, " ")
                     )
        except ValueError:
            pass
    
    _influencial = dStream.flatMap(parseForPOI)\
                          .transform(  # Sorting the data
                                     lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)
                          ).foreachRDD(displayPOI)
    _influencial2 = dStream.flatMap(parseForPOI)\
                      .transform(  # Sorting the data
                                 lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)
                      ).count() # To find number of tweets
    _influencial2.pprint() # Number of tweets
    ssc.start()
    ssc.awaitTermination() # Await for keyboard interrupt

2017-10-08 12:30:00
Top 20 Influential personalities from the twitter across the globe: 
-Rank-|------------------Name------------------|--Followers Count---
  1   |             Times of India             |       10707298
  2   |                  NDTV                  |       10177338
  3   |               BBC Sport                |        7040979
  4   |               BBC Sport                |        7040961
  5   |            Sky Sports News             |        5683167
  6   |                 MARCA                  |        4751182
  7   |               Formula 1                |        3210857
  8   |               Formula 1                |        3210830
  9   |               Formula 1                |        3210803
  10  |               Formula 1                |        3210793
  11  |               Formula 1                |        3210786
  12  |               Formula 1                |        3210777
  13  |               Formula 1                |        3210705
  14  |   

KeyboardInterrupt: 