# Insight Data Engineering - Coding Challenge

This challenge is to implement two features:

Clean and extract the text from the raw JSON tweets that come from the Twitter Streaming API, and track the number of tweets that contain unicode.
Calculate the average degree of a vertex in a Twitter hashtag graph for the last 60 seconds, and update this each time a new tweet appears.

## 1. First feature

 Clean and extract the text from the raw JSON tweets that come from the Twitter Streaming API, and track the number of tweets that contain unicode.

To make a tweet's text "clean", we must replace all of the escape characters (e.g. \n, \t , \r ,\",\',  \/ ) and remove the unicode.

In [15]:
# example of program that calculates the number of tweets cleaned

import sys
import json
import string
import pprint


#input_file = open("../data-gen/tweets.txt","r")
#input_file = open("../tweet_input/tweets.txt","r")
input_file = open("../tweet_input/tweets_1.txt","r")


def removeNonAscii(s): 
    if s is not None:
        return "".join(filter(lambda x: ord(x)<128, s)) 

# Convert to JSON format
data = [json.loads(line) for line in input_file]


result = []


unicode_count = 0

for line in data:
    dic = {}
    if  line.get("text") is not None:
        dic['time'] = line.get("created_at")
        
        # Replace non-ASCII characters with a single space
        dic['content'] = filter(None,removeNonAscii(line.get("text")))
        
        # Replace all whitespace escape characters with a single space
        dic['content'] =  str(dic['content']).translate(string.maketrans("\n\t\r", "   "))
       
        # Track the number of tweets containing unicode
        if len(line.get("text")) > len(removeNonAscii(line.get("text"))):
            unicode_count +=1
        result.append(dic)

print '\n',unicode_count ,"tweets contained unicode.", '\n'

print "The ", '\n'
pprint.pprint(result[3])



# extracting the information of text" field and  "created_at" field, 
# then output this tweet with the format of 
# <contents of "text" field> (timestamp: <contents of "created_at" field>)
with open("../tweet_output/ft1.txt", "w") as output_file:
    [output_file.write('{0} (timestamp: {1})\n'.format(dic['content'], dic['time'])) for dic in result]
    output_file.write('\n{0} tweets contained unicode.'.format(unicode_count)) 




31 tweets contained unicode. 

The  

{'content': '@lezlielowe That one for @skimber is *literally* the only name I can take credit (blame) for. Thanks for noticingyou really are magical.',
 'time': u'Thu Oct 29 18:10:49 +0000 2015'}


## 2. Second Feature

Update the Twitter hashtag graph and calculate the average degree of the graph. The graph should just be built using tweets that arrived in the last 60 seconds as compared to the timestamp of the latest tweet

A Twitter hashtag graph is a graph connecting all the hashtags that have been mentioned together in a single tweet.

In [16]:
from datetime import datetime
from collections import Counter
import operator

In [17]:
# Extract hashtag and handle case insensitive hashtag
def extract_hashtags(s):
    htag = [word for word in s.split() if word[0] == "#" ]   
    htag = map(lambda x: x.lower(), htag)
    return list(set(htag))

# Calculate the time difference in timestamp
def time_diff(time1,time2):
    d1 = datetime.strptime(time1, "%a %b %d %H:%M:%S +0000 %Y")
    d2 = datetime.strptime(time2, "%a %b %d %H:%M:%S +0000 %Y")
    return (d2-d1).total_seconds()

In [18]:
for line in data:
    dic = {}
    if  line.get("text") is not None:
        dic['time'] = line.get("created_at")
        
        #content =  str(filter(None,removeNonAscii(line.get("text")))).translate(string.maketrans("\n\t\r", "   "))
        
        # Replace non-ASCII characters with a single space
        content =  str(filter(None,removeNonAscii(line.get("text"))))
        
        # Replace all whitespace escape characters with a single space
        content =  content.translate(string.maketrans("\n\t\r", "   "))
        
        #pprint.pprint(content)
        
        # Extract hashtags from each tweet
        list_hashtags = extract_hashtags(content)
        #pprint.pprint(list_hashtags)
        
        
        # Calculate the degree of each node in each tweet
        dic1 = {}
        for hashtag in list_hashtags:
            dic1[hashtag] = len(list_hashtags) -1
        dic['content'] = dic1
    result.append(dic)

 
    
avg_degree_f =[]
for i in range(len(result)):
    
    # Collect the all hashtags within the 60 Second Window
    hashtag_node = [result[i:][idx]['content'] for idx,_ in enumerate(result[i:]) 
          if  time_diff(result[i:][0]['time'], result[i:][idx]['time']) <= 60]
 

    
    # Calculate the degree of each node within the 60 Second Window
    hashtag_graph= dict(reduce(operator.add, map(Counter, hashtag_node))) 
    #print type(hashtag_graph)
    #pprint.pprint(hashtag_graph) ,'\n'

    
    
    # Calculating the rolling average degree of tweet within the 60 Second Window, 
    # The average degree = sum of the degrees of all nodes in all graphs and 
    # dividing by the total number of nodes in all graphs.
    if bool(hashtag_graph):
        avg_degree = 1.0*sum(hashtag_graph.values())/len(hashtag_graph)
    else:
        avg_degree = 0
        
    #Output this tweet with the format of  the rolling average degree 
    print '%.2f \n' %avg_degree
    #output_file.write('%.2f \n' %avg_degree)


4.60 

3.86 

7.00 

6.37 

3.83 

2.28 

39.25 

39.25 

37.88 

36.92 

36.70 

35.93 

35.52 

35.28 

34.82 

33.52 

33.40 

33.09 

32.43 

32.29 

31.66 

31.47 

31.25 

30.87 

29.57 

29.42 

28.98 

28.40 

27.86 

27.50 

27.32 

27.19 

26.54 

26.20 

25.43 

25.16 

24.66 

23.27 

22.87 

22.62 

22.03 

21.25 

20.41 

19.80 

19.68 

18.30 

17.71 

16.77 

16.80 

15.77 

14.79 

14.14 

13.68 

12.84 

12.08 

10.74 

10.47 

10.56 

9.46 

9.24 

8.90 

8.78 

7.76 

7.01 

7.07 

6.62 

4.97 

4.03 

3.43 

2.53 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

3.56 

3.56 

3.56 

3.56 

3.56 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

3.87 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.00 

2.33 

2.33 

3.00 

3.00 

3.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.00 

0.0