In [1]:
from pipeline import Pipeline, build_csv
from HNdata import HNScraper
from collections import deque
from datetime import datetime 
import csv
import pickle
import io
import string
import stop_words
import pprint as pp

In [2]:
# Initialize the pipeline class
pipeline = Pipeline()

In [3]:
@pipeline.task()
def file_to_stories():
    with open('stories_2016.pickle', 'rb') as pickle_file:
        stories = pickle.load(pickle_file)
    return stories

In [4]:
@pipeline.task(depends_on=file_to_stories)
def filter_stories(stories):
    """
    Filters stories to find those with a score of 50 or more,
    more than one comment and does not begin with 'Ask HN' and returns
    a list
    """
    def is_popular(story):
        if 'dead' in story or 'deleted' in story:
            return False
        else:
            return story['score'] > 50 and story['descendants'] > 1 and not story['title'].startswith('Ask HN')
    return (story for story in stories if is_popular(story))

In [5]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    header = ['objectID', 'created_at', 'url', 'points', 'title']
    lines = []
    for story in stories:
        if 'url' in story:
            lines.append(
                (story['id'], 
                 datetime.fromtimestamp(story['time']).isoformat(), 
                 story['url'], 
                 story['score'], 
                 story['title'])
            )
        else:
            lines.append(
                (story['id'], 
                 datetime.fromtimestamp(story['time']).isoformat(), 
                 'No URL Available', 
                 story['score'], 
                 story['title'])
            )
    return build_csv(lines, header=header, file=io.StringIO())

In [6]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    
    return (row[idx] for row in reader)

In [7]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    punctuation = set(string.punctuation)
    for title in titles:
        title = title.lower()
        title = ''.join(c for c in title if c not in punctuation)
        yield title

In [8]:
@pipeline.task(depends_on=clean_titles)
def build_keyword_dict(titles):
    stopwords = stop_words.get_stop_words('en')
    word_frequency = {}
    for title in titles:
        for word in title.split():
            if word in stopwords:
                continue
            else:
                if word in word_frequency:
                    word_frequency[word] += 1
                else:
                    word_frequency[word] = 1
    return word_frequency

In [9]:
@pipeline.task(depends_on=build_keyword_dict)
def top_100_words(word_frequency):
    return sorted(word_frequency.items(), key=lambda kv: kv[1], reverse=True)[:100]

In [10]:
pp.pprint(pipeline.tasks.graph)

{<function extract_titles at 0x10ffd9950>: [<function clean_titles at 0x10ffd9c80>],
 <function json_to_csv at 0x10ffd99d8>: [<function extract_titles at 0x10ffd9950>],
 <function file_to_stories at 0x10ffd9a60>: [<function filter_stories at 0x10ffd9bf8>],
 <function top_100_words at 0x10ffd9ae8>: [],
 <function build_keyword_dict at 0x10ffd9b70>: [<function top_100_words at 0x10ffd9ae8>],
 <function filter_stories at 0x10ffd9bf8>: [<function json_to_csv at 0x10ffd99d8>],
 <function clean_titles at 0x10ffd9c80>: [<function build_keyword_dict at 0x10ffd9b70>]}


In [11]:
ran = pipeline.run()

In [12]:
print(ran[top_100_words])

[('–', 37), ('hn', 27), ('show', 27), ('google', 20), ('apple', 17), ('using', 16), ('new', 12), ('linux', 10), ('react', 9), ('data', 9), ('web', 9), ('life', 8), ('san', 8), ('python', 8), ('software', 8), ('can', 7), ('says', 7), ('access', 7), ('now', 7), ('system', 7), ('windows', 7), ('2016', 7), ('us', 7), ('yc', 7), ('code', 7), ('guide', 7), ('app', 7), ('people', 7), ('iphone', 7), ('video', 7), ('phones', 6), ('free', 6), ('c', 6), ('time', 6), ('stop', 6), ('vs', 6), ('fast', 6), ('computer', 6), ('fbi', 6), ('programming', 6), ('swift', 6), ('first', 6), ('2015', 5), ('neural', 5), ('one', 5), ('phone', 5), ('simple', 5), ('go', 5), ('update', 5), ('world', 5), ('better', 5), ('real', 5), ('wifi', 5), ('case', 5), ('bill', 5), ('apps', 5), ('3', 5), ('database', 5), ('hack', 5), ('functional', 5), ('git', 5), ('surveillance', 5), ('just', 5), ('language', 5), ('years', 5), ('dont', 5), ('way', 5), ('internet', 5), ('back', 5), ('security', 5), ('working', 4), ('play', 4), 