In [2]:

from datetime import datetime
import json
import io
import string

from pipeline import build_csv, Pipeline
from stop_words import stop_words
import csv


In [3]:

pipeline = Pipeline()

@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json', 'r') as f:
        data = json.load(f)
        stories = data['stories']
    return stories

@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    def is_popular(story):
        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')
    
    return (
        story for story in stories
        if is_popular(story)
    )

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    lines = []
    for story in stories:
        lines.append(
            (story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), story['url'], story['points'], story['title'])
        )
    return build_csv(lines, header=['objectID', 'created_at', 'url', 'points', 'title'], file=io.StringIO())

@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    
    return (line[idx] for line in reader)

@pipeline.task(depends_on=extract_titles)
def clean_title(titles):
    for title in titles:
        title = title.lower()
        title = ''.join(c for c in title if c not in string.punctuation)
        yield title
        
@pipeline.task(depends_on=clean_title)
def build_keyword_dictionary(titles):
    word_freq = {}
    for title in titles:
        for word in title.split(' '):
            if word and word not in stop_words:
                if word not in word_freq:
                    word_freq[word] = 1
                word_freq[word] += 1
    return word_freq  

@pipeline.task(depends_on=build_keyword_dictionary)
def top_keywords(word_freq):
    freq_tuple = [
        (word, word_freq[word])
        for word in sorted(word_freq, key=word_freq.get, reverse=True)
    ]
    return freq_tuple[:100]

ran = pipeline.run()


ran

{<function __main__.clean_title>: <generator object clean_title at 0x7f0ac2a3b288>,
 <function __main__.filter_stories>: <generator object <genexpr> at 0x7f0ac2a3b168>,
 <function __main__.json_to_csv>: <_io.StringIO at 0x7f0ac2a964c8>,
 <function __main__.build_keyword_dictionary>: {'contrasting': 2,
  'dockerbased': 2,
  '“don’t': 3,
  'dynosaur': 2,
  'exceptional': 2,
  'test': 16,
  'adobe’s': 2,
  'russian': 6,
  'hawaii': 2,
  'smooth': 2,
  'teletype': 2,
  's10': 2,
  'mckenzie': 2,
  'project': 41,
  'emerges': 3,
  'autoenroll': 2,
  'principle': 4,
  'muscle': 2,
  'streem': 2,
  'oppose': 2,
  'ambitious': 7,
  'mitigation': 2,
  'embassys': 2,
  'germany': 4,
  'template': 3,
  '“stolen”': 2,
  'rare': 8,
  'shut': 8,
  'ghostery': 2,
  'tsunami': 2,
  'cctld': 2,
  'microsystems': 2,
  'background': 6,
  'fusion': 4,
  'board': 14,
  'atom': 6,
  'shutdown': 3,
  'tamiflu': 2,
  'settlement': 3,
  'oatmeal': 2,
  'prototypes': 4,
  'codejquerycom': 2,
  'airlines': 11,
 