# Hacker News Pipeline

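In this project we build a pipeline that cleans and summarizes a large JSON file of Hacker News stories, ending with the 100 most common words found in story titles. Where applicable, the steps are written as generators (`yield`), so records are streamed from one step to the next instead of being fully materialized, which keeps memory use low and can speed up processing. Because every step is a separate task whose output is kept, the pipeline lets the user troubleshoot at any step of the process.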

In [15]:
from datetime import datetime
import json
import csv
import io
import string
import re
from stop_words import stop_words # for cleaning filler words

# Matches any single character that is neither a word character nor whitespace.
_NON_WORD_NON_SPACE = re.compile(r'[^\w\s]', flags=re.UNICODE)


def remove_punctuation(text):
    """Return ``text`` with every non-word, non-whitespace character removed.

    Word characters (letters, digits and the underscore) and whitespace are
    left untouched, so this also strips non-ASCII punctuation.
    """
    return _NON_WORD_NON_SPACE.sub('', text)

from pipeline import Pipeline, build_csv

# Pipeline instance whose ``task`` decorator is applied to every step defined
# below; ``depends_on`` names the step whose output a task receives as input.
pipeline = Pipeline()

@pipeline.task()
def file_to_json():
    """Load the raw story records from ``hn_stories_2014.json``.

    Returns:
        The value stored under the top-level ``'stories'`` key of the parsed
        JSON document.

    Raises:
        OSError: if the data file cannot be opened (e.g. it is missing from
            the working directory).
        KeyError: if the document has no ``'stories'`` key.
    """
    # JSON text is UTF-8 by specification; pin the encoding so that parsing
    # does not depend on the platform's locale default.
    with open('hn_stories_2014.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['stories']

@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily yield only the stories that pass the popularity filter.

    A story is kept when it has more than 50 points, more than one comment,
    and a title that does not begin with "Ask HN". Each story is expected to
    be a mapping with ``'points'``, ``'num_comments'`` and ``'title'`` keys.
    """
    for entry in stories:
        # Checks run in the same order as a short-circuiting ``and`` chain.
        if not entry['points'] > 50:
            continue
        if not entry['num_comments'] > 1:
            continue
        if entry['title'].startswith("Ask HN"):
            continue
        yield entry

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Convert the selected story fields into CSV rows in an in-memory buffer.

    Every story contributes one row holding its object ID, its creation time
    parsed into a ``datetime``, its URL, its points and its title. The rows
    are handed to ``build_csv`` together with a matching header and a fresh
    ``io.StringIO``; whatever ``build_csv`` returns is returned unchanged
    (presumably the populated file object, since the next task feeds it to
    ``csv.reader`` -- confirm against the ``pipeline`` module).
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    rows = [
        (
            record['objectID'],
            datetime.strptime(record['created_at'], timestamp_format),
            record['url'],
            record['points'],
            record['title'],
        )
        for record in stories
    ]
    return build_csv(rows, header=columns, file=io.StringIO())
    
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Yield the title column of every data row in ``csv_file``.

    Args:
        csv_file: An iterable of CSV text lines (for example an open text
            file) whose first row is a header and whose fifth column holds
            the story title.

    Yields:
        The fifth field of each row that follows the header.

    Raises:
        IndexError: if a data row has fewer than five fields.
    """
    reader = csv.reader(csv_file)
    # Skip the header row. Supplying a default keeps an empty input from
    # raising StopIteration inside this generator, which Python would convert
    # into a RuntimeError (PEP 479); an empty input now simply yields nothing.
    next(reader, None)
    for row in reader:
        yield row[4]
        
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Yield each title lower-cased and with punctuation removed.

    Args:
        titles: An iterable of title strings.

    Yields:
        The normalized form of each title, in input order.
    """
    # Build the translation table once instead of once per title; it deletes
    # every ASCII punctuation character, including the underscore that the
    # regex in ``remove_punctuation`` treats as a word character.
    strip_ascii_punctuation = str.maketrans('', '', string.punctuation)
    for title in titles:
        # ``remove_punctuation`` then drops any remaining (non-ASCII) symbols.
        yield remove_punctuation(title.lower().translate(strip_ascii_punctuation))
        
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """Count how often each non-stop word occurs across all titles.

    Args:
        titles: An iterable of title strings (expected to be already
            lower-cased and stripped of punctuation).

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences, in
        order of first appearance. Words found in ``stop_words`` are skipped.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # ``split()`` with no argument splits on any run of whitespace and never
    # produces empty strings, so words separated by tabs, newlines or
    # non-breaking spaces are counted separately rather than as one token.
    counts = Counter(
        word
        for title in titles
        for word in title.split()
        if word not in stop_words
    )
    # Return a plain dict so callers see the same type as before.
    return dict(counts)

@pipeline.task(depends_on=build_keyword_dictionary)
def sort_top_words(word_dict):
    """Return the 100 most frequent words as ``(word, count)`` pairs.

    Pairs are ordered from highest to lowest count; words with equal counts
    keep the relative order they have in ``word_dict``.
    """
    def _count(pair):
        return pair[1]

    ranked = list(word_dict.items())
    ranked.sort(key=_count, reverse=True)
    return ranked[:100]

In [18]:
result = pipeline.run() # note: this returns a dictionary keyed by the task functions in the pipeline
top_100 = result[sort_top_words] # viewing a single function's result in the pipeline

In [19]:
top_100

[('new', 185),
 ('google', 167),
 ('bitcoin', 101),
 ('open', 93),
 ('programming', 90),
 ('web', 89),
 ('data', 86),
 ('video', 79),
 ('python', 75),
 ('facebook', 72),
 ('code', 72),
 ('using', 71),
 ('released', 71),
 ('2013', 65),
 ('javascript', 65),
 ('free', 64),
 ('game', 64),
 ('source', 64),
 ('internet', 63),
 ('microsoft', 59),
 ('linux', 59),
 ('c', 59),
 ('app', 58),
 ('dont', 57),
 ('pdf', 55),
 ('work', 54),
 ('language', 54),
 ('software', 52),
 ('2014', 52),
 ('startup', 51),
 ('apple', 50),
 ('use', 50),
 ('make', 50),
 ('time', 48),
 ('yc', 48),
 ('security', 48),
 ('nsa', 45),
 ('github', 45),
 ('windows', 44),
 ('1', 41),
 ('world', 41),
 ('way', 41),
 ('like', 41),
 ('heartbleed', 41),
 ('project', 40),
 ('computer', 40),
 ('git', 37),
 ('users', 37),
 ('twitter', 37),
 ('design', 37),
 ('ios', 37),
 ('developer', 36),
 ('os', 36),
 ('ceo', 36),
 ('vs', 36),
 ('big', 36),
 ('life', 36),
 ('day', 35),
 ('android', 34),
 ('simple', 34),
 ('online', 34),
 ('years', 