In [8]:
from stop_words import stop_words
from pipeline import build_csv
from pipeline import Pipeline
from datetime import datetime

import string
import json
import csv
import io

pipeline = Pipeline()

@pipeline.task()
def file_to_json():
    with open("hn_stories_2014.json", "r") as f:
        temp_dict = json.load(f)
    
    return temp_dict["stories"]


@pipeline.task(depends_on=file_to_json)
def filter_stories(stories_list):
    def popular_story(story):
        return story["points"] < 50 and story["num_comments"] > 1 and not story['title'].startswith('Ask HN')
    return (story for story in stories_list if popular_story(story))


@pipeline.task(depends_on=filter_stories)
def json_to_csv(filtered_json):
    lines = []
    for story in filtered_json:
        lines.append(
            (story['objectID'], 
             datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"),
             story['url'], 
             story['points'], 
             story['title']
            )
        )
        
    returned_csv = build_csv(
        lines, 
        header=['objectID', 'created_at', 'url', 'points', 'title'], 
        file=io.StringIO()
    )
    
    
    return returned_csv


@pipeline.task(depends_on=json_to_csv)
def extract_titles(filtered_csv):
    reader = csv.reader(filtered_csv)
    header = next(reader)
    idx = header.index('title')
    
    return (line[idx] for line in reader)


@pipeline.task(depends_on=extract_titles)
def clean_titles(title_list):
    for title in title_list:
        title = ''.join(c for c in title if c not in string.punctuation)
        title = title.lower()
        yield title

    
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(cleaned_titles):
    return_dict = {}
    for title in cleaned_titles:
        for word in title.split(" "):
            if word and word not in stop_words:
                if word not in return_dict:
                    return_dict[word] = 1

                return_dict[word] += 1
                
    return return_dict


@pipeline.task(depends_on=build_keyword_dictionary)
def top_100(returned_dict):
    top_100_list = [
        (word, returned_dict[word])
        for word in sorted(returned_dict, key=returned_dict.get , reverse=True)
    ]
    
    return top_100_list[:100]


final_list = pipeline.run()
print(final_list[top_100])

[('new', 311), ('google', 308), ('app', 238), ('bitcoin', 196), ('startup', 190), ('web', 166), ('data', 154), ('facebook', 148), ('open', 135), ('code', 134), ('use', 132), ('using', 131), ('ios', 116), ('free', 114), ('time', 112), ('tech', 111), ('like', 106), ('game', 105), ('apple', 105), ('video', 101), ('people', 99), ('world', 98), ('programming', 98), ('make', 96), ('software', 96), ('apps', 95), ('way', 94), ('javascript', 92), ('microsoft', 91), ('github', 89), ('startups', 89), ('source', 88), ('email', 87), ('2014', 86), ('news', 85), ('twitter', 85), ('project', 84), ('internet', 84), ('windows', 84), ('mobile', 82), ('android', 79), ('vs', 78), ('dont', 77), ('1', 76), ('work', 76), ('2013', 76), ('users', 76), ('just', 75), ('service', 75), ('does', 74), ('better', 74), ('c', 74), ('security', 73), ('api', 71), ('need', 71), ('job', 70), ('help', 70), ('says', 69), ('yc', 68), ('hacker', 68), ('ceo', 68), ('best', 68), ('build', 68), ('online', 66), ('simple', 65), ('bu