Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
123 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[('new', 186), ('google', 168), ('bitcoin', 102), ('open', 93), ('programming', 91), ('web', 89), ('data', 86), ('video', 80), ('python', 76), ('code', 73), ('facebook', 72), ('released', 72), ('using', 71), ('javascript', 66), ('2013', 66), ('free', 65), ('source', 65), ('game', 64), ('internet', 63), ('microsoft', 60), ('c', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('language', 55), ('work', 55), ('software', 53), ('2014', 53), ('startup', 52), ('apple', 51), ('make', 51), ('use', 51), ('time', 49), ('yc', 49), ('security', 49), ('github', 46), ('nsa', 46), ('windows', 45), ('like', 42), ('world', 42), ('way', 42), ('computer', 41), ('heartbleed', 41), ('1', 41), ('project', 41), ('design', 38), ('users', 38), ('dont', 38), ('ios', 38), ('git', 38), ('vs', 37), ('developer', 37), ('os', 37), ('life', 37), ('twitter', 37), ('ceo', 37), ('day', 36), ('big', 36), ('online', 35), ('android', 35), ('years', 34), ('simple', 34), ('court', 34), ('mt', 33), ('apps', 33), ('says', 33), ('api', 33), ('browser', 33), ('guide', 33), ('learning', 33), ('mozilla', 32), ('site', 32), ('gox', 32), ('firefox', 32), ('engine', 32), ('problem', 32), ('server', 32), ('fast', 32), ('amazon', 31), ('year', 31), ('introducing', 31), ('support', 30), ('better', 30), ('stop', 30), ('million', 30), ('text', 30), ('people', 30), ('built', 30), ('does', 29), ('development', 29), ('tech', 29), ('3', 29), ('just', 28), ('inside', 28), ('did', 28), ('library', 28), ('money', 28), ('website', 28), ('chrome', 28), ('2048', 28)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from datetime import datetime\n", | ||
"import json\n", | ||
"import io\n", | ||
"import string\n", | ||
"\n", | ||
"from pipeline import build_csv, Pipeline\n", | ||
"from stop_words import stop_words\n", | ||
"\n", | ||
"pipeline = Pipeline()\n", | ||
"\n", | ||
"@pipeline.task()\n", | ||
"def file_to_json():\n", | ||
" with open('hn_stories_2014.json', 'r') as f:\n", | ||
" data = json.load(f)\n", | ||
" stories = data['stories']\n", | ||
" return stories\n", | ||
"\n", | ||
"@pipeline.task(depends_on=file_to_json)\n", | ||
"def filter_stories(stories):\n", | ||
" def is_popular(story):\n", | ||
" return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')\n", | ||
" \n", | ||
" return (\n", | ||
" story for story in stories\n", | ||
" if is_popular(story)\n", | ||
" )\n", | ||
"\n", | ||
"@pipeline.task(depends_on=filter_stories)\n", | ||
"def json_to_csv(stories):\n", | ||
" lines = []\n", | ||
" for story in stories:\n", | ||
" lines.append(\n", | ||
" (story['objectID'], datetime.strptime(story['created_at'], \"%Y-%m-%dT%H:%M:%SZ\"), story['url'], story['points'], story['title'])\n", | ||
" )\n", | ||
" return build_csv(lines, header=['objectID', 'created_at', 'url', 'points', 'title'], file=io.StringIO())\n", | ||
"\n", | ||
"@pipeline.task(depends_on=json_to_csv)\n", | ||
"def extract_titles(csv_file):\n", | ||
" reader = csv.reader(csv_file)\n", | ||
" header = next(reader)\n", | ||
" idx = header.index('title')\n", | ||
" \n", | ||
" return (line[idx] for line in reader)\n", | ||
"\n", | ||
"@pipeline.task(depends_on=extract_titles)\n", | ||
"def clean_title(titles):\n", | ||
" for title in titles:\n", | ||
" title = title.lower()\n", | ||
" title = ''.join(c for c in title if c not in string.punctuation)\n", | ||
" yield title\n", | ||
"\n", | ||
"@pipeline.task(depends_on=clean_title)\n", | ||
"def build_keyword_dictionary(titles):\n", | ||
" word_freq = {}\n", | ||
" for title in titles:\n", | ||
" for word in title.split(' '):\n", | ||
" if word and word not in stop_words:\n", | ||
" if word not in word_freq:\n", | ||
" word_freq[word] = 1\n", | ||
" word_freq[word] += 1\n", | ||
" return word_freq\n", | ||
"\n", | ||
"@pipeline.task(depends_on=build_keyword_dictionary)\n", | ||
"def top_keywords(word_freq):\n", | ||
" freq_tuple = [\n", | ||
" (word, word_freq[word])\n", | ||
" for word in sorted(word_freq, key=word_freq.get, reverse=True)\n", | ||
" ]\n", | ||
" return freq_tuple[:100]\n", | ||
"\n", | ||
"ran = pipeline.run()\n", | ||
"print(ran[top_keywords])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"anaconda-cloud": {}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |