Add Mission 267 Solutions

dataquestio · Jan 4, 2018 · 4fcdc73 · 4fcdc73
1 parent 178fee9
commit 4fcdc73
Showing 1 changed file with 123 additions and 0 deletions.
diff --git a/Mission267Solutions.ipynb b/Mission267Solutions.ipynb
@@ -0,0 +1,123 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('new', 186), ('google', 168), ('bitcoin', 102), ('open', 93), ('programming', 91), ('web', 89), ('data', 86), ('video', 80), ('python', 76), ('code', 73), ('facebook', 72), ('released', 72), ('using', 71), ('javascript', 66), ('2013', 66), ('free', 65), ('source', 65), ('game', 64), ('internet', 63), ('microsoft', 60), ('c', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('language', 55), ('work', 55), ('software', 53), ('2014', 53), ('startup', 52), ('apple', 51), ('make', 51), ('use', 51), ('time', 49), ('yc', 49), ('security', 49), ('github', 46), ('nsa', 46), ('windows', 45), ('like', 42), ('world', 42), ('way', 42), ('computer', 41), ('heartbleed', 41), ('1', 41), ('project', 41), ('design', 38), ('users', 38), ('dont', 38), ('ios', 38), ('git', 38), ('vs', 37), ('developer', 37), ('os', 37), ('life', 37), ('twitter', 37), ('ceo', 37), ('day', 36), ('big', 36), ('online', 35), ('android', 35), ('years', 34), ('simple', 34), ('court', 34), ('mt', 33), ('apps', 33), ('says', 33), ('api', 33), ('browser', 33), ('guide', 33), ('learning', 33), ('mozilla', 32), ('site', 32), ('gox', 32), ('firefox', 32), ('engine', 32), ('problem', 32), ('server', 32), ('fast', 32), ('amazon', 31), ('year', 31), ('introducing', 31), ('support', 30), ('better', 30), ('stop', 30), ('million', 30), ('text', 30), ('people', 30), ('built', 30), ('does', 29), ('development', 29), ('tech', 29), ('3', 29), ('just', 28), ('inside', 28), ('did', 28), ('library', 28), ('money', 28), ('website', 28), ('chrome', 28), ('2048', 28)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime\n",
+    "import json\n",
+    "import io\n",
+    "import string\n",
+    "\n",
+    "from pipeline import build_csv, Pipeline\n",
+    "from stop_words import stop_words\n",
+    "\n",
+    "pipeline = Pipeline()\n",
+    "\n",
+    "@pipeline.task()\n",
+    "def file_to_json():\n",
+    "    # Load hn_stories_2014.json and return the value stored under its 'stories' key.\n",
+    "    with open('hn_stories_2014.json', 'r') as dump:\n",
+    "        parsed = json.load(dump)\n",
+    "    return parsed['stories']\n",
+    "\n",
+    "@pipeline.task(depends_on=file_to_json)\n",
+    "def filter_stories(stories):\n",
+    "    # Lazily keep stories with more than 50 points and more than 1 comment,\n",
+    "    # skipping any whose title starts with 'Ask HN'.\n",
+    "    def keep(item):\n",
+    "        if not (item['points'] > 50 and item['num_comments'] > 1):\n",
+    "            return False\n",
+    "        return not item['title'].startswith('Ask HN')\n",
+    "    return (item for item in stories if keep(item))\n",
+    "\n",
+    "@pipeline.task(depends_on=filter_stories)\n",
+    "def json_to_csv(stories):\n",
+    "    # Turn each story into a row tuple and pass the rows to build_csv with an in-memory StringIO.\n",
+    "    rows = [\n",
+    "        (entry['objectID'], datetime.strptime(entry['created_at'], \"%Y-%m-%dT%H:%M:%SZ\"), entry['url'], entry['points'], entry['title'])\n",
+    "        for entry in stories\n",
+    "    ]\n",
+    "    return build_csv(rows, header=['objectID', 'created_at', 'url', 'points', 'title'], file=io.StringIO())\n",
+    "\n",
+    "@pipeline.task(depends_on=json_to_csv)\n",
+    "def extract_titles(csv_file):\n",
+    "    import csv  # this cell never imports csv at the top; without this a fresh kernel raises NameError\n",
+    "    reader = csv.reader(csv_file)\n",
+    "    title_idx = next(reader).index('title')  # the first row is the header\n",
+    "    # Lazily yield the 'title' column of every remaining row.\n",
+    "    return (line[title_idx] for line in reader)\n",
+    "\n",
+    "@pipeline.task(depends_on=extract_titles)\n",
+    "def clean_title(titles):\n",
+    "    # Lazily lower-case each title and delete every string.punctuation character.\n",
+    "    strip_punctuation = str.maketrans('', '', string.punctuation)\n",
+    "    for raw_title in titles:\n",
+    "        yield raw_title.lower().translate(strip_punctuation)\n",
+    "\n",
+    "@pipeline.task(depends_on=clean_title)\n",
+    "def build_keyword_dictionary(titles):\n",
+    "    # Count how often each non-empty word that is not in stop_words occurs across all titles.\n",
+    "    word_freq = {}\n",
+    "    for title in titles:\n",
+    "        for word in title.split(' '):\n",
+    "            if word and word not in stop_words:\n",
+    "                # .get(word, 0) makes a first occurrence count as 1 (it used to be recorded as 2).\n",
+    "                word_freq[word] = word_freq.get(word, 0) + 1\n",
+    "    return word_freq\n",
+    "\n",
+    "@pipeline.task(depends_on=build_keyword_dictionary)\n",
+    "def top_keywords(word_freq):\n",
+    "    # Return (word, count) pairs for the 100 most frequent words, highest count first.\n",
+    "    def count_of(pair):\n",
+    "        return pair[1]\n",
+    "    ranked = sorted(word_freq.items(), key=count_of, reverse=True)\n",
+    "    return ranked[:100]\n",
+    "\n",
+    "ran = pipeline.run()\n",
+    "print(ran[top_keywords])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}