Showing 9 changed files with 127,170 additions and 0 deletions.
@@ -1,4 +1,6 @@
tmp/
.ipynb_checkpoints/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -0,0 +1,139 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 01: Preprocess Reddit data\n",
    "\n",
    "This script reads data extracted from the [Reddit comments corpus](https://files.pushshift.io/reddit/comments/), cleans it and converts it to JSONL with a `\"text\"` property and `\"meta\"` data (by default, the name of the subreddit and the timestamp). The resulting JSONL can be used for annotation with [Prodigy](https://prodi.gy)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUT_DATA = \"./raw_data\"  # .gz archive or directory of archives\n",
    "OUTPUT_FILE = \"./reddit.jsonl\"  # path to output JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Reddit(object):\n",
    "    \"\"\"Stream cleaned comments from Reddit.\"\"\"\n",
    "\n",
    "    pre_format_re = re.compile(r\"^[\\`\\*\\~]\")\n",
    "    post_format_re = re.compile(r\"[\\`\\*\\~]$\")\n",
    "    url_re = re.compile(r\"\\[([^]]+)\\]\\(%%URL\\)\")\n",
    "    link_re = re.compile(r\"\\[([^]]+)\\]\\(https?://[^\\)]+\\)\")\n",
    "\n",
    "    def __init__(\n",
    "        self, file_path, meta_keys={\"subreddit\": \"section\", \"created_utc\": \"utc\"}\n",
    "    ):\n",
    "        \"\"\"\n",
    "        file_path (unicode / Path): Path to archive or directory of archives.\n",
    "        meta_keys (dict): Meta data keys included in the Reddit corpus, mapped\n",
    "            to their display names in the Prodigy meta.\n",
    "        RETURNS (Reddit): The Reddit loader.\n",
    "        \"\"\"\n",
    "        self.meta = meta_keys\n",
    "        self.file_path = Path(file_path)\n",
    "        if not self.file_path.exists():\n",
    "            raise IOError(f\"Can't find file path: {self.file_path}\")\n",
    "\n",
    "    def __iter__(self):\n",
    "        for file_path in self.iter_files():\n",
    "            with gzip.open(str(file_path), \"rb\") as f:\n",
    "                for line in f:\n",
    "                    line = line.strip()\n",
    "                    if not line:\n",
    "                        continue\n",
    "                    comment = srsly.json_loads(line)\n",
    "                    if self.is_valid(comment):\n",
    "                        text = self.strip_tags(comment[\"body\"])\n",
    "                        yield {\"text\": text, \"meta\": self.get_meta(comment)}\n",
    "\n",
    "    def get_meta(self, item):\n",
    "        return {name: item.get(key, \"n/a\") for key, name in self.meta.items()}\n",
    "\n",
    "    def iter_files(self):\n",
" if not self.file_path.is_dir():\n", | ||
" return [self.file_path]\n", | ||
" yield from self.file_path.glob(\"**/*.gz\")\n", | ||
"\n", | ||
" def strip_tags(self, text):\n", | ||
" text = self.link_re.sub(r\"\\1\", text)\n", | ||
" text = text.replace(\">\", \">\").replace(\"<\", \"<\")\n", | ||
" text = self.pre_format_re.sub(\"\", text)\n", | ||
" text = self.post_format_re.sub(\"\", text)\n", | ||
" text = re.sub(r\"\\s+\", \" \", text)\n", | ||
" return text.strip()\n", | ||
"\n", | ||
" def is_valid(self, comment):\n", | ||
" return (\n", | ||
" comment[\"body\"] is not None\n", | ||
" and comment[\"body\"] != \"[deleted]\"\n", | ||
" and comment[\"body\"] != \"[removed]\"\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"stream = Reddit(INPUT_DATA)\n", | ||
"srsly.write_jsonl(OUTPUT_FILE, stream)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.7.2 64-bit ('.env': venv)", | ||
"language": "python", | ||
"name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
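As a quick sanity check after running the first notebook, you can stream a record back out of the JSONL file it writes (a minimal sketch; the example record in the comment is hypothetical and depends on your raw data):

```python
import srsly

# Peek at the first cleaned record without loading the whole file into memory
for eg in srsly.read_jsonl("./reddit.jsonl"):
    print(eg)  # e.g. {"text": "Try brining the chicken overnight.", "meta": {"section": "Cooking", "utc": 1325376000}}
    break
```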
@@ -0,0 +1,100 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 02: Process text and counts\n",
    "\n",
    "This script uses a pretrained [spaCy](https://spacy.io) model to extract entities from JSONL-formatted data and count them. It expects each record to have a `\"meta\"` dict with a `\"utc\"` value containing the UTC timestamp. Counts are aggregated per month for each entity and saved out as a CSV. For example:\n",
    "\n",
    "```csv\n",
    ",2012-01,2012-02\n",
    "meat,1011.0,873.0\n",
    "salt,805.0,897.0\n",
    "chicken,694.0,713.0\n",
    "```\n",
    "\n",
    "> ⚠️ **Important note:** If you have a lot of data, you probably want to split up your raw data and run multiple jobs in parallel. The next script, which calculates the final counts and variance, can take a directory of `.csv` files as its input, so reconciling the counts afterwards is no problem."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SPACY_MODEL = \"./food_model\"  # path to spaCy model with entity recognizer\n",
    "DATA_FILE = \"./reddit.jsonl\"  # preprocessed Reddit data created in previous step\n",
    "OUTPUT_FILE = \"./raw_counts.csv\"  # path to output file\n",
    "N_PROCESSES = 16  # number of processes for multiprocessing\n",
    "ENTITY_LABEL = \"INGRED\"  # label of entity to count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install spacy srsly pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datetime import datetime\n",
    "\n",
    "import pandas as pd\n",
    "import spacy\n",
    "import srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "counts = defaultdict(Counter)\n",
    "nlp = spacy.load(SPACY_MODEL)\n",
    "data = srsly.read_jsonl(DATA_FILE)\n",
    "\n",
    "data_tuples = ((eg[\"text\"], eg) for eg in data)\n",
"for doc, eg in nlp.pipe(data_tuples, as_tuples=True, n_process=N_PROCESSES):\n", | ||
" timestamp = int(eg[\"meta\"][\"utc\"])\n", | ||
" year_month = datetime.utcfromtimestamp(timestamp).strftime(\"%Y-%m\")\n", | ||
" for ent in doc.ents:\n", | ||
" if ent.label_ == ENTITY_LABEL:\n", | ||
" counts[ent.lower_][year_month] += 1\n", | ||
"\n", | ||
"df = pd.DataFrame(data=counts).transpose()\n", | ||
"df.to_csv(OUTPUT_FILE)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.7.2 64-bit ('.env': venv)", | ||
"language": "python", | ||
"name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
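Picking up the note above about splitting the raw data and running multiple jobs in parallel: if each job writes its own CSV of raw counts, the partial tables can be summed back into one before the next step. A minimal sketch, assuming the partial files live in a hypothetical `./partial_counts` directory:

```python
from functools import reduce
from pathlib import Path

import pandas as pd

# Load each partial counts table: entities as rows, months as columns
frames = [pd.read_csv(p, index_col=0) for p in Path("./partial_counts").glob("*.csv")]
# Sum the tables, aligning on entity and month and treating missing cells as zero
merged = reduce(lambda a, b: a.add(b, fill_value=0), frames)
merged.to_csv("./raw_counts.csv")
```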