Skip to content

Commit

Permalink
Add ner-food-ingredients project
Browse files Browse the repository at this point in the history
  • Loading branch information
ines committed Mar 16, 2020
1 parent b8a12ba commit 7fd45f1
Show file tree
Hide file tree
Showing 9 changed files with 127,170 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
tmp/
.ipynb_checkpoints/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
139 changes: 139 additions & 0 deletions ner-food-ingredients/01_Preprocess_Reddit.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 01: Preprocess Reddit data\n",
"\n",
"This script reads data extracted from the [Reddit comments corpus](https://files.pushshift.io/reddit/comments/), cleans it and converts it to JSONL with a `\"text\"` property and `\"meta\"` data (by default, the name of the subreddit and timestamp). The resulting JSONL can be used for annotation with [Prodigy](https://prodi.gy)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"INPUT_DATA = \"./raw_data\" # .gz archive, or directory that is searched recursively for .gz archives\n",
"OUTPUT_FILE = \"./reddit.jsonl\" # path to output JSONL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install srsly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from pathlib import Path\n",
"import gzip\n",
"import srsly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Reddit(object):\n",
"    \"\"\"Stream cleaned comments from Reddit.\n",
"\n",
"    Iterating over an instance reads gzip-compressed archives of\n",
"    newline-delimited JSON comments and yields one dict per valid comment,\n",
"    holding the cleaned text and the selected meta data.\n",
"    \"\"\"\n",
"\n",
"    pre_format_re = re.compile(r\"^[\\`\\*\\~]\")\n",
"    post_format_re = re.compile(r\"[\\`\\*\\~]$\")\n",
"    url_re = re.compile(r\"\\[([^]]+)\\]\\(%%URL\\)\")\n",
"    link_re = re.compile(r\"\\[([^]]+)\\]\\(https?://[^\\)]+\\)\")\n",
"\n",
"    def __init__(\n",
"        self, file_path, meta_keys={\"subreddit\": \"section\", \"created_utc\": \"utc\"}\n",
"    ):\n",
"        \"\"\"\n",
"        file_path (unicode / Path): Path to archive or directory of archives.\n",
"        meta_keys (dict): Meta data keys included in the Reddit corpus, mapped\n",
"            to their display names in the Prodigy meta.\n",
"        RETURNS (Reddit): The Reddit loader.\n",
"        RAISES (IOError): If file_path does not exist.\n",
"        \"\"\"\n",
"        # Copy the mapping so neither the shared default dict nor the caller's\n",
"        # dict is aliased by this instance.\n",
"        self.meta = dict(meta_keys)\n",
"        self.file_path = Path(file_path)\n",
"        if not self.file_path.exists():\n",
"            raise IOError(f\"Can't find file path: {self.file_path}\")\n",
"\n",
"    def __iter__(self):\n",
"        \"\"\"Yield a dict with 'text' and 'meta' for every valid comment.\"\"\"\n",
"        for file_path in self.iter_files():\n",
"            with gzip.open(str(file_path), \"rb\") as f:\n",
"                for line in f:\n",
"                    line = line.strip()\n",
"                    if not line:\n",
"                        continue\n",
"                    comment = srsly.json_loads(line)\n",
"                    if self.is_valid(comment):\n",
"                        text = self.strip_tags(comment[\"body\"])\n",
"                        yield {\"text\": text, \"meta\": self.get_meta(comment)}\n",
"\n",
"    def get_meta(self, item):\n",
"        \"\"\"Map the configured corpus keys of a comment to their display names.\n",
"\n",
"        Keys that are missing from the comment get the value 'n/a'.\n",
"        \"\"\"\n",
"        return {name: item.get(key, \"n/a\") for key, name in self.meta.items()}\n",
"\n",
"    def iter_files(self):\n",
"        \"\"\"Yield the archive itself, or every .gz file below a directory.\"\"\"\n",
"        if not self.file_path.is_dir():\n",
"            # This method is a generator, so returning a list would end the\n",
"            # iteration without producing the path. It has to be yielded.\n",
"            yield self.file_path\n",
"            return\n",
"        yield from self.file_path.glob(\"**/*.gz\")\n",
"\n",
"    def strip_tags(self, text):\n",
"        \"\"\"Clean the raw body of a comment.\n",
"\n",
"        Replaces Markdown links with their link text, unescapes &gt; and &lt;,\n",
"        removes one formatting character (backtick, asterisk or tilde) from\n",
"        the start and from the end, and collapses runs of whitespace.\n",
"        \"\"\"\n",
"        text = self.link_re.sub(r\"\\1\", text)\n",
"        text = text.replace(\"&gt;\", \">\").replace(\"&lt;\", \"<\")\n",
"        text = self.pre_format_re.sub(\"\", text)\n",
"        text = self.post_format_re.sub(\"\", text)\n",
"        text = re.sub(r\"\\s+\", \" \", text)\n",
"        return text.strip()\n",
"\n",
"    def is_valid(self, comment):\n",
"        \"\"\"Check that a comment has a body that was not deleted or removed.\"\"\"\n",
"        # Records without a body are treated as invalid instead of raising.\n",
"        body = comment.get(\"body\")\n",
"        return body is not None and body not in (\"[deleted]\", \"[removed]\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reddit yields cleaned records one at a time while iterating, so the comments\n",
"# are produced as write_jsonl consumes the stream.\n",
"stream = Reddit(INPUT_DATA)\n",
"srsly.write_jsonl(OUTPUT_FILE, stream)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.2 64-bit ('.env': venv)",
"language": "python",
"name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
100 changes: 100 additions & 0 deletions ner-food-ingredients/02_Process_text_and_counts.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 02: Process text and counts\n",
"\n",
"This script uses a pretrained [spaCy](https://spacy.io) model to extract entities from JSONL-formatted data and count them. It expects each record to have a `\"meta\"` dict with a `\"utc\"` value containing the UTC timestamp. Counts are generated per entity and per month and saved as a CSV. For example:\n",
"\n",
"```csv\n",
",2012-01,2012-02\n",
"meat,1011.0,873.0\n",
"salt,805.0,897.0\n",
"chicken,694.0,713.0\n",
"```\n",
"\n",
"> ⚠️ **Important note:** If you have a lot of data, you probably want to split up your raw data and run multiple jobs in parallel. The next script that calculates the final counts and variance can take a directory of `.csv` files as its input, so reconciling the counts afterwards is no problem."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SPACY_MODEL = \"./food_model\" # path to spaCy model with entity recognizer\n",
"DATA_FILE = \"./reddit.jsonl\" # preprocessed Reddit data created in previous step\n",
"OUTPUT_FILE = \"./raw_counts.csv\" # path to the output CSV holding the counts\n",
"N_PROCESSES = 16 # number of processes for multiprocessing, passed to nlp.pipe as n_process\n",
"ENTITY_LABEL = \"INGRED\" # label of entity to count; entities with other labels are ignored"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install spacy srsly pandas"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"from collections import Counter, defaultdict\n",
"import srsly\n",
"from datetime import datetime\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nlp = spacy.load(SPACY_MODEL)\n",
"data = srsly.read_jsonl(DATA_FILE)\n",
"counts = defaultdict(Counter)  # lowercased entity text -> Counter keyed by year-month\n",
"\n",
"# Send each text through the pipeline together with its full record, so the\n",
"# meta data is still available once the text has been processed.\n",
"data_tuples = ((eg[\"text\"], eg) for eg in data)\n",
"docs = nlp.pipe(data_tuples, as_tuples=True, n_process=N_PROCESSES)\n",
"for doc, eg in docs:\n",
"    timestamp = int(eg[\"meta\"][\"utc\"])\n",
"    year_month = datetime.utcfromtimestamp(timestamp).strftime(\"%Y-%m\")\n",
"    for ent in doc.ents:\n",
"        if ent.label_ != ENTITY_LABEL:\n",
"            continue\n",
"        counts[ent.lower_][year_month] += 1\n",
"\n",
"# Transposed so that there is one row per entity and one column per month.\n",
"df = pd.DataFrame(data=counts).T\n",
"df.to_csv(OUTPUT_FILE)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.2 64-bit ('.env': venv)",
"language": "python",
"name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 7fd45f1

Please sign in to comment.