Showing 9 changed files with 127,170 additions and 0 deletions.
@@ -1,4 +1,6 @@
tmp/
.ipynb_checkpoints/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -0,0 +1,139 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 01: Preprocess Reddit data\n",
    "\n",
    "This script reads data extracted from the [Reddit comments corpus](https://files.pushshift.io/reddit/comments/), cleans it and converts it to JSONL with a `\"text\"` property and `\"meta\"` data (by default, the name of the subreddit and the timestamp). The resulting JSONL can be used for annotation with [Prodigy](https://prodi.gy)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUT_DATA = \"./raw_data\"  # .gz archive or directory of archives\n",
    "OUTPUT_FILE = \"./reddit.jsonl\"  # path to output JSONL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Reddit(object):\n",
    "    \"\"\"Stream cleaned comments from Reddit.\"\"\"\n",
    "\n",
    "    pre_format_re = re.compile(r\"^[\\`\\*\\~]\")\n",
    "    post_format_re = re.compile(r\"[\\`\\*\\~]$\")\n",
    "    url_re = re.compile(r\"\\[([^]]+)\\]\\(%%URL\\)\")\n",
    "    link_re = re.compile(r\"\\[([^]]+)\\]\\(https?://[^\\)]+\\)\")\n",
    "\n",
    "    def __init__(\n",
    "        self, file_path, meta_keys={\"subreddit\": \"section\", \"created_utc\": \"utc\"}\n",
    "    ):\n",
    "        \"\"\"\n",
    "        file_path (unicode / Path): Path to archive or directory of archives.\n",
    "        meta_keys (dict): Meta data keys included in the Reddit corpus, mapped\n",
    "            to their display names in the Prodigy meta.\n",
    "        RETURNS (Reddit): The Reddit loader.\n",
    "        \"\"\"\n",
    "        self.meta = meta_keys\n",
    "        self.file_path = Path(file_path)\n",
    "        if not self.file_path.exists():\n",
    "            raise IOError(f\"Can't find file path: {self.file_path}\")\n",
    "\n",
    "    def __iter__(self):\n",
    "        for file_path in self.iter_files():\n",
    "            with gzip.open(str(file_path), \"rb\") as f:\n",
    "                for line in f:\n",
    "                    line = line.strip()\n",
    "                    if not line:\n",
    "                        continue\n",
    "                    comment = srsly.json_loads(line)\n",
    "                    if self.is_valid(comment):\n",
    "                        text = self.strip_tags(comment[\"body\"])\n",
    "                        yield {\"text\": text, \"meta\": self.get_meta(comment)}\n",
    "\n",
    "    def get_meta(self, item):\n",
    "        return {name: item.get(key, \"n/a\") for key, name in self.meta.items()}\n",
    "\n",
    "    def iter_files(self):\n",
" if not self.file_path.is_dir():\n", | ||
" return [self.file_path]\n", | ||
" yield from self.file_path.glob(\"**/*.gz\")\n", | ||
"\n", | ||
" def strip_tags(self, text):\n", | ||
" text = self.link_re.sub(r\"\\1\", text)\n", | ||
" text = text.replace(\">\", \">\").replace(\"<\", \"<\")\n", | ||
" text = self.pre_format_re.sub(\"\", text)\n", | ||
" text = self.post_format_re.sub(\"\", text)\n", | ||
" text = re.sub(r\"\\s+\", \" \", text)\n", | ||
" return text.strip()\n", | ||
"\n", | ||
" def is_valid(self, comment):\n", | ||
" return (\n", | ||
" comment[\"body\"] is not None\n", | ||
" and comment[\"body\"] != \"[deleted]\"\n", | ||
" and comment[\"body\"] != \"[removed]\"\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"stream = Reddit(INPUT_DATA)\n", | ||
"srsly.write_jsonl(OUTPUT_FILE, stream)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.7.2 64-bit ('.env': venv)", | ||
"language": "python", | ||
"name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
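As a quick sanity check after running the first notebook, you can stream a record back out of the JSONL file it writes (a minimal sketch; the example record in the comment is hypothetical and depends on your raw data):

```python
import srsly

# Peek at the first cleaned record without loading the whole file into memory
for eg in srsly.read_jsonl("./reddit.jsonl"):
    print(eg)  # e.g. {"text": "Try brining the chicken overnight.", "meta": {"section": "Cooking", "utc": 1325376000}}
    break
```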
@@ -0,0 +1,100 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 02: Process text and counts\n",
    "\n",
    "This script uses a pretrained [spaCy](https://spacy.io) model to extract entities from JSONL-formatted data and count them. It expects each record to have a `\"meta\"` dict with a `\"utc\"` value containing the UTC timestamp. Counts are aggregated per month for each entity and saved out as a CSV. For example:\n",
    "\n",
    "```csv\n",
    ",2012-01,2012-02\n",
    "meat,1011.0,873.0\n",
    "salt,805.0,897.0\n",
    "chicken,694.0,713.0\n",
    "```\n",
    "\n",
    "> ⚠️ **Important note:** If you have a lot of data, you probably want to split up your raw data and run multiple jobs in parallel. The next script, which calculates the final counts and variance, can take a directory of `.csv` files as its input, so reconciling the counts afterwards is no problem."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SPACY_MODEL = \"./food_model\"  # path to spaCy model with entity recognizer\n",
    "DATA_FILE = \"./reddit.jsonl\"  # preprocessed Reddit data created in previous step\n",
    "OUTPUT_FILE = \"./raw_counts.csv\"  # path to output file\n",
    "N_PROCESSES = 16  # number of processes for multiprocessing\n",
    "ENTITY_LABEL = \"INGRED\"  # label of entity to count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install spacy srsly pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datetime import datetime\n",
    "\n",
    "import pandas as pd\n",
    "import spacy\n",
    "import srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "counts = defaultdict(Counter)\n",
    "nlp = spacy.load(SPACY_MODEL)\n",
    "data = srsly.read_jsonl(DATA_FILE)\n",
    "\n",
    "data_tuples = ((eg[\"text\"], eg) for eg in data)\n",
"for doc, eg in nlp.pipe(data_tuples, as_tuples=True, n_process=N_PROCESSES):\n", | ||
" timestamp = int(eg[\"meta\"][\"utc\"])\n", | ||
" year_month = datetime.utcfromtimestamp(timestamp).strftime(\"%Y-%m\")\n", | ||
" for ent in doc.ents:\n", | ||
" if ent.label_ == ENTITY_LABEL:\n", | ||
" counts[ent.lower_][year_month] += 1\n", | ||
"\n", | ||
"df = pd.DataFrame(data=counts).transpose()\n", | ||
"df.to_csv(OUTPUT_FILE)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.7.2 64-bit ('.env': venv)", | ||
"language": "python", | ||
"name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
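Picking up the note above about splitting the raw data and running multiple jobs in parallel: if each job writes its own CSV of raw counts, the partial tables can be summed back into one before the next step. A minimal sketch, assuming the partial files live in a hypothetical `./partial_counts` directory:

```python
from functools import reduce
from pathlib import Path

import pandas as pd

# Load each partial counts table: entities as rows, months as columns
frames = [pd.read_csv(p, index_col=0) for p in Path("./partial_counts").glob("*.csv")]
# Sum the tables, aligning on entity and month and treating missing cells as zero
merged = reduce(lambda a, b: a.add(b, fill_value=0), frames)
merged.to_csv("./raw_counts.csv")
```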