Permalink
Browse files

add 1k greek corpus and conversion

  • Loading branch information...
kylepjohnson committed Jul 13, 2017
1 parent ed7bb85 commit 434458aa7d7176387d2433d69bd94de4f8ddc410
Showing with 282 additions and 204 deletions.
  1. +238 −7 2 Import corpora.ipynb
  2. +44 −197 3 Basic NLP.ipynb
View
@@ -33,7 +33,9 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# See https://github.com/cltk for all official corpora\n",
@@ -105,7 +107,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -121,16 +123,17 @@
" 'greek_lexica_perseus',\n",
" 'greek_training_set_sentence_cltk',\n",
" 'greek_word2vec_cltk',\n",
" 'greek_text_lacus_curtius']"
" 'greek_text_lacus_curtius',\n",
" 'greek_text_first1kgreek']"
]
},
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's get a Greek corpus, too\n",
"# Let's get some Greek corpora, too\n",
"\n",
"my_greek_downloader = CorpusImporter('greek')\n",
"my_greek_downloader.list_corpora"
@@ -139,7 +142,9 @@
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_greek_downloader.import_corpus('greek_text_lacus_curtius')"
@@ -152,6 +157,230 @@
"Likewise, verify with `ls -l ~/cltk_data/greek/text/greek_text_lacus_curtius/plain/`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded 5% 2.22 MiB | 4.15 MiB/s \r",
"Downloaded 6% 2.22 MiB | 4.15 MiB/s \r",
"Downloaded 7% 2.22 MiB | 4.15 MiB/s \r",
"Downloaded 8% 2.22 MiB | 4.15 MiB/s \r",
"Downloaded 8% 2.22 MiB | 4.15 MiB/s \r",
"Downloaded 9% 2.22 MiB | 4.15 MiB/s \r",
"Downloaded 10% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 11% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 12% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 13% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 14% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 15% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 16% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 17% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 18% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 19% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 20% 5.62 MiB | 5.43 MiB/s \r",
"Downloaded 21% 14.47 MiB | 9.42 MiB/s \r",
"Downloaded 22% 14.47 MiB | 9.42 MiB/s \r",
"Downloaded 23% 14.47 MiB | 9.42 MiB/s \r",
"Downloaded 24% 14.47 MiB | 9.42 MiB/s \r",
"Downloaded 24% 17.04 MiB | 8.32 MiB/s \r",
"Downloaded 25% 17.04 MiB | 8.32 MiB/s \r",
"Downloaded 26% 17.04 MiB | 8.32 MiB/s \r",
"Downloaded 27% 17.04 MiB | 8.32 MiB/s \r",
"Downloaded 28% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 29% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 30% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 30% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 31% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 32% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 33% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 34% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 35% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 36% 19.54 MiB | 7.33 MiB/s \r",
"Downloaded 37% 23.64 MiB | 7.33 MiB/s \r",
"Downloaded 38% 24.18 MiB | 6.49 MiB/s \r",
"Downloaded 38% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 39% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 40% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 41% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 42% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 43% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 44% 26.77 MiB | 6.18 MiB/s \r",
"Downloaded 44% 29.04 MiB | 5.83 MiB/s \r",
"Downloaded 45% 29.04 MiB | 5.83 MiB/s \r",
"Downloaded 46% 32.96 MiB | 5.45 MiB/s \r",
"Downloaded 46% 32.96 MiB | 5.45 MiB/s \r",
"Downloaded 47% 34.78 MiB | 4.05 MiB/s \r",
"Downloaded 48% 34.78 MiB | 4.05 MiB/s \r",
"Downloaded 49% 34.78 MiB | 4.05 MiB/s \r",
"Downloaded 49% 37.17 MiB | 4.02 MiB/s \r",
"Downloaded 50% 37.17 MiB | 4.02 MiB/s \r",
"Downloaded 51% 39.68 MiB | 4.07 MiB/s \r",
"Downloaded 51% 39.68 MiB | 4.07 MiB/s \r",
"Downloaded 52% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 53% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 54% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 55% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 56% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 57% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 58% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 59% 42.10 MiB | 3.68 MiB/s \r",
"Downloaded 60% 44.94 MiB | 4.14 MiB/s \r",
"Downloaded 61% 44.94 MiB | 4.14 MiB/s \r",
"Downloaded 62% 44.94 MiB | 4.14 MiB/s \r",
"Downloaded 62% 44.94 MiB | 4.14 MiB/s \r",
"Downloaded 63% 46.48 MiB | 3.99 MiB/s \r",
"Downloaded 64% 46.48 MiB | 3.99 MiB/s \r",
"Downloaded 64% 50.14 MiB | 4.04 MiB/s \r",
"Downloaded 65% 50.14 MiB | 4.04 MiB/s \r",
"Downloaded 65% 52.54 MiB | 4.17 MiB/s \r",
"Downloaded 66% 55.90 MiB | 4.08 MiB/s \r",
"Downloaded 66% 57.35 MiB | 3.87 MiB/s \r",
"Downloaded 67% 59.61 MiB | 3.81 MiB/s \r",
"Downloaded 67% 59.61 MiB | 3.81 MiB/s \r",
"Downloaded 68% 63.14 MiB | 3.48 MiB/s \r",
"Downloaded 68% 63.14 MiB | 3.48 MiB/s \r",
"Downloaded 69% 65.30 MiB | 3.52 MiB/s \r",
"Downloaded 70% 65.30 MiB | 3.52 MiB/s \r",
"Downloaded 70% 69.03 MiB | 3.83 MiB/s \r",
"Downloaded 71% 70.43 MiB | 3.62 MiB/s \r",
"Downloaded 72% 70.43 MiB | 3.62 MiB/s \r",
"Downloaded 72% 70.43 MiB | 3.62 MiB/s \r",
"Downloaded 73% 72.05 MiB | 3.57 MiB/s \r",
"Downloaded 74% 72.05 MiB | 3.57 MiB/s \r",
"Downloaded 75% 74.07 MiB | 3.66 MiB/s \r",
"Downloaded 76% 76.26 MiB | 3.80 MiB/s \r",
"Downloaded 76% 76.26 MiB | 3.80 MiB/s \r",
"Downloaded 77% 78.72 MiB | 3.79 MiB/s \r",
"Downloaded 77% 78.72 MiB | 3.79 MiB/s \r",
"Downloaded 78% 78.72 MiB | 3.79 MiB/s \r",
"Downloaded 79% 78.72 MiB | 3.79 MiB/s \r",
"Downloaded 80% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 81% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 82% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 83% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 84% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 85% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 86% 85.18 MiB | 4.70 MiB/s \r",
"Downloaded 87% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 88% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 89% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 89% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 90% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 91% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 92% 92.76 MiB | 6.02 MiB/s \r",
"Downloaded 93% 102.39 MiB | 7.76 MiB/s \r",
"Downloaded 94% 102.39 MiB | 7.76 MiB/s \r",
"Downloaded 95% 113.15 MiB | 9.19 MiB/s \r",
"Downloaded 95% 113.15 MiB | 9.19 MiB/s \r",
"Downloaded 96% 114.78 MiB | 9.33 MiB/s \r",
"Downloaded 97% 114.78 MiB | 9.33 MiB/s \r",
"Downloaded 98% 123.62 MiB | 10.40 MiB/s \r",
"Downloaded 98% 125.96 MiB | 9.94 MiB/s \r",
"Downloaded 98% 125.96 MiB | 9.94 MiB/s \r",
"Downloaded 98% 136.67 MiB | 11.52 MiB/s \r",
"Downloaded 99% 138.21 MiB | 10.27 MiB/s \r",
"Downloaded 99% 146.75 MiB | 8.54 MiB/s \r",
"Downloaded 99% 146.75 MiB | 8.54 MiB/s \r",
"Downloaded 99% 153.79 MiB | 7.57 MiB/s \r",
"Downloaded 100% 157.59 MiB | 6.26 MiB/s \r",
"Downloaded 100% 160.82 MiB | 5.18 MiB/s \r",
"Downloaded 100% 163.52 MiB | 5.21 MiB/s \r"
]
}
],
"source": [
"my_greek_downloader.import_corpus('greek_text_first1kgreek')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 2176\r\n",
"-rw-r--r-- 1 root root 126919 Jul 13 10:05 Committing Issues using GitHub.docx\r\n",
"-rwxr-xr-x 1 root root 1889 Jul 13 10:05 cselstats.pl\r\n",
"drwxr-xr-x 118 root root 4096 Jul 13 10:05 data\r\n",
"-rwxr-xr-x 1 root root 1955024 Jul 13 10:05 #gelasius-kg.xml#\r\n",
"-rwxr-xr-x 1 root root 2414 Jul 13 10:05 greek-justwork.txt\r\n",
"-rwxr-xr-x 1 root root 3249 Jul 13 10:05 greek.txt\r\n",
"-rwxr-xr-x 1 root root 19777 Jul 13 10:05 Greek-works.txt\r\n",
"-rw-r--r-- 1 root root 19125 Jul 13 10:05 license.md\r\n",
"-rw-r--r-- 1 root root 58346 Jul 13 10:05 new_edition_metadata.csv\r\n",
"-rw-r--r-- 1 root root 697 Jul 13 10:05 pages.sh\r\n",
"-rwxr-xr-x 1 root root 1901 Jul 13 10:05 pnumber.xsl\r\n",
"-rw-r--r-- 1 root root 1658 Jul 13 10:05 README.md\r\n",
"drwxr-xr-x 2 root root 4096 Jul 13 10:05 save\r\n",
"drwxr-xr-x 48 root root 4096 Jul 13 10:05 split\r\n",
"drwxr-xr-x 2 root root 4096 Jul 13 10:05 volume_xml\r\n"
]
}
],
"source": [
"!ls -l ~/cltk_data/greek/text/greek_text_first1kgreek/"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert TEI XML corpus\n",
"\n",
"Here we'll convert the 1K Years' Greek corpus from TEI XML to plaintext"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from cltk.corpus.greek.tei import onekgreek_tei_xml_to_text"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#! If you get the following error: 'Install `bs4` and `lxml` to parse these TEI files.'\n",
"# then run: `pip install bs4 lxml`\n",
"\n",
"onekgreek_tei_xml_to_text()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"677\r\n"
]
}
],
"source": [
"# count the converted plaintext files:\n",
"!ls -l ~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/ | wc -l"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -184,7 +413,9 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_greek_downloader.import_corpus('tlg', '~/cltk/corpora/TLG_E/')"
Oops, something went wrong.

0 comments on commit 434458a

Please sign in to comment.