In [1]:
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Location of the parquet-index jar, resolved against the current working
# directory (so the notebook must be launched from the project root).
parquet_pos = os.path.join(
    os.getcwd(),
    "data/wiktionary/parquet-index_2.11-0.4.1-SNAPSHOT.jar",
)

# Build (or reuse) the Spark session with Arrow-based pandas conversion
# switched on and the parquet-index jar listed under `spark.jars`.
spark = (
    SparkSession.builder
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.jars", parquet_pos)
    .getOrCreate()
)

# The same jar is additionally registered through addPyFile.
# NOTE(review): presumably this exposes Python bindings bundled inside the
# jar to the driver/executors -- confirm against the parquet-index docs.
spark.sparkContext.addPyFile(parquet_pos)

In [2]:
# Load the Wiktionary dump (senses, examples, quotations) from JSON.
# The path is relative to the working directory; the schema is inferred by Spark.
wiktionary_df = spark.read.json(
    "data/wiktionary/senses_examples_quotations_v2.json"
)

In [3]:
from pyspark.sql.functions import *

# Inspect the entry for the word "run": unroll its senses, then each sense's
# subsenses, and collect the glosses of every subsense onto the driver.
(
    wiktionary_df
    .where('word = "run"')
    .select(explode('senses').alias('sense'))
    .select(explode('sense.subsenses').alias('subsense'))
    .select('subsense.glosses')
    .collect()
)

[Row(glosses=['(intransitive) To move forward quickly upon two feet by alternately making a short jump off either foot. (Compare walk.)']),
 Row(glosses=['(intransitive) To go at a fast pace, to move quickly.']),
 Row(glosses=['(transitive) To cause to move quickly or lightly.']),
 Row(glosses=['(transitive) To transport someone or something, notionally at a brisk pace.']),
 Row(glosses=['(transitive or intransitive) To compete in a race.']),
 Row(glosses=['(intransitive) Of fish, to migrate for spawning.']),
 Row(glosses=['(American football transitive or intransitive) To carry (a football) down the field, as opposed to passing or kicking.']),
 Row(glosses=['(transitive) To achieve or perform by running or as if by running.']),
 Row(glosses=['(intransitive) To flee from a danger or towards help.']),
 Row(glosses=['(figurative transitive) To go through without stopping, usually illegally.']),
 Row(glosses=['(transitive juggling colloquial) To juggle a pattern continuously, as opposed t

In [6]:
# Peek at ten senses that actually carry quotations: one row per sense,
# keeping only the `quotations` field and skipping senses where it is null.
(
    wiktionary_df
    .select(explode('senses').alias('sense'))
    .select('sense.quotations')
    .where(col('sense.quotations').isNotNull())
    .limit(10)
    .collect()
)

[Row(quotations=[[Row(author='Jeremy Taylor', day=None, month=None, page=None, publisher=None, tag=' (song)', text="Ag pleez Deddy won't you take us to the wrestling / We wanna see an ou called Sky High Lee", title='{{w|Ag Pleez Deddy}}', year='1962'), Row(author='André Brink', day=None, month=None, page='p. 88:', publisher='Vintage 1998', tag='', text="‘Ag, fuck it,' he said. ‘Let bygones be bygones, man.'", title='A Dry White Season', year='1979'), Row(author='Nelson Mandela', day=None, month=None, page='p. 491:', publisher='Abacus 2010', tag='', text="Finally, after placing four books on the desk, he turned to a sheepish Kathy and said, ‘Ag, there's nothing wrong with these desks,' and walked out.", title='Long Walk to Freedom', year='1994')]]),
 Row(quotations=[[Row(author='John Milton', day=None, month=None, page=None, publisher='lines 203-205', tag='', text='Am I not sung and proverbed for a fool / In every street, do they not say, "How well / Are come upon him his deserts?"', ti

## Count examples

In [14]:
# Not nested: examples attached directly to a sense (senses[].examples[]).
# Empty strings are turned into nulls first so that the null-drop removes
# blank entries as well before counting.
# NOTE(review): assumes `example` is a plain string column -- confirm against the schema.
(
    wiktionary_df
    .select(explode("senses").alias("sense"))
    .select(explode("sense.examples").alias("example"))
    .replace("", None)
    .dropna()
    .count()
)

47298

In [15]:
# Nested: examples attached to a subsense (senses[].subsenses[].examples[]).
# Empty strings are turned into nulls first so that the null-drop removes
# blank entries as well before counting.
# NOTE(review): assumes `example` is a plain string column -- confirm against the schema.
(
    wiktionary_df
    .select(explode("senses").alias("sense"))
    .select(explode("sense.subsenses").alias("subsense"))
    .select(explode('subsense.examples').alias('example'))
    .replace("", None)
    .dropna()
    .count()
)

621

In [16]:
# Doubly nested: examples attached to a usage of a subsense
# (senses[].subsenses[].usages[].examples[]).
# Empty strings are turned into nulls first so that the null-drop removes
# blank entries as well before counting.
# NOTE(review): assumes `example` is a plain string column -- confirm against the schema.
(
    wiktionary_df
    .select(explode("senses").alias("sense"))
    .select(explode("sense.subsenses").alias("subsense"))
    .select(explode('subsense.usages').alias('usage'))
    .select(explode('usage.examples').alias('example'))
    .replace("", None)
    .dropna()
    .count()
)

4

In [5]:
from tools.extractors import *

# Run the project's extractors over the raw frame: extract_df first, then
# extract_form on its result.
# NOTE(review): what each helper produces is defined in tools.extractors and
# is not visible here -- check that module for the resulting schema.
wiktionary_df2 = extract_form(
    extract_df(wiktionary_df)
)

# Persist the result as Parquet, replacing any previous output at this path.
wiktionary_df2.write.parquet(
    "data/wiktionary/senses_examples_quotations_v2.parquet",
    mode="overwrite",
)