# German vocabulary

In [67]:
import os
from pathlib import Path

import requests
import toml
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [17]:
# Create the Spark session for this notebook (or reuse one that already exists)
# and lower log verbosity to warnings and above.
spark = (
    SparkSession.builder
    .appName("GermanVocabulary")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")

In [58]:
# Notebook configuration: where downloaded files are stored locally and where
# they are fetched from.
data_folder = "/opt/spark/data/vocabulary/deu"
github_user = 'dpurge'
# Never keep a personal access token in the notebook source: it ends up in
# version control and in shared copies. Export GITHUB_TOKEN before starting the
# kernel; an empty string is used when it is not set.
github_token = os.environ.get("GITHUB_TOKEN", "")
# Built from github_user so the account name is defined in exactly one place.
github_data = f"https://raw.githubusercontent.com/{github_user}/jdp-flashcards/main/src/lang-vocabulary/deu"

In [48]:
# Ensure the local download directory exists; missing parent directories are
# created and an already existing directory is not an error.
Path(data_folder).mkdir(
    exist_ok=True,
    parents=True,
)

In [65]:
# Fetch the deck configuration (jdp-apkg.toml) and download every data file it
# lists that is not already present under data_folder.
request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell indefinitely
github_auth = (github_user, github_token)

cfgdata = requests.get(f'{github_data}/jdp-apkg.toml', auth=github_auth, timeout=request_timeout)
# Stop on an HTTP error instead of feeding an error page to the TOML parser.
cfgdata.raise_for_status()
cfg = toml.loads(cfgdata.text)

for entry in cfg['data']:
    datafile = Path(f'{data_folder}/{entry["filename"]}')
    datafile.parent.mkdir(parents=True, exist_ok=True)
    if datafile.is_file():
        # Already downloaded by an earlier run.
        continue

    # Stream into a temporary sibling file and rename it only after the whole
    # body has been written. An interrupted download therefore never leaves a
    # truncated file that the is_file() check above would accept as complete.
    # The name is dot-prefixed so it is not mistaken for a data file
    # (presumably skipped by Spark's file listing as a hidden file - verify).
    partial = datafile.with_name(f'.{datafile.name}.part')
    try:
        with requests.get(f'{github_data}/{entry["filename"]}', stream=True,
                          auth=github_auth, timeout=request_timeout) as r:
            r.raise_for_status()
            with partial.open(mode='wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        partial.replace(datafile)
    finally:
        # Only still present when the download or the rename failed.
        if partial.exists():
            partial.unlink()
    print(f'Downloaded: {datafile}')

Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/02.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/03.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/04.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/05.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/06.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/07.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/08.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/09.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/10.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/11.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/12.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/13.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/14.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/15.csv
Downloaded: /opt/spark/data/vocabulary/deu/gute-reise-1982/16.csv
Downloaded

In [71]:
# Column layout of the vocabulary files. Phrase and Translation are declared
# non-nullable; the remaining columns are optional.
# NOTE(review): Spark file sources may treat every column as nullable when
# reading, regardless of this declaration - confirm before relying on it.
schema = StructType([
    StructField("Phrase", StringType(), nullable=False),
    StructField("Grammar", StringType(), nullable=True),
    StructField("Transcription", StringType(), nullable=True),
    StructField("Translation", StringType(), nullable=False),
    StructField("Notes", StringType(), nullable=True),
])

In [108]:
# Load every tab-separated file found anywhere below data_folder into a single
# DataFrame using the explicit schema; the first line of each file is a header.
# NOTE(review): the recursive lookup scans the whole tree, so anything else
# written under data_folder is picked up as input as well.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .options(header=True, recursiveFileLookup="true", delimiter="\t")
    .load(data_folder)
)
print(f'Number of records: {df.count()}')

Number of records: 1876


In [109]:
# Collapse rows that are identical in every column, then report how many remain.
df = df.dropDuplicates()
distinct_total = df.count()
print(f'Number of distinct records: {distinct_total}')

Number of distinct records: 1609


In [114]:
# Phrases that still occur on more than one row after exact-row deduplication,
# i.e. the same phrase with differing values in some other column.
indexDuplicates = (
    df.groupBy("Phrase")
    .count()
    .filter("count > 1")
    .select("Phrase")
)

# The semi join keeps only the rows of df whose Phrase is in that list, without
# adding columns; sorting places the conflicting rows next to each other.
duplicates = df.join(indexDuplicates, on="Phrase", how="left_semi").orderBy("Phrase")
duplicates.show()

+-------------+-------+-------------+--------------------+--------------------+
|       Phrase|Grammar|Transcription|         Translation|               Notes|
+-------------+-------+-------------+--------------------+--------------------+
|    Bis bald!|   null|         null|           Na razie!|                null|
|    Bis bald!|   null|         null|Do zobaczenia wkr...|                null|
| Guten Abend!|   null|         null|      Dobry wieczór!|                null|
| Guten Abend!|   null|         null|      Dobry wieczór!|    od godziny 18-ej|
|Guten Morgen!|   null|         null|        Dzień dobry!|                rano|
|Guten Morgen!|   null|         null|        Dzień dobry!|od rana do godzin...|
| Platz nehmen|   null|         null|siadać; zajmować ...|                null|
| Platz nehmen|   null|         null|       zająć miejsce|                null|
|      Tschüs!|   null|         null|           Na razie!|                null|
|      Tschüs!|   null|         null|   

In [122]:
# Export the conflicting rows as one tab-separated file with a header row.
# The report goes to a sibling of data_folder rather than inside it: data_folder
# is read with recursiveFileLookup, so a report stored below it would be loaded
# as vocabulary input the next time the notebook runs.
# A 'duplicates' folder left inside data_folder by earlier runs should be removed.
duplicates_folder = f'{data_folder.rstrip("/")}-duplicates'
(
    duplicates.coalesce(1)
    .write
    .mode('overwrite')
    .options(header='True', delimiter="\t")
    .option("emptyValue", "")
    .csv(duplicates_folder)
)