# To do 
- [ ] Add translation of the input phrase and of the words table
- [ ] Add text to speech method
- [ ] Add words learned attribute

# Environment

In [1]:
# web scraping 
import requests
from bs4 import BeautifulSoup
# nlp library
import spacy
# de, en nlp models
import de_core_news_sm, en_core_web_sm
# data wrangling
import pandas as pd
# translation
from google.cloud import translate_v2
# create google application credentials
import os
# text to speech
from gtts import gTTS
from IPython.display import Audio

# Class

In [43]:
class LearningGerman:
    """Translate a German or English text and tabulate its words.

    On construction the instance creates a ``translate_v2`` client, asks it
    for the language of ``text`` and builds ``table``: a DataFrame with one
    row per non-punctuation token (token text, lowercased lemma, coarse
    part-of-speech tag and the explained fine-grained tag).

    Only German ('de') and English ('en') are handled; any other detected
    language raises ``ValueError``.
    """

    # languages for which both a spaCy model and a translation target exist
    _SUPPORTED_LANGUAGES = ('de', 'en')

    def __init__(self, text='testen'):
        """Detect the language of ``text`` and build its word table.

        Args:
            text: the German or English phrase to work with.

        Raises:
            ValueError: if the detected language is neither 'de' nor 'en'.
        """
        # overwrites any value already present in the environment
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.projdata/credentials.json'
        self.client = translate_v2.Client()
        self.text = text
        self.language = self.client.detect_language(self.text)['language']
        # model name forwarded to every translate() request
        self.model = 'nmt'
        self.table = self.nlp()

    def _require_supported_language(self):
        """Raise ValueError unless ``self.language`` is 'de' or 'en'."""
        if self.language not in self._SUPPORTED_LANGUAGES:
            raise ValueError(
                "Unsupported language %r: only 'de' and 'en' are handled."
                % (self.language,)
            )

    def translate(self):
        """Return ``self.text`` translated into the other supported language.

        German text is translated to English and English text to German.

        Raises:
            ValueError: if ``self.language`` is neither 'de' nor 'en'.
        """
        # fail early with a clear message instead of an unbound local below
        self._require_supported_language()
        to = 'en' if self.language == 'de' else 'de'
        response = self.client.translate(
            self.text,
            source_language=self.language,
            target_language=to,
            model=self.model,
        )
        return response['translatedText']

    def nlp(self):
        """Return a DataFrame describing every non-punctuation token of the text.

        The first column is named 'deutsche' or 'english' after the text's
        language and holds the token text; the remaining columns are
        'lemma', 'pos' and 'details'.

        Raises:
            ValueError: if ``self.language`` is neither 'de' nor 'en'.
        """
        self._require_supported_language()
        if self.language == 'de':
            nlp = de_core_news_sm.load()
            column_names = ['deutsche']
        else:
            nlp = en_core_web_sm.load()
            column_names = ['english']
        column_names.extend(['lemma', 'pos', 'details'])
        rows = [
            (token.text, token.lemma_.lower(), token.pos_, spacy.explain(token.tag_))
            for token in nlp(self.text)
        ]
        words_table = pd.DataFrame(rows, columns=column_names)
        self.drop_punct(words_table)
        return words_table

    @staticmethod
    def drop_punct(table):
        """Remove, in place, every row of ``table`` whose 'pos' is 'PUNCT'.

        The index of the remaining rows is left untouched. Returns None.
        """
        # exact comparison: no regex matching, and safe on an empty table
        punct_rows = table.index[table['pos'] == 'PUNCT']
        if len(punct_rows):
            table.drop(punct_rows, inplace=True)

# Functions

## Text to speech

In [46]:
def de_say(txt, language='de', slowmode=False):
    """Turn ``txt`` into speech with gTTS and return an audio player for it.

    Args:
        txt: the text to be spoken.
        language: language code handed to gTTS (German by default).
        slowmode: forwarded to gTTS as its ``slow`` option.

    Returns:
        An ``IPython.display.Audio`` object for the saved file, created with
        autoplay disabled.
    """
    # the same file name is reused on every call
    output_file = 'speech.mp3'
    gTTS(txt, lang=language, slow=slowmode).save(output_file)
    return Audio(output_file, autoplay=False)

## Translation

In [47]:
# tell the Google client where its credentials file lives
# (the path is relative to the current working directory)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.projdata/credentials.json'

In [55]:
def translate(txt, target='en', src='en'):
    """Translate ``txt`` from language ``src`` into language ``target``.

    A new ``translate_v2`` client is created on every call and the request
    is made with the 'nmt' model.

    Returns:
        The 'translatedText' entry of the client's response.
    """
    response = translate_v2.Client().translate(
        txt,
        source_language=src,
        target_language=target,
        model='nmt',
    )
    return response['translatedText']

In [59]:
def de_to_en(txt):
    """Print the English translation of a German text and return its word table.

    Args:
        txt: the German text.

    Returns:
        The DataFrame produced by ``process_de(txt)``.
    """
    print(translate(txt, target='en', src='de'))
    # return the table built for this text; the result of process_de() used to
    # be thrown away while an undefined global `words_table` was returned
    return process_de(txt)

In [57]:
def en_to_de(txt):
    """Print the German translation of the English text ``txt``."""
    german = translate(txt, src='en', target='de')
    print(german)

## NLP

In [13]:
def process_de(txt):
    """Run the German spaCy model over ``txt`` and tabulate its tokens.

    Returns:
        A DataFrame with one row per token and the columns 'deutsche'
        (token text), 'lemma' (lowercased lemma) and 'details' (the
        explanation ``spacy.explain`` gives for the token's tag).
    """
    # the model is loaded anew on every call
    german_pipeline = de_core_news_sm.load()
    rows = [
        (token.text, token.lemma_.lower(), spacy.explain(token.tag_))
        for token in german_pipeline(txt)
    ]
    return pd.DataFrame(rows, columns=['deutsche', 'lemma', 'details'])

In [14]:
# quick check: tabulate the tokens of a short German sentence
process_de('Ich bin ein Junge')

Unnamed: 0,deutsche,lemma,details
0,Ich,ich,non-reflexive personal pronoun
1,bin,sein,"finite verb, auxiliary"
2,ein,einen,definite or indefinite article
3,Junge,junge,"noun, singular or mass"


# Web Scraping

In [3]:
# creating a HTTP request to extract the HTML code that contains the phrases
url = 'https://www.rosettastone.com/languages/german-phrases/'
# the timeout keeps the cell from hanging forever if the server never answers
page = requests.get(url, timeout=10)
# checking the status of the request
if page.status_code == 200:
  print('OK!')
else:
  # report the status code that was actually received, not always 404
  print(page.status_code, 'ERROR!')

OK!


In [4]:
# parse the downloaded page and keep the first <ul> element it contains
page_content = BeautifulSoup(page.content, features='html.parser')
phrase_html_list = page_content.find('ul')
phrase_html_list

<ul>
<li>Guten Tag = Good morning</li>
<li>Hallo = Hello</li>
<li>Ich heiße … = My name is …</li>
<li>Sprechen Sie Englisch? = Do you speak English?</li>
<li>Wie heißt du? = What’s your name?</li>
<li>Wie geht es dir? = How are you?</li>
<li>Gut, danke = Fine, thank you</li>
<li>Nett, Sie kennen zu lernen = Nice to meet you</li>
<li>Tisch für zwei bitte = Table for two, please</li>
<li>Wo ist die Toilette? = Where is the bathroom?</li>
<li>Danke = Thank you</li>
<li>Wie komme ich zu …? = How can I get to …?</li>
<li>Gibt es ein Restaurant in der Nähe? = Is there a restaurant nearby?</li>
<li>Ich liebe dich = I love you</li>
<li>Wie viel kostet das …? = How much is this …?</li>
<li>Es tut mir leid, ich verstehe das nicht = Sorry, I don’t understand</li>
<li>Haben Sie noch Zimmer frei? = Do you have any rooms available?</li>
<li>Auf Wiedersehen = Goodbye</li>
</ul>

In [5]:
# each item reads '<German> = <English>'; keep only what precedes the first ' = '
deutsche_list = [
  phrase.partition(' = ')[0]
  for phrase in phrase_html_list.stripped_strings
]
deutsche_list

['Guten Tag',
 'Hallo',
 'Ich heiße …',
 'Sprechen Sie Englisch?',
 'Wie heißt du?',
 'Wie geht es dir?',
 'Gut, danke',
 'Nett, Sie kennen zu lernen',
 'Tisch für zwei bitte',
 'Wo ist die Toilette?',
 'Danke',
 'Wie komme ich zu …?',
 'Gibt es ein Restaurant in der Nähe?',
 'Ich liebe dich',
 'Wie viel kostet das …?',
 'Es tut mir leid, ich verstehe das nicht',
 'Haben Sie noch Zimmer frei?',
 'Auf Wiedersehen']

# NLP

In [6]:
# load the German spaCy model once
nlp = de_core_news_sm.load()
# run every scraped phrase through it, keeping the results in phrase order
doc_list = list(map(nlp, deutsche_list))

**deutsche**: The original German text of the word.  
**lemma**: The base form of the word, lowercased.   
**pos**: The simple UPOS part-of-speech tag (list of universal POS tags: https://universaldependencies.org/docs/u/pos/).  
**details**: The detailed part-of-speech tag, explained in plain English by `spacy.explain`.

In [7]:
# dataframe column labels, in the order the values are zipped together below
column_names = ['deutsche', 'lemma', 'pos', 'details']

# collecting the data of the first phrase (doc_list[0]), one list per column
deutsche_text = [word.text for word in doc_list[0]]
lemma = [word.lemma_.lower() for word in doc_list[0]]
pos = [word.pos_ for word in doc_list[0]]
details = [spacy.explain(word.tag_) for word in doc_list[0]]

# creating the words dataframe: one row per word of the phrase
words = pd.DataFrame(data=list(zip(deutsche_text, lemma, pos, details)), columns=column_names)
words

Unnamed: 0,deutsche,lemma,pos,details
0,Guten,guten,ADJ,"adjective, attributive"
1,Tag,tag,NOUN,"noun, singular or mass"


In [8]:
# flag the punctuation rows and, if there are any, drop them from `words` in place
is_punct = words['pos'] == 'PUNCT'
if is_punct.any():
  words.drop(index=words.index[is_punct], inplace=True)
words

Unnamed: 0,deutsche,lemma,pos,details
0,Guten,guten,ADJ,"adjective, attributive"
1,Tag,tag,NOUN,"noun, singular or mass"


# Translation

In [9]:
# NOTE(review): the other cells point this variable at '.projdata/credentials.json';
# confirm which of the two paths is the right one
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'

# the notebook imports `translate_v2` (and `translate` is a helper function here),
# so the client has to be created from translate_v2
translate_client = translate_v2.Client()

# translate each German word on its own and store the results in a new column
translated_words = [
  translate_client.translate(word, target_language='en', source_language='de', model='nmt')['translatedText']
  for word in words['deutsche']
]
words['english'] = translated_words
words

Unnamed: 0,deutsche,lemma,pos,details,english
0,Guten,guten,ADJ,"adjective, attributive",Good ones
1,Tag,tag,NOUN,"noun, singular or mass",Day
