# Install necessary packages

In [1]:
# !pip install --upgrade pip
# !pip install bs4
# !pip install requests
# !pip install html5lib
# !pip install lxml
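# !pip install selenium==4.8.0
# !pip install undetected-chromedriver  # provides the `undetected_chromedriver` module imported below
# !pip install webdriver-manager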
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn


In [3]:
# Page to scrape; this URL is loaded by the Chrome driver in the next cell.
url = "https://www.turkishclass101.com/blog/2021/06/10/best-turkish-proverbs/#1"


In [4]:
import undetected_chromedriver as uc
from bs4 import BeautifulSoup

# Custom user agent string handed to Chrome through the "user-agent" argument.
my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"

# Chrome options: run headless and override the user agent.
options = uc.ChromeOptions()
options.add_argument("--headless")
options.add_argument(f"user-agent={my_user_agent}")

# Start Chrome through undetected_chromedriver with the options above.
driver = uc.Chrome(options=options)

# Load the target page. If navigation fails, shut the browser down before
# re-raising so a failed run does not leave a Chrome process behind.
try:
    driver.get(url=url)
except Exception:
    driver.quit()
    raise

# The driver is intentionally left open on success: a later cell reads
# driver.page_source. Call driver.quit() once the page source has been parsed.
# driver.quit()


In [5]:
# Parse the HTML currently held by the browser (driver.page_source) using the 'html.parser' backend.
# The driver created in the previous cell must still be open when this runs.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [6]:
# Collect every <figure class="wp-block-table"> element on the page; the resulting list is what extract_idioms() consumes.
idioms_table = soup.find_all('figure', class_='wp-block-table')

In [7]:
# Sample markup of a single idiom table (kept for reference).
# Row 1: the Turkish idiom. The first td is the row-spanning idiom number and the second td is the label, so the idiom is in the third td.
# Row 2: the literal translation, in the second td.
# Row 3: the English equivalent, in the second td.
# Row 4: a single td (colspan="3") holding the explanation, the Turkish example sentence (inside <em>) and its English translation.

html_content = '''
<figure class="wp-block-table">
    <table border="1" cellspacing="0" width="100%">
        <tbody>
            <tr>
                <td rowspan="4">1</td>
                <td width="30%"><strong><span class="has-inline-color has-vivid-purple-color">Turkish</span></strong></td>
                <td><em>Zaman her şeyin ilacıdır.</em></td>
            </tr>
            <tr>
                <td><strong><span class="has-inline-color has-pale-cyan-blue-color">Literally</span></strong></td>
                <td>Time is the medicine of everything.</td>
            </tr>
            <tr>
                <td><strong><span class="has-inline-color has-vivid-cyan-blue-color">Equivalent in English</span></strong></td>
                <td>Time is the best medicine.</td>
            </tr>
            <tr>
                <td colspan="3">
                    As time passes, all the troubles we experience are forgotten or the sorrow we feel decreases.<br/><br/>
                    <u>Example:</u><br/><br/>
                    <em>Üzülme, bugünler de geçecek; zaman her şeyin ilacıdır.</em><br/><br/>
                    “Don’t worry, these days will be over, too; time is the best medicine.”
                </td>
            </tr>
        </tbody>
    </table>
</figure>
'''



In [39]:
import re
import pandas as pd
# Do not truncate long cell text when DataFrames are displayed (sentences in this notebook are long).
pd.set_option('display.max_colwidth', None)

# Column order of the DataFrame returned by extract_idioms.
_IDIOM_COLUMNS = ['idioms', 'Literals', 'equivalents', 'explanations', 'tr_sentences', 'en_sentences']


def _cell_text(cells, position):
    """Return the stripped text of cells[position], or None if the row has no such cell."""
    if position >= len(cells):
        return None
    return cells[position].text.strip()


def _parse_details_cell(cell_html):
    """Split the HTML of a details cell into (explanation, tr_sentence, en_sentence).

    The cell is expected to look like
    ``<td ...>explanation<br/><br/>...<em>Turkish sentence</em><br/><br/>English sentence</td>``.
    Each piece is returned with surrounding whitespace removed; a piece whose
    delimiters are not present is returned as None.
    """
    paragraph_break = '<br/><br/>'

    # Drop the opening <td ...> tag and everything from the closing </td> onwards.
    inner = cell_html[cell_html.find('>') + 1:]
    closing = inner.find('</td>')
    if closing != -1:
        inner = inner[:closing]

    # The explanation is everything before the first paragraph break.
    explanation = inner.partition(paragraph_break)[0].strip() or None

    # The Turkish example sentence is the content of the first <em> element.
    em_open = inner.find('<em>')
    em_close = inner.find('</em>')
    tr_sentence = None
    if 0 <= em_open < em_close:
        tr_sentence = inner[em_open + len('<em>'):em_close].strip()

    # The English sentence follows the paragraph break right after </em>.
    en_marker = '</em>' + paragraph_break
    en_start = inner.find(en_marker)
    en_sentence = None
    if en_start != -1:
        en_sentence = inner[en_start + len(en_marker):].strip()

    return explanation, tr_sentence, en_sentence


def extract_idioms(idioms_table):
    """
    Extracts information from a list of idiom tables.

    One record is built per table, so every field of a row always comes from
    the same table. A field that a table does not provide is None instead of
    shifting the values of later tables. Tables without any <tr> are skipped.

    Expected row layout of a table:
        - row 1: the Turkish idiom, in the third td.
        - row 2: the literal translation, in the second td.
        - row 3: the English equivalent, in the second td, when the first td
          contains "Equivalent in English"; otherwise this row is treated as
          the details row and the equivalent is None.
        - row 4: the details row, whose first td holds the explanation, the
          Turkish example sentence and the English example sentence.

    Parameters:
    idioms_table (list): A list of BeautifulSoup objects representing idiom tables.

    Returns:
    pandas.DataFrame: One row per table with the columns, in this order:
        - idioms: The Turkish idiom.
        - Literals: The literal translation of the idiom.
        - equivalents: The English equivalent of the idiom (None if absent).
        - explanations: The explanation of the idiom (raw inner HTML up to the first break).
        - tr_sentences: The example sentence in Turkish.
        - en_sentences: The example sentence in English.
    """
    records = []

    for idiom_table in idioms_table:
        rows = [tr.find_all('td') for tr in idiom_table.find_all('tr')]
        if not rows:
            continue

        record = dict.fromkeys(_IDIOM_COLUMNS)
        record['idioms'] = _cell_text(rows[0], 2)
        if len(rows) > 1:
            record['Literals'] = _cell_text(rows[1], 1)

        details_cells = None
        if len(rows) > 2:
            third_row = rows[2]
            label = _cell_text(third_row, 0) or ''
            if "Equivalent in English" in label:
                record['equivalents'] = _cell_text(third_row, 1)
                if len(rows) > 3:
                    details_cells = rows[3]
            else:
                # No equivalent row: the third row already is the details row.
                details_cells = third_row

        if details_cells:
            explanation, tr_sentence, en_sentence = _parse_details_cell(str(details_cells[0]))
            record['explanations'] = explanation
            record['tr_sentences'] = tr_sentence
            record['en_sentences'] = en_sentence

        records.append(record)

    return pd.DataFrame(records, columns=_IDIOM_COLUMNS)




In [40]:
# Sanity check: number of table figures found on the page.
len(idioms_table)

30

In [41]:
# Parse every table figure into one DataFrame and display it.
df = extract_idioms(idioms_table)
df

Unnamed: 0,idioms,Literals,equivalents,explanations,tr_sentences,en_sentences
0,Zaman her şeyin ilacıdır.,Time is the medicine of everything.,Time is the best medicine.,"As time passes, all the troubles we experience are forgotten or the sorrow we feel decreases.","Üzülme, bugünler de geçecek; zaman her şeyin ilacıdır.","“Don’t worry, these days will be over, too; time is the best medicine.”"
1,"Sakla samanı, gelir zamanı.","Save the hay, its time will come.",Keep a thing seven years and you’ll find a use for it.,"If you hold onto something you have for long enough, it will eventually become useful.","İyi ki kızımın bebek arabasını saklamışım, şimdi senin çok işine yarayacak. Eee, sakla samanı gelir zamanı.","“Fortunately, I saved my daughter’s stroller. It will be very useful for you now. See, keep a thing for seven years and you’ll find a use for it.”"
2,Vakit nakittir.,Time is cash.,Time is money.,This proverb emphasizes that time is a valuable resource.,"Bir an önce işe gitmeliyim. Eee, ne de olsa vakit nakittir.","“I have to go to work as soon as possible. Well, after all, time is money.”"
3,Bugünün işini yarına bırakma.,Don’t leave today’s work for tomorrow.,Never put off till tomorrow what you can do today.,This one emphasizes that one should not delay doing something that can be done today.,Ödevimi yarın yaparım deyince babam bugünün işini yarına bırakma dedi.,"“When I said I would do my homework tomorrow, my father said ‘Never put off till tomorrow what you can do today.’ “"
4,Sona kalan dona kalır.,"The one who stays the last, is left for the frost.",The devil takes the hindmost.,The people who lag behind will either lose or not have any benefits.,"Ali amca çocuklara şeker veriyor, koşun; sona kalan dona kalır.","“Uncle Ali is giving candy to the children, run; the devil takes the hindmost.”"
5,Erken kalkan yol alır.,The one who gets up early proceeds.,The early bird catches the worm.,"This proverb advises that if someone does something immediately (or before anyone else), he/she will have an advantage.","Daha 5 saatlik yolumuz var, artık yola çıksak iyi olur. Ne de olsa, erken kalkan yol alır.","“We have five more hours to go, we’d better get going. After all, the early bird catches the worm.”"
6,Çıkmadık candan umut kesilmez.,"If the person didn’t die, there is still hope.","While there’s life, there’s hope.","If something didn’t fail completely, there is still a chance to save it.","Üzülme, son aday henüz açıklanmadı. Çıkmadın candan umut kesilmez.","“Don’t worry, the last candidate has not been announced yet. While there’s life, there’s hope.”"
7,Gün doğmadan neler doğar.,"Before the sun rises, a lot of things rise.",Tomorrow is another day.,"A person should never lose hope, because nobody knows what’s going to happen tomorrow.",Öyle hemen umudunu kaybetme. Gün doğmadan neler doğar.,“Don’t lose your hope. Tomorrow is another day.”
8,Bana arkadaşını söyle sana kim olduğunu söyleyeyim.,"Tell me who your friend is, I will tell you who you are.","Tell me who you go with, and I’ll tell you who you are.",This proverb means that a person’s friends are a reflection of who he/she is.,"John o gruba girdiğinden beri her gün kavga ediyor. Eee, ne demişler ‘Bana arkadaşını söyle sana kim olduğunu söyleyeyim.’","“Since John got into that group, he’s been fighting every day. Well, they say, ‘Tell me who you go with, and I’ll tell you who you are.’ ”"
9,Dost kara günde belli olur.,A real friend is understood on a bad day.,A friend in need is a friend indeed.,A person who stays by your side during difficult times is someone you can really rely on.,"İflas ettiğinden beri Mary dışında hiçbir arkadaşı yanında değil. Eee, dost kara günde belli olur.","“Since she went bankrupt, none of her friends are with her except for Mary. Well, a friend in need is a friend indeed.”"


In [42]:
# Write the parsed proverbs to a CSV file in the working directory; index=False leaves out the row-index column.
df.to_csv('turkish_proverbs.csv', index=False)